1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
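// MAXBLOCK is the maximum number of guest instructions compiled into one
// block (the per-instruction arrays below are sized by it, and a block can
// span at most four 4K pages).  MAX_OUTPUT_BLOCK_SIZE bounds the native code
// emitted for a single block and is used as slack in the translation-cache
// expiry checks.  CLOCK_DIVIDER is assumed here to scale the emulated cycle
// count; see the cycle-count handling elsewhere in this file.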
42
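// Per-instruction register allocation state: regmap[] maps each host register
// to the guest register it holds (-1 = free); was32/is32 are per-guest-register
// bitmasks of values known to be sign-extended 32-bit values; wasdirty/dirty
// and wasconst/isconst are per-host-register bitmasks of values that need
// writing back and of known constants (the values live in constmap[]); u/uu
// mark guest registers whose lower/upper 32 bits are unneeded from this point.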
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
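  // Each hash_table bin caches two blocks in most-recently-used order:
  // [0]/[1] hold the newest (virtual address, compiled address) pair and
  // [2]/[3] the older one; bins are indexed by ((vaddr>>16)^vaddr)&0xFFFF.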
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
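// Guest register numbers with bit 6 set (reg|64, reg^64) denote the upper
// 32 bits of a 64-bit value, which is why register maps are masked with &63.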
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178 #define HLECALL 26 // PCSX fake opcodes for HLE
179
180   /* stubs */
181 #define CC_STUB 1
182 #define FP_STUB 2
183 #define LOADB_STUB 3
184 #define LOADH_STUB 4
185 #define LOADW_STUB 5
186 #define LOADD_STUB 6
187 #define LOADBU_STUB 7
188 #define LOADHU_STUB 8
189 #define STOREB_STUB 9
190 #define STOREH_STUB 10
191 #define STOREW_STUB 11
192 #define STORED_STUB 12
193 #define STORELR_STUB 13
194 #define INVCODE_STUB 14
195
196   /* branch codes */
197 #define TAKEN 1
198 #define NOTTAKEN 2
199 #define NULLDS 3
200
201 // asm linkage
202 int new_recompile_block(int addr);
203 void *get_addr_ht(u_int vaddr);
204 void invalidate_block(u_int block);
205 void invalidate_addr(u_int addr);
206 void remove_hash(int vaddr);
207 void jump_vaddr();
208 void dyna_linker();
209 void dyna_linker_ds();
210 void verify_code();
211 void verify_code_vm();
212 void verify_code_ds();
213 void cc_interrupt();
214 void fp_exception();
215 void fp_exception_ds();
216 void jump_syscall();
217 void jump_syscall_hle();
218 void jump_eret();
219 void jump_hlecall();
220 void new_dyna_leave();
221
222 // TLB
223 void TLBWI_new();
224 void TLBWR_new();
225 void read_nomem_new();
226 void read_nomemb_new();
227 void read_nomemh_new();
228 void read_nomemd_new();
229 void write_nomem_new();
230 void write_nomemb_new();
231 void write_nomemh_new();
232 void write_nomemd_new();
233 void write_rdram_new();
234 void write_rdramb_new();
235 void write_rdramh_new();
236 void write_rdramd_new();
237 extern u_int memory_map[1048576];
238
239 // Needed by assembler
240 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
241 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
242 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
243 void load_all_regs(signed char i_regmap[]);
244 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
245 void load_regs_entry(int t);
246 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
247
248 int tracedebug=0;
249
250 //#define DEBUG_CYCLE_COUNT 1
251
252 void nullf() {}
253 //#define assem_debug printf
254 //#define inv_debug printf
255 #define assem_debug nullf
256 #define inv_debug nullf
257
258 static void tlb_hacks()
259 {
260 #ifndef DISABLE_TLB
261   // Goldeneye hack
262   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
263   {
264     u_int addr;
265     int n;
266     switch (ROM_HEADER->Country_code&0xFF) 
267     {
268       case 0x45: // U
269         addr=0x34b30;
270         break;                   
271       case 0x4A: // J 
272         addr=0x34b70;    
273         break;    
274       case 0x50: // E 
275         addr=0x329f0;
276         break;                        
277       default: 
278         // Unknown country code
279         addr=0;
280         break;
281     }
282     u_int rom_addr=(u_int)rom;
283     #ifdef ROM_COPY
284     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
285     // in the lower 4G of memory to use this hack.  Copy it if necessary.
286     if((void *)rom>(void *)0xffffffff) {
287       munmap(ROM_COPY, 67108864);
288       if(mmap(ROM_COPY, 12582912,
289               PROT_READ | PROT_WRITE,
290               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
291               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
292       memcpy(ROM_COPY,rom,12582912);
293       rom_addr=(u_int)ROM_COPY;
294     }
295     #endif
296     if(addr) {
297       for(n=0x7F000;n<0x80000;n++) {
298         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
299       }
300     }
301   }
302 #endif
303 }
304
305 static u_int get_page(u_int vaddr)
306 {
307   u_int page=(vaddr^0x80000000)>>12;
308 #ifndef DISABLE_TLB
309   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
310 #endif
311   if(page>2048) page=2048+(page&2047);
312   return page;
313 }
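// (get_page example: vaddr 0x80030000 yields page 0x30; pages 0-2047 cover the
// directly mapped region at 0x80000000, anything beyond folds into 2048-4095.)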
314
315 static u_int get_vpage(u_int vaddr)
316 {
317   u_int vpage=(vaddr^0x80000000)>>12;
318 #ifndef DISABLE_TLB
319   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
320 #endif
321   if(vpage>2048) vpage=2048+(vpage&2047);
322   return vpage;
323 }
324
325 // Get address from virtual address
326 // This is called from the recompiled JR/JALR instructions
327 void *get_addr(u_int vaddr)
328 {
329   u_int page=get_page(vaddr);
330   u_int vpage=get_vpage(vaddr);
331   struct ll_entry *head;
332   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
333   head=jump_in[page];
334   while(head!=NULL) {
335     if(head->vaddr==vaddr&&head->reg32==0) {
336   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
337       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
338       ht_bin[3]=ht_bin[1];
339       ht_bin[2]=ht_bin[0];
340       ht_bin[1]=(int)head->addr;
341       ht_bin[0]=vaddr;
342       return head->addr;
343     }
344     head=head->next;
345   }
346   head=jump_dirty[vpage];
347   while(head!=NULL) {
348     if(head->vaddr==vaddr&&head->reg32==0) {
349       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
350       // Don't restore blocks which are about to expire from the cache
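      // (The shifted difference is the block's distance ahead of the output
      // pointer, modulo the translation cache size; blocks within roughly the
      // next 3/8 of the cache plus one maximum-size output block are treated
      // as about to expire and are not restored.)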
351       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
352       if(verify_dirty(head->addr)) {
353         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
354         invalid_code[vaddr>>12]=0;
355         memory_map[vaddr>>12]|=0x40000000;
356         if(vpage<2048) {
357 #ifndef DISABLE_TLB
358           if(tlb_LUT_r[vaddr>>12]) {
359             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
360             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
361           }
362 #endif
363           restore_candidate[vpage>>3]|=1<<(vpage&7);
364         }
365         else restore_candidate[page>>3]|=1<<(page&7);
366         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
367         if(ht_bin[0]==vaddr) {
368           ht_bin[1]=(int)head->addr; // Replace existing entry
369         }
370         else
371         {
372           ht_bin[3]=ht_bin[1];
373           ht_bin[2]=ht_bin[0];
374           ht_bin[1]=(int)head->addr;
375           ht_bin[0]=vaddr;
376         }
377         return head->addr;
378       }
379     }
380     head=head->next;
381   }
382   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
383   int r=new_recompile_block(vaddr);
384   if(r==0) return get_addr(vaddr);
385   // Execute in unmapped page, generate page fault exception
386   Status|=2;
387   Cause=(vaddr<<31)|0x8;
388   EPC=(vaddr&1)?vaddr-5:vaddr;
389   BadVAddr=(vaddr&~1);
390   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
391   EntryHi=BadVAddr&0xFFFFE000;
392   return get_addr_ht(0x80000000);
393 }
394 // Look up address in hash table first
395 void *get_addr_ht(u_int vaddr)
396 {
397   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
398   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
399   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
400   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
401   return get_addr(vaddr);
402 }
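// Overall lookup order: the two slots of the hash bin, then the jump_in list
// for the page, then jump_dirty (re-validated with verify_dirty), and finally
// new_recompile_block() followed by a retry.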
403
404 void *get_addr_32(u_int vaddr,u_int flags)
405 {
406 #ifdef FORCE32
407   return get_addr(vaddr);
408 #endif
409   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
410   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
411   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
412   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
413   u_int page=get_page(vaddr);
414   u_int vpage=get_vpage(vaddr);
415   struct ll_entry *head;
416   head=jump_in[page];
417   while(head!=NULL) {
418     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
419       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
420       if(head->reg32==0) {
421         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
422         if(ht_bin[0]==-1) {
423           ht_bin[1]=(int)head->addr;
424           ht_bin[0]=vaddr;
425         }else if(ht_bin[2]==-1) {
426           ht_bin[3]=(int)head->addr;
427           ht_bin[2]=vaddr;
428         }
429         //ht_bin[3]=ht_bin[1];
430         //ht_bin[2]=ht_bin[0];
431         //ht_bin[1]=(int)head->addr;
432         //ht_bin[0]=vaddr;
433       }
434       return head->addr;
435     }
436     head=head->next;
437   }
438   head=jump_dirty[vpage];
439   while(head!=NULL) {
440     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
441       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
442       // Don't restore blocks which are about to expire from the cache
443       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
444       if(verify_dirty(head->addr)) {
445         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
446         invalid_code[vaddr>>12]=0;
447         memory_map[vaddr>>12]|=0x40000000;
448         if(vpage<2048) {
449 #ifndef DISABLE_TLB
450           if(tlb_LUT_r[vaddr>>12]) {
451             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
452             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
453           }
454 #endif
455           restore_candidate[vpage>>3]|=1<<(vpage&7);
456         }
457         else restore_candidate[page>>3]|=1<<(page&7);
458         if(head->reg32==0) {
459           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
460           if(ht_bin[0]==-1) {
461             ht_bin[1]=(int)head->addr;
462             ht_bin[0]=vaddr;
463           }else if(ht_bin[2]==-1) {
464             ht_bin[3]=(int)head->addr;
465             ht_bin[2]=vaddr;
466           }
467           //ht_bin[3]=ht_bin[1];
468           //ht_bin[2]=ht_bin[0];
469           //ht_bin[1]=(int)head->addr;
470           //ht_bin[0]=vaddr;
471         }
472         return head->addr;
473       }
474     }
475     head=head->next;
476   }
477   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
478   int r=new_recompile_block(vaddr);
479   if(r==0) return get_addr(vaddr);
480   // Execute in unmapped page, generate page fault exception
481   Status|=2;
482   Cause=(vaddr<<31)|0x8;
483   EPC=(vaddr&1)?vaddr-5:vaddr;
484   BadVAddr=(vaddr&~1);
485   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
486   EntryHi=BadVAddr&0xFFFFE000;
487   return get_addr_ht(0x80000000);
488 }
489
490 void clear_all_regs(signed char regmap[])
491 {
492   int hr;
493   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
494 }
495
496 signed char get_reg(signed char regmap[],int r)
497 {
498   int hr;
499   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
500   return -1;
501 }
502
503 // Find a register that is available for two consecutive cycles
504 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
505 {
506   int hr;
507   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
508   return -1;
509 }
510
511 int count_free_regs(signed char regmap[])
512 {
513   int count=0;
514   int hr;
515   for(hr=0;hr<HOST_REGS;hr++)
516   {
517     if(hr!=EXCLUDE_REG) {
518       if(regmap[hr]<0) count++;
519     }
520   }
521   return count;
522 }
523
524 void dirty_reg(struct regstat *cur,signed char reg)
525 {
526   int hr;
527   if(!reg) return;
528   for (hr=0;hr<HOST_REGS;hr++) {
529     if((cur->regmap[hr]&63)==reg) {
530       cur->dirty|=1<<hr;
531     }
532   }
533 }
534
535 // If we dirty the lower half of a 64-bit register which is now being
536 // sign-extended, we need to dump the upper half.
537 // Note: Do this only after completion of the instruction, because
538 // some instructions may need to read the full 64-bit value even if
539 // overwriting it (eg SLTI, DSRA32).
540 static void flush_dirty_uppers(struct regstat *cur)
541 {
542   int hr,reg;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->dirty>>hr)&1) {
545       reg=cur->regmap[hr];
546       if(reg>=64) 
547         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
548     }
549   }
550 }
551
552 void set_const(struct regstat *cur,signed char reg,uint64_t value)
553 {
554   int hr;
555   if(!reg) return;
556   for (hr=0;hr<HOST_REGS;hr++) {
557     if(cur->regmap[hr]==reg) {
558       cur->isconst|=1<<hr;
559       cur->constmap[hr]=value;
560     }
561     else if((cur->regmap[hr]^64)==reg) {
562       cur->isconst|=1<<hr;
563       cur->constmap[hr]=value>>32;
564     }
565   }
566 }
567
568 void clear_const(struct regstat *cur,signed char reg)
569 {
570   int hr;
571   if(!reg) return;
572   for (hr=0;hr<HOST_REGS;hr++) {
573     if((cur->regmap[hr]&63)==reg) {
574       cur->isconst&=~(1<<hr);
575     }
576   }
577 }
578
579 int is_const(struct regstat *cur,signed char reg)
580 {
581   int hr;
582   if(!reg) return 1;
583   for (hr=0;hr<HOST_REGS;hr++) {
584     if((cur->regmap[hr]&63)==reg) {
585       return (cur->isconst>>hr)&1;
586     }
587   }
588   return 0;
589 }
590 uint64_t get_const(struct regstat *cur,signed char reg)
591 {
592   int hr;
593   if(!reg) return 0;
594   for (hr=0;hr<HOST_REGS;hr++) {
595     if(cur->regmap[hr]==reg) {
596       return cur->constmap[hr];
597     }
598   }
599   printf("Unknown constant in r%d\n",reg);
600   exit(1);
601 }
602
603 // Least soon needed registers
604 // Look at the next ten instructions and see which registers
605 // will be used.  Try not to reallocate these.
606 void lsn(u_char hsn[], int i, int *preferred_reg)
607 {
608   int j;
609   int b=-1;
610   for(j=0;j<9;j++)
611   {
612     if(i+j>=slen) {
613       j=slen-i-1;
614       break;
615     }
616     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
617     {
618       // Don't go past an unconditional jump
619       j++;
620       break;
621     }
622   }
623   for(;j>=0;j--)
624   {
625     if(rs1[i+j]) hsn[rs1[i+j]]=j;
626     if(rs2[i+j]) hsn[rs2[i+j]]=j;
627     if(rt1[i+j]) hsn[rt1[i+j]]=j;
628     if(rt2[i+j]) hsn[rt2[i+j]]=j;
629     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
630       // Stores can allocate zero
631       hsn[rs1[i+j]]=j;
632       hsn[rs2[i+j]]=j;
633     }
634     // On some architectures stores need invc_ptr
635     #if defined(HOST_IMM8)
636     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
637       hsn[INVCP]=j;
638     }
639     #endif
640     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
641     {
642       hsn[CCREG]=j;
643       b=j;
644     }
645   }
646   if(b>=0)
647   {
648     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
649     {
650       // Follow first branch
651       int t=(ba[i+b]-start)>>2;
652       j=7-b;if(t+j>=slen) j=slen-t-1;
653       for(;j>=0;j--)
654       {
655         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
656         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
657         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
658         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
659       }
660     }
661     // TODO: preferred register based on backward branch
662   }
663   // Delay slot should preferably not overwrite branch conditions or cycle count
664   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
665     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
666     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
667     hsn[CCREG]=1;
668     // ...or hash tables
669     hsn[RHASH]=1;
670     hsn[RHTBL]=1;
671   }
672   // Coprocessor load/store needs FTEMP, even if not declared
673   if(itype[i]==C1LS) {
674     hsn[FTEMP]=0;
675   }
676   // Load L/R also uses FTEMP as a temporary register
677   if(itype[i]==LOADLR) {
678     hsn[FTEMP]=0;
679   }
680   // Also 64-bit SDL/SDR
681   if(opcode[i]==0x2c||opcode[i]==0x2d) {
682     hsn[FTEMP]=0;
683   }
684   // Don't remove the TLB registers either
685   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
686     hsn[TLREG]=0;
687   }
688   // Don't remove the miniht registers
689   if(itype[i]==UJUMP||itype[i]==RJUMP)
690   {
691     hsn[RHASH]=0;
692     hsn[RHTBL]=0;
693   }
694 }
695
696 // We only want to allocate registers if we're going to use them again soon
697 int needed_again(int r, int i)
698 {
699   int j;
700   int b=-1;
701   int rn=10;
702   int hr;
703   u_char hsn[MAXREG+1];
704   int preferred_reg;
705   
706   memset(hsn,10,sizeof(hsn));
707   lsn(hsn,i,&preferred_reg);
708   
709   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
710   {
711     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
712       return 0; // Don't need any registers if exiting the block
713   }
714   for(j=0;j<9;j++)
715   {
716     if(i+j>=slen) {
717       j=slen-i-1;
718       break;
719     }
720     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
721     {
722       // Don't go past an unconditional jump
723       j++;
724       break;
725     }
726     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||((source[i+j]&0xfc00003f)==0x0d))
727     {
728       break;
729     }
730   }
731   for(;j>=1;j--)
732   {
733     if(rs1[i+j]==r) rn=j;
734     if(rs2[i+j]==r) rn=j;
735     if((unneeded_reg[i+j]>>r)&1) rn=10;
736     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
737     {
738       b=j;
739     }
740   }
741   /*
742   if(b>=0)
743   {
744     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
745     {
746       // Follow first branch
747       int o=rn;
748       int t=(ba[i+b]-start)>>2;
749       j=7-b;if(t+j>=slen) j=slen-t-1;
750       for(;j>=0;j--)
751       {
752         if(!((unneeded_reg[t+j]>>r)&1)) {
753           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
754           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
755         }
756         else rn=o;
757       }
758     }
759   }*/
760   for(hr=0;hr<HOST_REGS;hr++) {
761     if(hr!=EXCLUDE_REG) {
762       if(rn<hsn[hr]) return 1;
763     }
764   }
765   return 0;
766 }
767
768 // Try to match register allocations at the end of a loop with those
769 // at the beginning
770 int loop_reg(int i, int r, int hr)
771 {
772   int j,k;
773   for(j=0;j<9;j++)
774   {
775     if(i+j>=slen) {
776       j=slen-i-1;
777       break;
778     }
779     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
780     {
781       // Don't go past an unconditional jump
782       j++;
783       break;
784     }
785   }
786   k=0;
787   if(i>0){
788     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
789       k--;
790   }
791   for(;k<j;k++)
792   {
793     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
794     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
795     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
796     {
797       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
798       {
799         int t=(ba[i+k]-start)>>2;
800         int reg=get_reg(regs[t].regmap_entry,r);
801         if(reg>=0) return reg;
802         //reg=get_reg(regs[t+1].regmap_entry,r);
803         //if(reg>=0) return reg;
804       }
805     }
806   }
807   return hr;
808 }
809
810
811 // Allocate every register, preserving source/target regs
812 void alloc_all(struct regstat *cur,int i)
813 {
814   int hr;
815   
816   for(hr=0;hr<HOST_REGS;hr++) {
817     if(hr!=EXCLUDE_REG) {
818       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
819          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
820       {
821         cur->regmap[hr]=-1;
822         cur->dirty&=~(1<<hr);
823       }
824       // Don't need zeros
825       if((cur->regmap[hr]&63)==0)
826       {
827         cur->regmap[hr]=-1;
828         cur->dirty&=~(1<<hr);
829       }
830     }
831   }
832 }
833
834
835 void div64(int64_t dividend,int64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842 void divu64(uint64_t dividend,uint64_t divisor)
843 {
844   lo=dividend/divisor;
845   hi=dividend%divisor;
846   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
847   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
848 }
849
850 void mult64(uint64_t m1,uint64_t m2)
851 {
852    unsigned long long int op1, op2, op3, op4;
853    unsigned long long int result1, result2, result3, result4;
854    unsigned long long int temp1, temp2, temp3, temp4;
855    int sign = 0;
856    
857    if ((int64_t)m1 < 0) // cast needed: m1 is unsigned, so plain m1 < 0 is always false
858      {
859     op2 = -m1;
860     sign = 1 - sign;
861      }
862    else op2 = m1;
863    if ((int64_t)m2 < 0)
864      {
865     op4 = -m2;
866     sign = 1 - sign;
867      }
868    else op4 = m2;
869    
870    op1 = op2 & 0xFFFFFFFF;
871    op2 = (op2 >> 32) & 0xFFFFFFFF;
872    op3 = op4 & 0xFFFFFFFF;
873    op4 = (op4 >> 32) & 0xFFFFFFFF;
874    
875    temp1 = op1 * op3;
876    temp2 = (temp1 >> 32) + op1 * op4;
877    temp3 = op2 * op3;
878    temp4 = (temp3 >> 32) + op2 * op4;
879    
880    result1 = temp1 & 0xFFFFFFFF;
881    result2 = temp2 + (temp3 & 0xFFFFFFFF);
882    result3 = (result2 >> 32) + temp4;
883    result4 = (result3 >> 32);
884    
885    lo = result1 | (result2 << 32);
886    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
887    if (sign)
888      {
889     hi = ~hi;
890     if (!lo) hi++;
891     else lo = ~lo + 1;
892      }
893 }
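// mult64/multu64 form the 128-bit product from four 32x32->64 partial
// products: with m1 = a*2^32 + b and m2 = c*2^32 + d,
// m1*m2 = a*c*2^64 + (a*d + b*c)*2^32 + b*d, the carries being propagated
// through temp2/result2/result3 above and below.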
894
895 void multu64(uint64_t m1,uint64_t m2)
896 {
897    unsigned long long int op1, op2, op3, op4;
898    unsigned long long int result1, result2, result3, result4;
899    unsigned long long int temp1, temp2, temp3, temp4;
900    
901    op1 = m1 & 0xFFFFFFFF;
902    op2 = (m1 >> 32) & 0xFFFFFFFF;
903    op3 = m2 & 0xFFFFFFFF;
904    op4 = (m2 >> 32) & 0xFFFFFFFF;
905    
906    temp1 = op1 * op3;
907    temp2 = (temp1 >> 32) + op1 * op4;
908    temp3 = op2 * op3;
909    temp4 = (temp3 >> 32) + op2 * op4;
910    
911    result1 = temp1 & 0xFFFFFFFF;
912    result2 = temp2 + (temp3 & 0xFFFFFFFF);
913    result3 = (result2 >> 32) + temp4;
914    result4 = (result3 >> 32);
915    
916    lo = result1 | (result2 << 32);
917    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
918    
919   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
920   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
921 }
922
923 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
924 {
925   if(bits) {
926     original<<=64-bits;
927     original>>=64-bits;
928     loaded<<=bits;
929     original|=loaded;
930   }
931   else original=loaded;
932   return original;
933 }
934 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
935 {
936   if(bits^56) {
937     original>>=64-(bits^56);
938     original<<=64-(bits^56);
939     loaded>>=bits^56;
940     original|=loaded;
941   }
942   else original=loaded;
943   return original;
944 }
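// Example: ldl_merge(0x1122334455667788, 0xAABBCCDDEEFF0011, 16) keeps the low
// 16 bits of the original value and gives 0xBBCCDDEEFF007788; with bits==0 the
// loaded value replaces the original entirely.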
945
946 #ifdef __i386__
947 #include "assem_x86.c"
948 #endif
949 #ifdef __x86_64__
950 #include "assem_x64.c"
951 #endif
952 #ifdef __arm__
953 #include "assem_arm.c"
954 #endif
955
956 // Add virtual address mapping to linked list
957 void ll_add(struct ll_entry **head,int vaddr,void *addr)
958 {
959   struct ll_entry *new_entry;
960   new_entry=malloc(sizeof(struct ll_entry));
961   assert(new_entry!=NULL);
962   new_entry->vaddr=vaddr;
963   new_entry->reg32=0;
964   new_entry->addr=addr;
965   new_entry->next=*head;
966   *head=new_entry;
967 }
968
969 // Add virtual address mapping for 32-bit compiled block
970 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
971 {
972   ll_add(head,vaddr,addr);
973 #ifndef FORCE32
974   (*head)->reg32=reg32;
975 #endif
976 }
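// reg32 appears to be a bitmask of guest registers the block requires to be
// 32-bit (cf. requires_32bit[] and the (head->reg32&flags) tests in
// get_addr_32); with FORCE32 defined it stays 0, so every block matches.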
977
978 // Check if an address is already compiled
979 // but don't return addresses which are about to expire from the cache
980 void *check_addr(u_int vaddr)
981 {
982   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
983   if(ht_bin[0]==vaddr) {
984     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
985       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
986   }
987   if(ht_bin[2]==vaddr) {
988     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
989       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
990   }
991   u_int page=get_page(vaddr);
992   struct ll_entry *head;
993   head=jump_in[page];
994   while(head!=NULL) {
995     if(head->vaddr==vaddr&&head->reg32==0) {
996       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
997         // Update existing entry with current address
998         if(ht_bin[0]==vaddr) {
999           ht_bin[1]=(int)head->addr;
1000           return head->addr;
1001         }
1002         if(ht_bin[2]==vaddr) {
1003           ht_bin[3]=(int)head->addr;
1004           return head->addr;
1005         }
1006         // Insert into hash table with low priority.
1007         // Don't evict existing entries, as they are probably
1008         // addresses that are being accessed frequently.
1009         if(ht_bin[0]==-1) {
1010           ht_bin[1]=(int)head->addr;
1011           ht_bin[0]=vaddr;
1012         }else if(ht_bin[2]==-1) {
1013           ht_bin[3]=(int)head->addr;
1014           ht_bin[2]=vaddr;
1015         }
1016         return head->addr;
1017       }
1018     }
1019     head=head->next;
1020   }
1021   return 0;
1022 }
1023
1024 void remove_hash(int vaddr)
1025 {
1026   //printf("remove hash: %x\n",vaddr);
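  // If the entry being removed occupies slots 0/1, the older pair in
  // slots 2/3 is promoted into its place.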
1027   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1028   if(ht_bin[2]==vaddr) {
1029     ht_bin[2]=ht_bin[3]=-1;
1030   }
1031   if(ht_bin[0]==vaddr) {
1032     ht_bin[0]=ht_bin[2];
1033     ht_bin[1]=ht_bin[3];
1034     ht_bin[2]=ht_bin[3]=-1;
1035   }
1036 }
1037
1038 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1039 {
1040   struct ll_entry *next;
1041   while(*head) {
1042     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1043        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1044     {
1045       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1046       remove_hash((*head)->vaddr);
1047       next=(*head)->next;
1048       free(*head);
1049       *head=next;
1050     }
1051     else
1052     {
1053       head=&((*head)->next);
1054     }
1055   }
1056 }
1057
1058 // Remove all entries from linked list
1059 void ll_clear(struct ll_entry **head)
1060 {
1061   struct ll_entry *cur;
1062   struct ll_entry *next;
1063   if((cur=*head)!=NULL) {
1064     *head=0;
1065     while(cur) {
1066       next=cur->next;
1067       free(cur);
1068       cur=next;
1069     }
1070   }
1071 }
1072
1073 // Dereference the pointers and kill those that match
1074 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1075 {
1076   while(head) {
1077     int ptr=get_pointer(head->addr);
1078     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1079     if(((ptr>>shift)==(addr>>shift)) ||
1080        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1081     {
1082       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1083       kill_pointer(head->addr);
1084     }
1085     head=head->next;
1086   }
1087 }
1088
1089 // This is called when we write to a compiled block (see do_invstub)
1090 int invalidate_page(u_int page)
1091 {
1092   int modified=0;
1093   struct ll_entry *head;
1094   struct ll_entry *next;
1095   head=jump_in[page];
1096   jump_in[page]=0;
1097   while(head!=NULL) {
1098     inv_debug("INVALIDATE: %x\n",head->vaddr);
1099     remove_hash(head->vaddr);
1100     next=head->next;
1101     free(head);
1102     head=next;
1103   }
1104   head=jump_out[page];
1105   jump_out[page]=0;
1106   while(head!=NULL) {
1107     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1108     kill_pointer(head->addr);
1109     modified=1;
1110     next=head->next;
1111     free(head);
1112     head=next;
1113   }
1114   return modified;
1115 }
1116 void invalidate_block(u_int block)
1117 {
1118   int modified;
1119   u_int page=get_page(block<<12);
1120   u_int vpage=get_vpage(block<<12);
1121   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1122   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1123   u_int first,last;
1124   first=last=page;
1125   struct ll_entry *head;
1126   head=jump_dirty[vpage];
1127   //printf("page=%d vpage=%d\n",page,vpage);
1128   while(head!=NULL) {
1129     u_int start,end;
1130     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1131       get_bounds((int)head->addr,&start,&end);
1132       //printf("start: %x end: %x\n",start,end);
1133       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1134         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1135           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1136           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1137         }
1138       }
1139 #ifndef DISABLE_TLB
1140       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1141         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1142           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1143           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1144         }
1145       }
1146 #endif
1147     }
1148     head=head->next;
1149   }
1150   //printf("first=%d last=%d\n",first,last);
1151   modified=invalidate_page(page);
1152   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1153   assert(last<page+5);
1154   // Invalidate the adjacent pages if a block crosses a 4K boundary
1155   while(first<page) {
1156     invalidate_page(first);
1157     first++;
1158   }
1159   for(first=page+1;first<last;first++) {
1160     invalidate_page(first);
1161   }
1162   
1163   // Don't trap writes
1164   invalid_code[block]=1;
1165 #ifndef DISABLE_TLB
1166   // If there is a valid TLB entry for this page, remove write protect
1167   if(tlb_LUT_w[block]) {
1168     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1169     // CHECK: Is this right?
1170     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1171     u_int real_block=tlb_LUT_w[block]>>12;
1172     invalid_code[real_block]=1;
1173     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1174   }
1175   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1176 #endif
1177   #ifdef __arm__
1178   if(modified)
1179     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1180   #endif
1181   #ifdef USE_MINI_HT
1182   memset(mini_ht,-1,sizeof(mini_ht));
1183   #endif
1184 }
1185 void invalidate_addr(u_int addr)
1186 {
1187   invalidate_block(addr>>12);
1188 }
1189 void invalidate_all_pages()
1190 {
1191   u_int page,n;
1192   for(page=0;page<4096;page++)
1193     invalidate_page(page);
1194   for(page=0;page<1048576;page++)
1195     if(!invalid_code[page]) {
1196       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1197       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1198     }
1199   #ifdef __arm__
1200   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1201   #endif
1202   #ifdef USE_MINI_HT
1203   memset(mini_ht,-1,sizeof(mini_ht));
1204   #endif
1205   #ifndef DISABLE_TLB
1206   // TLB
1207   for(page=0;page<0x100000;page++) {
1208     if(tlb_LUT_r[page]) {
1209       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1210       if(!tlb_LUT_w[page]||!invalid_code[page])
1211         memory_map[page]|=0x40000000; // Write protect
1212     }
1213     else memory_map[page]=-1;
1214     if(page==0x80000) page=0xC0000;
1215   }
1216   tlb_hacks();
1217   #endif
1218 }
1219
1220 // Add an entry to jump_out after making a link
1221 void add_link(u_int vaddr,void *src)
1222 {
1223   u_int page=get_page(vaddr);
1224   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1225   ll_add(jump_out+page,vaddr,src);
1226   //int ptr=get_pointer(src);
1227   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1228 }
1229
1230 // If a code block was found to be unmodified (bit was set in
1231 // restore_candidate) and it remains unmodified (bit is clear
1232 // in invalid_code) then move the entries for that 4K page from
1233 // the dirty list to the clean list.
1234 void clean_blocks(u_int page)
1235 {
1236   struct ll_entry *head;
1237   inv_debug("INV: clean_blocks page=%d\n",page);
1238   head=jump_dirty[page];
1239   while(head!=NULL) {
1240     if(!invalid_code[head->vaddr>>12]) {
1241       // Don't restore blocks which are about to expire from the cache
1242       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1243         u_int start,end;
1244         if(verify_dirty((int)head->addr)) {
1245           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1246           u_int i;
1247           u_int inv=0;
1248           get_bounds((int)head->addr,&start,&end);
1249           if(start-(u_int)rdram<0x800000) {
1250             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1251               inv|=invalid_code[i];
1252             }
1253           }
1254           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1255             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1256             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1257             if(addr<start||addr>=end) inv=1;
1258           }
1259           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1260             inv=1;
1261           }
1262           if(!inv) {
1263             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1264             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1265               u_int ppage=page;
1266 #ifndef DISABLE_TLB
1267               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1268 #endif
1269               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1270               //printf("page=%x, addr=%x\n",page,head->vaddr);
1271               //assert(head->vaddr>>12==(page|0x80000));
1272               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1273               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1274               if(!head->reg32) {
1275                 if(ht_bin[0]==head->vaddr) {
1276                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1277                 }
1278                 if(ht_bin[2]==head->vaddr) {
1279                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1280                 }
1281               }
1282             }
1283           }
1284         }
1285       }
1286     }
1287     head=head->next;
1288   }
1289 }
1290
1291
1292 void mov_alloc(struct regstat *current,int i)
1293 {
1294   // Note: Don't need to actually alloc the source registers
1295   if((~current->is32>>rs1[i])&1) {
1296     //alloc_reg64(current,i,rs1[i]);
1297     alloc_reg64(current,i,rt1[i]);
1298     current->is32&=~(1LL<<rt1[i]);
1299   } else {
1300     //alloc_reg(current,i,rs1[i]);
1301     alloc_reg(current,i,rt1[i]);
1302     current->is32|=(1LL<<rt1[i]);
1303   }
1304   clear_const(current,rs1[i]);
1305   clear_const(current,rt1[i]);
1306   dirty_reg(current,rt1[i]);
1307 }
1308
1309 void shiftimm_alloc(struct regstat *current,int i)
1310 {
1311   clear_const(current,rs1[i]);
1312   clear_const(current,rt1[i]);
1313   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1314   {
1315     if(rt1[i]) {
1316       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1317       else lt1[i]=rs1[i];
1318       alloc_reg(current,i,rt1[i]);
1319       current->is32|=1LL<<rt1[i];
1320       dirty_reg(current,rt1[i]);
1321     }
1322   }
1323   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1324   {
1325     if(rt1[i]) {
1326       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1327       alloc_reg64(current,i,rt1[i]);
1328       current->is32&=~(1LL<<rt1[i]);
1329       dirty_reg(current,rt1[i]);
1330     }
1331   }
1332   if(opcode2[i]==0x3c) // DSLL32
1333   {
1334     if(rt1[i]) {
1335       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1336       alloc_reg64(current,i,rt1[i]);
1337       current->is32&=~(1LL<<rt1[i]);
1338       dirty_reg(current,rt1[i]);
1339     }
1340   }
1341   if(opcode2[i]==0x3e) // DSRL32
1342   {
1343     if(rt1[i]) {
1344       alloc_reg64(current,i,rs1[i]);
1345       if(imm[i]==32) {
1346         alloc_reg64(current,i,rt1[i]);
1347         current->is32&=~(1LL<<rt1[i]);
1348       } else {
1349         alloc_reg(current,i,rt1[i]);
1350         current->is32|=1LL<<rt1[i];
1351       }
1352       dirty_reg(current,rt1[i]);
1353     }
1354   }
1355   if(opcode2[i]==0x3f) // DSRA32
1356   {
1357     if(rt1[i]) {
1358       alloc_reg64(current,i,rs1[i]);
1359       alloc_reg(current,i,rt1[i]);
1360       current->is32|=1LL<<rt1[i];
1361       dirty_reg(current,rt1[i]);
1362     }
1363   }
1364 }
1365
1366 void shift_alloc(struct regstat *current,int i)
1367 {
1368   if(rt1[i]) {
1369     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1370     {
1371       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1372       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1373       alloc_reg(current,i,rt1[i]);
1374       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1375       current->is32|=1LL<<rt1[i];
1376     } else { // DSLLV/DSRLV/DSRAV
1377       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1378       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1379       alloc_reg64(current,i,rt1[i]);
1380       current->is32&=~(1LL<<rt1[i]);
1381       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1382         alloc_reg_temp(current,i,-1);
1383     }
1384     clear_const(current,rs1[i]);
1385     clear_const(current,rs2[i]);
1386     clear_const(current,rt1[i]);
1387     dirty_reg(current,rt1[i]);
1388   }
1389 }
1390
1391 void alu_alloc(struct regstat *current,int i)
1392 {
1393   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1394     if(rt1[i]) {
1395       if(rs1[i]&&rs2[i]) {
1396         alloc_reg(current,i,rs1[i]);
1397         alloc_reg(current,i,rs2[i]);
1398       }
1399       else {
1400         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1401         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1402       }
1403       alloc_reg(current,i,rt1[i]);
1404     }
1405     current->is32|=1LL<<rt1[i];
1406   }
1407   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1408     if(rt1[i]) {
1409       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1410       {
1411         alloc_reg64(current,i,rs1[i]);
1412         alloc_reg64(current,i,rs2[i]);
1413         alloc_reg(current,i,rt1[i]);
1414       } else {
1415         alloc_reg(current,i,rs1[i]);
1416         alloc_reg(current,i,rs2[i]);
1417         alloc_reg(current,i,rt1[i]);
1418       }
1419     }
1420     current->is32|=1LL<<rt1[i];
1421   }
1422   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1423     if(rt1[i]) {
1424       if(rs1[i]&&rs2[i]) {
1425         alloc_reg(current,i,rs1[i]);
1426         alloc_reg(current,i,rs2[i]);
1427       }
1428       else
1429       {
1430         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1431         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1432       }
1433       alloc_reg(current,i,rt1[i]);
1434       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1435       {
1436         if(!((current->uu>>rt1[i])&1)) {
1437           alloc_reg64(current,i,rt1[i]);
1438         }
1439         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1440           if(rs1[i]&&rs2[i]) {
1441             alloc_reg64(current,i,rs1[i]);
1442             alloc_reg64(current,i,rs2[i]);
1443           }
1444           else
1445           {
1446             // Is it really worth it to keep 64-bit values in registers?
1447             #ifdef NATIVE_64BIT
1448             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1449             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1450             #endif
1451           }
1452         }
1453         current->is32&=~(1LL<<rt1[i]);
1454       } else {
1455         current->is32|=1LL<<rt1[i];
1456       }
1457     }
1458   }
1459   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1460     if(rt1[i]) {
1461       if(rs1[i]&&rs2[i]) {
1462         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1463           alloc_reg64(current,i,rs1[i]);
1464           alloc_reg64(current,i,rs2[i]);
1465           alloc_reg64(current,i,rt1[i]);
1466         } else {
1467           alloc_reg(current,i,rs1[i]);
1468           alloc_reg(current,i,rs2[i]);
1469           alloc_reg(current,i,rt1[i]);
1470         }
1471       }
1472       else {
1473         alloc_reg(current,i,rt1[i]);
1474         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1475           // DADD used as move, or zeroing
1476           // If we have a 64-bit source, then make the target 64 bits too
1477           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1478             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1479             alloc_reg64(current,i,rt1[i]);
1480           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1481             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1482             alloc_reg64(current,i,rt1[i]);
1483           }
1484           if(opcode2[i]>=0x2e&&rs2[i]) {
1485             // DSUB used as negation - 64-bit result
1486             // If we have a 32-bit register, extend it to 64 bits
1487             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1488             alloc_reg64(current,i,rt1[i]);
1489           }
1490         }
1491       }
1492       if(rs1[i]&&rs2[i]) {
1493         current->is32&=~(1LL<<rt1[i]);
1494       } else if(rs1[i]) {
1495         current->is32&=~(1LL<<rt1[i]);
1496         if((current->is32>>rs1[i])&1)
1497           current->is32|=1LL<<rt1[i];
1498       } else if(rs2[i]) {
1499         current->is32&=~(1LL<<rt1[i]);
1500         if((current->is32>>rs2[i])&1)
1501           current->is32|=1LL<<rt1[i];
1502       } else {
1503         current->is32|=1LL<<rt1[i];
1504       }
1505     }
1506   }
1507   clear_const(current,rs1[i]);
1508   clear_const(current,rs2[i]);
1509   clear_const(current,rt1[i]);
1510   dirty_reg(current,rt1[i]);
1511 }
1512
1513 void imm16_alloc(struct regstat *current,int i)
1514 {
1515   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1516   else lt1[i]=rs1[i];
1517   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1518   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1519     current->is32&=~(1LL<<rt1[i]);
1520     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1521       // TODO: Could preserve the 32-bit flag if the immediate is zero
1522       alloc_reg64(current,i,rt1[i]);
1523       alloc_reg64(current,i,rs1[i]);
1524     }
1525     clear_const(current,rs1[i]);
1526     clear_const(current,rt1[i]);
1527   }
1528   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1529     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1530     current->is32|=1LL<<rt1[i];
1531     clear_const(current,rs1[i]);
1532     clear_const(current,rt1[i]);
1533   }
1534   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1535     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1536       if(rs1[i]!=rt1[i]) {
1537         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1538         alloc_reg64(current,i,rt1[i]);
1539         current->is32&=~(1LL<<rt1[i]);
1540       }
1541     }
1542     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1543     if(is_const(current,rs1[i])) {
1544       int v=get_const(current,rs1[i]);
1545       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1546       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1547       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1548     }
1549     else clear_const(current,rt1[i]);
1550   }
1551   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1552     if(is_const(current,rs1[i])) {
1553       int v=get_const(current,rs1[i]);
1554       set_const(current,rt1[i],v+imm[i]);
1555     }
1556     else clear_const(current,rt1[i]);
1557     current->is32|=1LL<<rt1[i];
1558   }
1559   else {
1560     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1561     current->is32|=1LL<<rt1[i];
1562   }
1563   dirty_reg(current,rt1[i]);
1564 }
1565
1566 void load_alloc(struct regstat *current,int i)
1567 {
1568   clear_const(current,rt1[i]);
1569   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1570   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1571   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1572   if(rt1[i]) {
1573     alloc_reg(current,i,rt1[i]);
1574     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1575     {
1576       current->is32&=~(1LL<<rt1[i]);
1577       alloc_reg64(current,i,rt1[i]);
1578     }
1579     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1580     {
1581       current->is32&=~(1LL<<rt1[i]);
1582       alloc_reg64(current,i,rt1[i]);
1583       alloc_all(current,i);
1584       alloc_reg64(current,i,FTEMP);
1585     }
1586     else current->is32|=1LL<<rt1[i];
1587     dirty_reg(current,rt1[i]);
1588     // If using TLB, need a register for pointer to the mapping table
1589     if(using_tlb) alloc_reg(current,i,TLREG);
1590     // LWL/LWR need a temporary register for the old value
1591     if(opcode[i]==0x22||opcode[i]==0x26)
1592     {
1593       alloc_reg(current,i,FTEMP);
1594       alloc_reg_temp(current,i,-1);
1595     }
1596   }
1597   else
1598   {
1599     // Load to r0 (dummy load)
1600     // but we still need a register to calculate the address
1601     alloc_reg_temp(current,i,-1);
1602   }
1603 }
1604
1605 void store_alloc(struct regstat *current,int i)
1606 {
1607   clear_const(current,rs2[i]);
1608   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1609   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1610   alloc_reg(current,i,rs2[i]);
1611   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1612     alloc_reg64(current,i,rs2[i]);
1613     if(rs2[i]) alloc_reg(current,i,FTEMP);
1614   }
1615   // If using TLB, need a register for pointer to the mapping table
1616   if(using_tlb) alloc_reg(current,i,TLREG);
1617   #if defined(HOST_IMM8)
1618   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1619   else alloc_reg(current,i,INVCP);
1620   #endif
1621   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1622     alloc_reg(current,i,FTEMP);
1623   }
1624   // We need a temporary register for address generation
1625   alloc_reg_temp(current,i,-1);
1626 }
1627
1628 void c1ls_alloc(struct regstat *current,int i)
1629 {
1630   //clear_const(current,rs1[i]); // FIXME
1631   clear_const(current,rt1[i]);
1632   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1633   alloc_reg(current,i,CSREG); // Status
1634   alloc_reg(current,i,FTEMP);
1635   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1636     alloc_reg64(current,i,FTEMP);
1637   }
1638   // If using TLB, need a register for pointer to the mapping table
1639   if(using_tlb) alloc_reg(current,i,TLREG);
1640   #if defined(HOST_IMM8)
1641   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1642   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1643     alloc_reg(current,i,INVCP);
1644   #endif
1645   // We need a temporary register for address generation
1646   alloc_reg_temp(current,i,-1);
1647 }
1648
1649 #ifndef multdiv_alloc
1650 void multdiv_alloc(struct regstat *current,int i)
1651 {
1652   //  case 0x18: MULT
1653   //  case 0x19: MULTU
1654   //  case 0x1A: DIV
1655   //  case 0x1B: DIVU
1656   //  case 0x1C: DMULT
1657   //  case 0x1D: DMULTU
1658   //  case 0x1E: DDIV
1659   //  case 0x1F: DDIVU
1660   clear_const(current,rs1[i]);
1661   clear_const(current,rs2[i]);
1662   if(rs1[i]&&rs2[i])
1663   {
1664     if((opcode2[i]&4)==0) // 32-bit
1665     {
1666       current->u&=~(1LL<<HIREG);
1667       current->u&=~(1LL<<LOREG);
1668       alloc_reg(current,i,HIREG);
1669       alloc_reg(current,i,LOREG);
1670       alloc_reg(current,i,rs1[i]);
1671       alloc_reg(current,i,rs2[i]);
1672       current->is32|=1LL<<HIREG;
1673       current->is32|=1LL<<LOREG;
1674       dirty_reg(current,HIREG);
1675       dirty_reg(current,LOREG);
1676     }
1677     else // 64-bit
1678     {
1679       current->u&=~(1LL<<HIREG);
1680       current->u&=~(1LL<<LOREG);
1681       current->uu&=~(1LL<<HIREG);
1682       current->uu&=~(1LL<<LOREG);
1683       alloc_reg64(current,i,HIREG);
1684       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1685       alloc_reg64(current,i,rs1[i]);
1686       alloc_reg64(current,i,rs2[i]);
1687       alloc_all(current,i);
1688       current->is32&=~(1LL<<HIREG);
1689       current->is32&=~(1LL<<LOREG);
1690       dirty_reg(current,HIREG);
1691       dirty_reg(current,LOREG);
1692     }
1693   }
1694   else
1695   {
1696     // Multiply by zero is zero.
1697     // MIPS does not have a divide by zero exception.
1698     // The result is undefined; we return zero.
1699     alloc_reg(current,i,HIREG);
1700     alloc_reg(current,i,LOREG);
1701     current->is32|=1LL<<HIREG;
1702     current->is32|=1LL<<LOREG;
1703     dirty_reg(current,HIREG);
1704     dirty_reg(current,LOREG);
1705   }
1706 }
1707 #endif
1708
1709 void cop0_alloc(struct regstat *current,int i)
1710 {
1711   if(opcode2[i]==0) // MFC0
1712   {
1713     if(rt1[i]) {
1714       clear_const(current,rt1[i]);
1715       alloc_all(current,i);
1716       alloc_reg(current,i,rt1[i]);
1717       current->is32|=1LL<<rt1[i];
1718       dirty_reg(current,rt1[i]);
1719     }
1720   }
1721   else if(opcode2[i]==4) // MTC0
1722   {
1723     if(rs1[i]){
1724       clear_const(current,rs1[i]);
1725       alloc_reg(current,i,rs1[i]);
1726       alloc_all(current,i);
1727     }
1728     else {
1729       alloc_all(current,i); // FIXME: Keep r0
1730       current->u&=~1LL;
1731       alloc_reg(current,i,0);
1732     }
1733   }
1734   else
1735   {
1736     // TLBR/TLBWI/TLBWR/TLBP/ERET
1737     assert(opcode2[i]==0x10);
1738     alloc_all(current,i);
1739   }
1740 }
1741
1742 void cop1_alloc(struct regstat *current,int i)
1743 {
1744   alloc_reg(current,i,CSREG); // Load status
1745   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1746   {
1747     assert(rt1[i]);
1748     clear_const(current,rt1[i]);
1749     if(opcode2[i]==1) {
1750       alloc_reg64(current,i,rt1[i]); // DMFC1
1751       current->is32&=~(1LL<<rt1[i]);
1752     }else{
1753       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1754       current->is32|=1LL<<rt1[i];
1755     }
1756     dirty_reg(current,rt1[i]);
1757     alloc_reg_temp(current,i,-1);
1758   }
1759   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1760   {
1761     if(rs1[i]){
1762       clear_const(current,rs1[i]);
1763       if(opcode2[i]==5)
1764         alloc_reg64(current,i,rs1[i]); // DMTC1
1765       else
1766         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1767       alloc_reg_temp(current,i,-1);
1768     }
1769     else {
1770       current->u&=~1LL;
1771       alloc_reg(current,i,0);
1772       alloc_reg_temp(current,i,-1);
1773     }
1774   }
1775 }
1776 void fconv_alloc(struct regstat *current,int i)
1777 {
1778   alloc_reg(current,i,CSREG); // Load status
1779   alloc_reg_temp(current,i,-1);
1780 }
1781 void float_alloc(struct regstat *current,int i)
1782 {
1783   alloc_reg(current,i,CSREG); // Load status
1784   alloc_reg_temp(current,i,-1);
1785 }
1786 void fcomp_alloc(struct regstat *current,int i)
1787 {
1788   alloc_reg(current,i,CSREG); // Load status
1789   alloc_reg(current,i,FSREG); // Load flags
1790   dirty_reg(current,FSREG); // Flag will be modified
1791   alloc_reg_temp(current,i,-1);
1792 }
1793
1794 void syscall_alloc(struct regstat *current,int i)
1795 {
1796   alloc_cc(current,i);
1797   dirty_reg(current,CCREG);
1798   alloc_all(current,i);
1799   current->isconst=0;
1800 }
1801
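// Register allocation for an instruction sitting in a branch delay slot:
// dispatch to the per-type allocator above.  A branch, syscall or pagespan
// instruction in the delay slot is not supported; if one is seen,
// speculative precompilation is disabled (stop_after_jal).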
1802 void delayslot_alloc(struct regstat *current,int i)
1803 {
1804   switch(itype[i]) {
1805     case UJUMP:
1806     case CJUMP:
1807     case SJUMP:
1808     case RJUMP:
1809     case FJUMP:
1810     case SYSCALL:
1811     case HLECALL:
1812     case SPAN:
1813       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1814       printf("Disabled speculative precompilation\n");
1815       stop_after_jal=1;
1816       break;
1817     case IMM16:
1818       imm16_alloc(current,i);
1819       break;
1820     case LOAD:
1821     case LOADLR:
1822       load_alloc(current,i);
1823       break;
1824     case STORE:
1825     case STORELR:
1826       store_alloc(current,i);
1827       break;
1828     case ALU:
1829       alu_alloc(current,i);
1830       break;
1831     case SHIFT:
1832       shift_alloc(current,i);
1833       break;
1834     case MULTDIV:
1835       multdiv_alloc(current,i);
1836       break;
1837     case SHIFTIMM:
1838       shiftimm_alloc(current,i);
1839       break;
1840     case MOV:
1841       mov_alloc(current,i);
1842       break;
1843     case COP0:
1844       cop0_alloc(current,i);
1845       break;
1846     case COP1:
1847       cop1_alloc(current,i);
1848       break;
1849     case C1LS:
1850       c1ls_alloc(current,i);
1851       break;
1852     case FCONV:
1853       fconv_alloc(current,i);
1854       break;
1855     case FLOAT:
1856       float_alloc(current,i);
1857       break;
1858     case FCOMP:
1859       fcomp_alloc(current,i);
1860       break;
1861   }
1862 }
1863
1864 // Special case where a branch and delay slot span two pages in virtual memory
1865 static void pagespan_alloc(struct regstat *current,int i)
1866 {
1867   current->isconst=0;
1868   current->wasconst=0;
1869   regs[i].wasconst=0;
1870   alloc_all(current,i);
1871   alloc_cc(current,i);
1872   dirty_reg(current,CCREG);
1873   if(opcode[i]==3) // JAL
1874   {
1875     alloc_reg(current,i,31);
1876     dirty_reg(current,31);
1877   }
1878   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1879   {
1880     alloc_reg(current,i,rs1[i]);
1881     if (rt1[i]==31) {
1882       alloc_reg(current,i,31);
1883       dirty_reg(current,31);
1884     }
1885   }
1886   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1887   {
1888     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1889     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1890     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1891     {
1892       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1893       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1894     }
1895   }
1896   else
1897   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1898   {
1899     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1900     if(!((current->is32>>rs1[i])&1))
1901     {
1902       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1903     }
1904   }
1905   else
1906   if(opcode[i]==0x11) // BC1
1907   {
1908     alloc_reg(current,i,FSREG);
1909     alloc_reg(current,i,CSREG);
1910   }
1911   //else ...
1912 }
1913
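// Record an out-of-line stub to be generated later.  Each stubs[] entry
// holds the stub type, the address of the branch to patch (addr), the
// address to return to (retaddr) and up to five type-specific arguments.
// For the load/store stubs queued below these are typically the
// instruction index, the address register, a pointer to the register
// state, the cycle count adjustment and the live register list, e.g.
//   add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);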
1914 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1915 {
1916   stubs[stubcount][0]=type;
1917   stubs[stubcount][1]=addr;
1918   stubs[stubcount][2]=retaddr;
1919   stubs[stubcount][3]=a;
1920   stubs[stubcount][4]=b;
1921   stubs[stubcount][5]=c;
1922   stubs[stubcount][6]=d;
1923   stubs[stubcount][7]=e;
1924   stubcount++;
1925 }
1926
1927 // Write back a single register from its host register if the cached copy is dirty
1928 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1929 {
1930   int hr;
1931   for(hr=0;hr<HOST_REGS;hr++) {
1932     if(hr!=EXCLUDE_REG) {
1933       if((regmap[hr]&63)==r) {
1934         if((dirty>>hr)&1) {
1935           if(regmap[hr]<64) {
1936             emit_storereg(r,hr);
1937 #ifndef FORCE32
1938             if((is32>>regmap[hr])&1) {
1939               emit_sarimm(hr,31,hr);
1940               emit_storereg(r|64,hr);
1941             }
1942 #endif
1943           }else{
1944             emit_storereg(r|64,hr);
1945           }
1946         }
1947       }
1948     }
1949   }
1950 }
1951
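// Debug helper: checksum the first 8MB of RDRAM (2097152 32-bit words) by
// shifting the running sum left, feeding in the inverted top bit, and
// XORing in each word.  Used by the trace output in memdebug() below.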
1952 int mchecksum()
1953 {
1954   //if(!tracedebug) return 0;
1955   int i;
1956   int sum=0;
1957   for(i=0;i<2097152;i++) {
1958     unsigned int temp=sum;
1959     sum<<=1;
1960     sum|=(~temp)>>31;
1961     sum^=((u_int *)rdram)[i];
1962   }
1963   return sum;
1964 }
1965 int rchecksum()
1966 {
1967   int i;
1968   int sum=0;
1969   for(i=0;i<64;i++)
1970     sum^=((u_int *)reg)[i];
1971   return sum;
1972 }
1973 void rlist()
1974 {
1975   int i;
1976   printf("TRACE: ");
1977   for(i=0;i<32;i++)
1978     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1979   printf("\n");
1980 #ifndef DISABLE_COP1
1981   printf("TRACE: ");
1982   for(i=0;i<32;i++)
1983     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1984   printf("\n");
1985 #endif
1986 }
1987
1988 void enabletrace()
1989 {
1990   tracedebug=1;
1991 }
1992
1993 void memdebug(int i)
1994 {
1995   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1996   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1997   //rlist();
1998   //if(tracedebug) {
1999   //if(Count>=-2084597794) {
2000   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2001   //if(0) {
2002     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2003     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2004     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2005     rlist();
2006     #ifdef __i386__
2007     printf("TRACE: %x\n",(&i)[-1]);
2008     #endif
2009     #ifdef __arm__
2010     int j;
2011     printf("TRACE: %x \n",(&j)[10]);
2012     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2013     #endif
2014     //fflush(stdout);
2015   }
2016   //printf("TRACE: %x\n",(&i)[-1]);
2017 }
2018
2019 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2020 {
2021   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2022 }
2023
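// Emit code for register-register ALU operations.  opcode2 selects the op:
// 0x20-0x23 ADD/ADDU/SUB/SUBU, 0x2c-0x2f DADD/DADDU/DSUB/DSUBU (split into
// 32-bit halves), 0x2a-0x2b SLT/SLTU and 0x24-0x27 AND/OR/XOR/NOR.  When
// one source is r0 the operation degenerates into a move, negate or zero.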
2024 void alu_assemble(int i,struct regstat *i_regs)
2025 {
2026   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2027     if(rt1[i]) {
2028       signed char s1,s2,t;
2029       t=get_reg(i_regs->regmap,rt1[i]);
2030       if(t>=0) {
2031         s1=get_reg(i_regs->regmap,rs1[i]);
2032         s2=get_reg(i_regs->regmap,rs2[i]);
2033         if(rs1[i]&&rs2[i]) {
2034           assert(s1>=0);
2035           assert(s2>=0);
2036           if(opcode2[i]&2) emit_sub(s1,s2,t);
2037           else emit_add(s1,s2,t);
2038         }
2039         else if(rs1[i]) {
2040           if(s1>=0) emit_mov(s1,t);
2041           else emit_loadreg(rs1[i],t);
2042         }
2043         else if(rs2[i]) {
2044           if(s2>=0) {
2045             if(opcode2[i]&2) emit_neg(s2,t);
2046             else emit_mov(s2,t);
2047           }
2048           else {
2049             emit_loadreg(rs2[i],t);
2050             if(opcode2[i]&2) emit_neg(t,t);
2051           }
2052         }
2053         else emit_zeroreg(t);
2054       }
2055     }
2056   }
2057   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2058     if(rt1[i]) {
2059       signed char s1l,s2l,s1h,s2h,tl,th;
2060       tl=get_reg(i_regs->regmap,rt1[i]);
2061       th=get_reg(i_regs->regmap,rt1[i]|64);
2062       if(tl>=0) {
2063         s1l=get_reg(i_regs->regmap,rs1[i]);
2064         s2l=get_reg(i_regs->regmap,rs2[i]);
2065         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2067         if(rs1[i]&&rs2[i]) {
2068           assert(s1l>=0);
2069           assert(s2l>=0);
2070           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2071           else emit_adds(s1l,s2l,tl);
2072           if(th>=0) {
2073             #ifdef INVERTED_CARRY
2074             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2075             #else
2076             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2077             #endif
2078             else emit_add(s1h,s2h,th);
2079           }
2080         }
2081         else if(rs1[i]) {
2082           if(s1l>=0) emit_mov(s1l,tl);
2083           else emit_loadreg(rs1[i],tl);
2084           if(th>=0) {
2085             if(s1h>=0) emit_mov(s1h,th);
2086             else emit_loadreg(rs1[i]|64,th);
2087           }
2088         }
2089         else if(rs2[i]) {
2090           if(s2l>=0) {
2091             if(opcode2[i]&2) emit_negs(s2l,tl);
2092             else emit_mov(s2l,tl);
2093           }
2094           else {
2095             emit_loadreg(rs2[i],tl);
2096             if(opcode2[i]&2) emit_negs(tl,tl);
2097           }
2098           if(th>=0) {
2099             #ifdef INVERTED_CARRY
2100             if(s2h>=0) emit_mov(s2h,th);
2101             else emit_loadreg(rs2[i]|64,th);
2102             if(opcode2[i]&2) {
2103               emit_adcimm(-1,th); // x86 has inverted carry flag
2104               emit_not(th,th);
2105             }
2106             #else
2107             if(opcode2[i]&2) {
2108               if(s2h>=0) emit_rscimm(s2h,0,th);
2109               else {
2110                 emit_loadreg(rs2[i]|64,th);
2111                 emit_rscimm(th,0,th);
2112               }
2113             }else{
2114               if(s2h>=0) emit_mov(s2h,th);
2115               else emit_loadreg(rs2[i]|64,th);
2116             }
2117             #endif
2118           }
2119         }
2120         else {
2121           emit_zeroreg(tl);
2122           if(th>=0) emit_zeroreg(th);
2123         }
2124       }
2125     }
2126   }
2127   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2128     if(rt1[i]) {
2129       signed char s1l,s1h,s2l,s2h,t;
2130       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2131       {
2132         t=get_reg(i_regs->regmap,rt1[i]);
2133         //assert(t>=0);
2134         if(t>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs2[i]==0) // rx<r0
2140           {
2141             assert(s1h>=0);
2142             if(opcode2[i]==0x2a) // SLT
2143               emit_shrimm(s1h,31,t);
2144             else // SLTU (unsigned cannot be less than zero)
2145               emit_zeroreg(t);
2146           }
2147           else if(rs1[i]==0) // r0<rx
2148           {
2149             assert(s2h>=0);
2150             if(opcode2[i]==0x2a) // SLT
2151               emit_set_gz64_32(s2h,s2l,t);
2152             else // SLTU (set if not zero)
2153               emit_set_nz64_32(s2h,s2l,t);
2154           }
2155           else {
2156             assert(s1l>=0);assert(s1h>=0);
2157             assert(s2l>=0);assert(s2h>=0);
2158             if(opcode2[i]==0x2a) // SLT
2159               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2160             else // SLTU
2161               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2162           }
2163         }
2164       } else {
2165         t=get_reg(i_regs->regmap,rt1[i]);
2166         //assert(t>=0);
2167         if(t>=0) {
2168           s1l=get_reg(i_regs->regmap,rs1[i]);
2169           s2l=get_reg(i_regs->regmap,rs2[i]);
2170           if(rs2[i]==0) // rx<r0
2171           {
2172             assert(s1l>=0);
2173             if(opcode2[i]==0x2a) // SLT
2174               emit_shrimm(s1l,31,t);
2175             else // SLTU (unsigned cannot be less than zero)
2176               emit_zeroreg(t);
2177           }
2178           else if(rs1[i]==0) // r0<rx
2179           {
2180             assert(s2l>=0);
2181             if(opcode2[i]==0x2a) // SLT
2182               emit_set_gz32(s2l,t);
2183             else // SLTU (set if not zero)
2184               emit_set_nz32(s2l,t);
2185           }
2186           else{
2187             assert(s1l>=0);assert(s2l>=0);
2188             if(opcode2[i]==0x2a) // SLT
2189               emit_set_if_less32(s1l,s2l,t);
2190             else // SLTU
2191               emit_set_if_carry32(s1l,s2l,t);
2192           }
2193         }
2194       }
2195     }
2196   }
2197   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2198     if(rt1[i]) {
2199       signed char s1l,s1h,s2l,s2h,th,tl;
2200       tl=get_reg(i_regs->regmap,rt1[i]);
2201       th=get_reg(i_regs->regmap,rt1[i]|64);
2202       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2203       {
2204         assert(tl>=0);
2205         if(tl>=0) {
2206           s1l=get_reg(i_regs->regmap,rs1[i]);
2207           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2208           s2l=get_reg(i_regs->regmap,rs2[i]);
2209           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2210           if(rs1[i]&&rs2[i]) {
2211             assert(s1l>=0);assert(s1h>=0);
2212             assert(s2l>=0);assert(s2h>=0);
2213             if(opcode2[i]==0x24) { // AND
2214               emit_and(s1l,s2l,tl);
2215               emit_and(s1h,s2h,th);
2216             } else
2217             if(opcode2[i]==0x25) { // OR
2218               emit_or(s1l,s2l,tl);
2219               emit_or(s1h,s2h,th);
2220             } else
2221             if(opcode2[i]==0x26) { // XOR
2222               emit_xor(s1l,s2l,tl);
2223               emit_xor(s1h,s2h,th);
2224             } else
2225             if(opcode2[i]==0x27) { // NOR
2226               emit_or(s1l,s2l,tl);
2227               emit_or(s1h,s2h,th);
2228               emit_not(tl,tl);
2229               emit_not(th,th);
2230             }
2231           }
2232           else
2233           {
2234             if(opcode2[i]==0x24) { // AND
2235               emit_zeroreg(tl);
2236               emit_zeroreg(th);
2237             } else
2238             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2239               if(rs1[i]){
2240                 if(s1l>=0) emit_mov(s1l,tl);
2241                 else emit_loadreg(rs1[i],tl);
2242                 if(s1h>=0) emit_mov(s1h,th);
2243                 else emit_loadreg(rs1[i]|64,th);
2244               }
2245               else
2246               if(rs2[i]){
2247                 if(s2l>=0) emit_mov(s2l,tl);
2248                 else emit_loadreg(rs2[i],tl);
2249                 if(s2h>=0) emit_mov(s2h,th);
2250                 else emit_loadreg(rs2[i]|64,th);
2251               }
2252               else{
2253                 emit_zeroreg(tl);
2254                 emit_zeroreg(th);
2255               }
2256             } else
2257             if(opcode2[i]==0x27) { // NOR
2258               if(rs1[i]){
2259                 if(s1l>=0) emit_not(s1l,tl);
2260                 else{
2261                   emit_loadreg(rs1[i],tl);
2262                   emit_not(tl,tl);
2263                 }
2264                 if(s1h>=0) emit_not(s1h,th);
2265                 else{
2266                   emit_loadreg(rs1[i]|64,th);
2267                   emit_not(th,th);
2268                 }
2269               }
2270               else
2271               if(rs2[i]){
2272                 if(s2l>=0) emit_not(s2l,tl);
2273                 else{
2274                   emit_loadreg(rs2[i],tl);
2275                   emit_not(tl,tl);
2276                 }
2277                 if(s2h>=0) emit_not(s2h,th);
2278                 else{
2279                   emit_loadreg(rs2[i]|64,th);
2280                   emit_not(th,th);
2281                 }
2282               }
2283               else {
2284                 emit_movimm(-1,tl);
2285                 emit_movimm(-1,th);
2286               }
2287             }
2288           }
2289         }
2290       }
2291       else
2292       {
2293         // 32 bit
2294         if(tl>=0) {
2295           s1l=get_reg(i_regs->regmap,rs1[i]);
2296           s2l=get_reg(i_regs->regmap,rs2[i]);
2297           if(rs1[i]&&rs2[i]) {
2298             assert(s1l>=0);
2299             assert(s2l>=0);
2300             if(opcode2[i]==0x24) { // AND
2301               emit_and(s1l,s2l,tl);
2302             } else
2303             if(opcode2[i]==0x25) { // OR
2304               emit_or(s1l,s2l,tl);
2305             } else
2306             if(opcode2[i]==0x26) { // XOR
2307               emit_xor(s1l,s2l,tl);
2308             } else
2309             if(opcode2[i]==0x27) { // NOR
2310               emit_or(s1l,s2l,tl);
2311               emit_not(tl,tl);
2312             }
2313           }
2314           else
2315           {
2316             if(opcode2[i]==0x24) { // AND
2317               emit_zeroreg(tl);
2318             } else
2319             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2320               if(rs1[i]){
2321                 if(s1l>=0) emit_mov(s1l,tl);
2322                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2323               }
2324               else
2325               if(rs2[i]){
2326                 if(s2l>=0) emit_mov(s2l,tl);
2327                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2328               }
2329               else emit_zeroreg(tl);
2330             } else
2331             if(opcode2[i]==0x27) { // NOR
2332               if(rs1[i]){
2333                 if(s1l>=0) emit_not(s1l,tl);
2334                 else {
2335                   emit_loadreg(rs1[i],tl);
2336                   emit_not(tl,tl);
2337                 }
2338               }
2339               else
2340               if(rs2[i]){
2341                 if(s2l>=0) emit_not(s2l,tl);
2342                 else {
2343                   emit_loadreg(rs2[i],tl);
2344                   emit_not(tl,tl);
2345                 }
2346               }
2347               else emit_movimm(-1,tl);
2348             }
2349           }
2350         }
2351       }
2352     }
2353   }
2354 }
2355
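// Emit code for ALU operations with a 16-bit immediate: LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU and ANDI/ORI/XORI.  If the source register was
// known to be constant (wasconst), the result is folded at assembly time
// and loaded with emit_movimm() instead of being computed at run time.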
2356 void imm16_assemble(int i,struct regstat *i_regs)
2357 {
2358   if (opcode[i]==0x0f) { // LUI
2359     if(rt1[i]) {
2360       signed char t;
2361       t=get_reg(i_regs->regmap,rt1[i]);
2362       //assert(t>=0);
2363       if(t>=0) {
2364         if(!((i_regs->isconst>>t)&1))
2365           emit_movimm(imm[i]<<16,t);
2366       }
2367     }
2368   }
2369   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2370     if(rt1[i]) {
2371       signed char s,t;
2372       t=get_reg(i_regs->regmap,rt1[i]);
2373       s=get_reg(i_regs->regmap,rs1[i]);
2374       if(rs1[i]) {
2375         //assert(t>=0);
2376         //assert(s>=0);
2377         if(t>=0) {
2378           if(!((i_regs->isconst>>t)&1)) {
2379             if(s<0) {
2380               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2381               emit_addimm(t,imm[i],t);
2382             }else{
2383               if(!((i_regs->wasconst>>s)&1))
2384                 emit_addimm(s,imm[i],t);
2385               else
2386                 emit_movimm(constmap[i][s]+imm[i],t);
2387             }
2388           }
2389         }
2390       } else {
2391         if(t>=0) {
2392           if(!((i_regs->isconst>>t)&1))
2393             emit_movimm(imm[i],t);
2394         }
2395       }
2396     }
2397   }
2398   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2399     if(rt1[i]) {
2400       signed char sh,sl,th,tl;
2401       th=get_reg(i_regs->regmap,rt1[i]|64);
2402       tl=get_reg(i_regs->regmap,rt1[i]);
2403       sh=get_reg(i_regs->regmap,rs1[i]|64);
2404       sl=get_reg(i_regs->regmap,rs1[i]);
2405       if(tl>=0) {
2406         if(rs1[i]) {
2407           assert(sh>=0);
2408           assert(sl>=0);
2409           if(th>=0) {
2410             emit_addimm64_32(sh,sl,imm[i],th,tl);
2411           }
2412           else {
2413             emit_addimm(sl,imm[i],tl);
2414           }
2415         } else {
2416           emit_movimm(imm[i],tl);
2417           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2418         }
2419       }
2420     }
2421   }
2422   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2423     if(rt1[i]) {
2424       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2425       signed char sh,sl,t;
2426       t=get_reg(i_regs->regmap,rt1[i]);
2427       sh=get_reg(i_regs->regmap,rs1[i]|64);
2428       sl=get_reg(i_regs->regmap,rs1[i]);
2429       //assert(t>=0);
2430       if(t>=0) {
2431         if(rs1[i]>0) {
2432           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2433           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2434             if(opcode[i]==0x0a) { // SLTI
2435               if(sl<0) {
2436                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2437                 emit_slti32(t,imm[i],t);
2438               }else{
2439                 emit_slti32(sl,imm[i],t);
2440               }
2441             }
2442             else { // SLTIU
2443               if(sl<0) {
2444                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2445                 emit_sltiu32(t,imm[i],t);
2446               }else{
2447                 emit_sltiu32(sl,imm[i],t);
2448               }
2449             }
2450           }else{ // 64-bit
2451             assert(sl>=0);
2452             if(opcode[i]==0x0a) // SLTI
2453               emit_slti64_32(sh,sl,imm[i],t);
2454             else // SLTIU
2455               emit_sltiu64_32(sh,sl,imm[i],t);
2456           }
2457         }else{
2458           // SLTI(U) with r0 makes little sense,
2459           // but it does turn up in real code
2460           if(opcode[i]==0x0a) { // SLTI
2461             if(0<imm[i]) emit_movimm(1,t);
2462             else emit_zeroreg(t);
2463           } else // SLTIU
2464           {
2465             if(imm[i]) emit_movimm(1,t);
2466             else emit_zeroreg(t);
2467           }
2468         }
2469       }
2470     }
2471   }
2472   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2473     if(rt1[i]) {
2474       signed char sh,sl,th,tl;
2475       th=get_reg(i_regs->regmap,rt1[i]|64);
2476       tl=get_reg(i_regs->regmap,rt1[i]);
2477       sh=get_reg(i_regs->regmap,rs1[i]|64);
2478       sl=get_reg(i_regs->regmap,rs1[i]);
2479       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2480         if(opcode[i]==0x0c) //ANDI
2481         {
2482           if(rs1[i]) {
2483             if(sl<0) {
2484               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2485               emit_andimm(tl,imm[i],tl);
2486             }else{
2487               if(!((i_regs->wasconst>>sl)&1))
2488                 emit_andimm(sl,imm[i],tl);
2489               else
2490                 emit_movimm(constmap[i][sl]&imm[i],tl);
2491             }
2492           }
2493           else
2494             emit_zeroreg(tl);
2495           if(th>=0) emit_zeroreg(th);
2496         }
2497         else
2498         {
2499           if(rs1[i]) {
2500             if(sl<0) {
2501               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2502             }
2503             if(th>=0) {
2504               if(sh<0) {
2505                 emit_loadreg(rs1[i]|64,th);
2506               }else{
2507                 emit_mov(sh,th);
2508               }
2509             }
2510             if(opcode[i]==0x0d) //ORI
2511             if(sl<0) {
2512               emit_orimm(tl,imm[i],tl);
2513             }else{
2514               if(!((i_regs->wasconst>>sl)&1))
2515                 emit_orimm(sl,imm[i],tl);
2516               else
2517                 emit_movimm(constmap[i][sl]|imm[i],tl);
2518             }
2519             if(opcode[i]==0x0e) //XORI
2520             if(sl<0) {
2521               emit_xorimm(tl,imm[i],tl);
2522             }else{
2523               if(!((i_regs->wasconst>>sl)&1))
2524                 emit_xorimm(sl,imm[i],tl);
2525               else
2526                 emit_movimm(constmap[i][sl]^imm[i],tl);
2527             }
2528           }
2529           else {
2530             emit_movimm(imm[i],tl);
2531             if(th>=0) emit_zeroreg(th);
2532           }
2533         }
2534       }
2535     }
2536   }
2537 }
2538
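// Shifts by a constant amount: SLL/SRL/SRA, the 64-bit DSLL/DSRL/DSRA
// pairs operating on two host registers, and the DSLL32/DSRL32/DSRA32
// forms whose effective shift amount is 32 or more.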
2539 void shiftimm_assemble(int i,struct regstat *i_regs)
2540 {
2541   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2542   {
2543     if(rt1[i]) {
2544       signed char s,t;
2545       t=get_reg(i_regs->regmap,rt1[i]);
2546       s=get_reg(i_regs->regmap,rs1[i]);
2547       //assert(t>=0);
2548       if(t>=0){
2549         if(rs1[i]==0)
2550         {
2551           emit_zeroreg(t);
2552         }
2553         else
2554         {
2555           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2556           if(imm[i]) {
2557             if(opcode2[i]==0) // SLL
2558             {
2559               emit_shlimm(s<0?t:s,imm[i],t);
2560             }
2561             if(opcode2[i]==2) // SRL
2562             {
2563               emit_shrimm(s<0?t:s,imm[i],t);
2564             }
2565             if(opcode2[i]==3) // SRA
2566             {
2567               emit_sarimm(s<0?t:s,imm[i],t);
2568             }
2569           }else{
2570             // Shift by zero
2571             if(s>=0 && s!=t) emit_mov(s,t);
2572           }
2573         }
2574       }
2575       //emit_storereg(rt1[i],t); //DEBUG
2576     }
2577   }
2578   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2579   {
2580     if(rt1[i]) {
2581       signed char sh,sl,th,tl;
2582       th=get_reg(i_regs->regmap,rt1[i]|64);
2583       tl=get_reg(i_regs->regmap,rt1[i]);
2584       sh=get_reg(i_regs->regmap,rs1[i]|64);
2585       sl=get_reg(i_regs->regmap,rs1[i]);
2586       if(tl>=0) {
2587         if(rs1[i]==0)
2588         {
2589           emit_zeroreg(tl);
2590           if(th>=0) emit_zeroreg(th);
2591         }
2592         else
2593         {
2594           assert(sl>=0);
2595           assert(sh>=0);
2596           if(imm[i]) {
2597             if(opcode2[i]==0x38) // DSLL
2598             {
2599               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2600               emit_shlimm(sl,imm[i],tl);
2601             }
2602             if(opcode2[i]==0x3a) // DSRL
2603             {
2604               emit_shrdimm(sl,sh,imm[i],tl);
2605               if(th>=0) emit_shrimm(sh,imm[i],th);
2606             }
2607             if(opcode2[i]==0x3b) // DSRA
2608             {
2609               emit_shrdimm(sl,sh,imm[i],tl);
2610               if(th>=0) emit_sarimm(sh,imm[i],th);
2611             }
2612           }else{
2613             // Shift by zero
2614             if(sl!=tl) emit_mov(sl,tl);
2615             if(th>=0&&sh!=th) emit_mov(sh,th);
2616           }
2617         }
2618       }
2619     }
2620   }
2621   if(opcode2[i]==0x3c) // DSLL32
2622   {
2623     if(rt1[i]) {
2624       signed char sl,tl,th;
2625       tl=get_reg(i_regs->regmap,rt1[i]);
2626       th=get_reg(i_regs->regmap,rt1[i]|64);
2627       sl=get_reg(i_regs->regmap,rs1[i]);
2628       if(th>=0||tl>=0){
2629         assert(tl>=0);
2630         assert(th>=0);
2631         assert(sl>=0);
2632         emit_mov(sl,th);
2633         emit_zeroreg(tl);
2634         if(imm[i]>32)
2635         {
2636           emit_shlimm(th,imm[i]&31,th);
2637         }
2638       }
2639     }
2640   }
2641   if(opcode2[i]==0x3e) // DSRL32
2642   {
2643     if(rt1[i]) {
2644       signed char sh,tl,th;
2645       tl=get_reg(i_regs->regmap,rt1[i]);
2646       th=get_reg(i_regs->regmap,rt1[i]|64);
2647       sh=get_reg(i_regs->regmap,rs1[i]|64);
2648       if(tl>=0){
2649         assert(sh>=0);
2650         emit_mov(sh,tl);
2651         if(th>=0) emit_zeroreg(th);
2652         if(imm[i]>32)
2653         {
2654           emit_shrimm(tl,imm[i]&31,tl);
2655         }
2656       }
2657     }
2658   }
2659   if(opcode2[i]==0x3f) // DSRA32
2660   {
2661     if(rt1[i]) {
2662       signed char sh,tl;
2663       tl=get_reg(i_regs->regmap,rt1[i]);
2664       sh=get_reg(i_regs->regmap,rs1[i]|64);
2665       if(tl>=0){
2666         assert(sh>=0);
2667         emit_mov(sh,tl);
2668         if(imm[i]>32)
2669         {
2670           emit_sarimm(tl,imm[i]&31,tl);
2671         }
2672       }
2673     }
2674   }
2675 }
2676
2677 #ifndef shift_assemble
2678 void shift_assemble(int i,struct regstat *i_regs)
2679 {
2680   printf("Need shift_assemble for this architecture.\n");
2681   exit(1);
2682 }
2683 #endif
2684
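// Emit code for a load.  On the fast path the (possibly constant) address
// is range-checked with emit_cmpimm(addr,0x800000) and the access is
// performed inline; the conditional branch recorded in jaddr is routed to
// the matching LOAD*_STUB via add_stub() for addresses outside RAM.  Loads
// from a known out-of-range constant address go through inline_readstub().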
2685 void load_assemble(int i,struct regstat *i_regs)
2686 {
2687   int s,th,tl,addr,map=-1;
2688   int offset;
2689   int jaddr=0;
2690   int memtarget,c=0;
2691   u_int hr,reglist=0;
2692   th=get_reg(i_regs->regmap,rt1[i]|64);
2693   tl=get_reg(i_regs->regmap,rt1[i]);
2694   s=get_reg(i_regs->regmap,rs1[i]);
2695   offset=imm[i];
2696   for(hr=0;hr<HOST_REGS;hr++) {
2697     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2698   }
2699   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2700   if(s>=0) {
2701     c=(i_regs->wasconst>>s)&1;
2702     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2703     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2704   }
2705   if(offset||s<0||c) addr=tl;
2706   else addr=s;
2707   //printf("load_assemble: c=%d\n",c);
2708   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2709   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2710   if(tl>=0) {
2711     //assert(tl>=0);
2712     //assert(rt1[i]);
2713     reglist&=~(1<<tl);
2714     if(th>=0) reglist&=~(1<<th);
2715     if(!using_tlb) {
2716       if(!c) {
2717 //#define R29_HACK 1
2718         #ifdef R29_HACK
2719         // Strmnnrmn's speed hack
2720         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2721         #endif
2722         {
2723           emit_cmpimm(addr,0x800000);
2724           jaddr=(int)out;
2725           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2726           // Hint to branch predictor that the branch is unlikely to be taken
2727           if(rs1[i]>=28)
2728             emit_jno_unlikely(0);
2729           else
2730           #endif
2731           emit_jno(0);
2732         }
2733       }
2734     }else{ // using tlb
2735       int x=0;
2736       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2737       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2738       map=get_reg(i_regs->regmap,TLREG);
2739       assert(map>=0);
2740       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2741       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2742     }
2743     if (opcode[i]==0x20) { // LB
2744       if(!c||memtarget) {
2745         #ifdef HOST_IMM_ADDR32
2746         if(c)
2747           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2748         else
2749         #endif
2750         {
2751           //emit_xorimm(addr,3,tl);
2752           //gen_tlb_addr_r(tl,map);
2753           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2754           int x=0;
2755 #ifdef BIG_ENDIAN_MIPS
2756           if(!c) emit_xorimm(addr,3,tl);
2757           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2758 #else
2759           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2760           else if (tl!=addr) emit_mov(addr,tl);
2761 #endif
2762           emit_movsbl_indexed_tlb(x,tl,map,tl);
2763         }
2764         if(jaddr)
2765           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2766       }
2767       else
2768         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2769     }
2770     if (opcode[i]==0x21) { // LH
2771       if(!c||memtarget) {
2772         #ifdef HOST_IMM_ADDR32
2773         if(c)
2774           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2775         else
2776         #endif
2777         {
2778           int x=0;
2779 #ifdef BIG_ENDIAN_MIPS
2780           if(!c) emit_xorimm(addr,2,tl);
2781           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2782 #else
2783           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2784           else if (tl!=addr) emit_mov(addr,tl);
2785 #endif
2786           //#ifdef
2787           //emit_movswl_indexed_tlb(x,tl,map,tl);
2788           //else
2789           if(map>=0) {
2790             gen_tlb_addr_r(tl,map);
2791             emit_movswl_indexed(x,tl,tl);
2792           }else
2793             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2794         }
2795         if(jaddr)
2796           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2797       }
2798       else
2799         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2800     }
2801     if (opcode[i]==0x23) { // LW
2802       if(!c||memtarget) {
2803         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2804         #ifdef HOST_IMM_ADDR32
2805         if(c)
2806           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2807         else
2808         #endif
2809         emit_readword_indexed_tlb(0,addr,map,tl);
2810         if(jaddr)
2811           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2812       }
2813       else
2814         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2815     }
2816     if (opcode[i]==0x24) { // LBU
2817       if(!c||memtarget) {
2818         #ifdef HOST_IMM_ADDR32
2819         if(c)
2820           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2821         else
2822         #endif
2823         {
2824           //emit_xorimm(addr,3,tl);
2825           //gen_tlb_addr_r(tl,map);
2826           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2827           int x=0;
2828 #ifdef BIG_ENDIAN_MIPS
2829           if(!c) emit_xorimm(addr,3,tl);
2830           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2831 #else
2832           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2833           else if (tl!=addr) emit_mov(addr,tl);
2834 #endif
2835           emit_movzbl_indexed_tlb(x,tl,map,tl);
2836         }
2837         if(jaddr)
2838           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2839       }
2840       else
2841         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2842     }
2843     if (opcode[i]==0x25) { // LHU
2844       if(!c||memtarget) {
2845         #ifdef HOST_IMM_ADDR32
2846         if(c)
2847           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2848         else
2849         #endif
2850         {
2851           int x=0;
2852 #ifdef BIG_ENDIAN_MIPS
2853           if(!c) emit_xorimm(addr,2,tl);
2854           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2855 #else
2856           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2857           else if (tl!=addr) emit_mov(addr,tl);
2858 #endif
2859           //#ifdef
2860           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2861           //#else
2862           if(map>=0) {
2863             gen_tlb_addr_r(tl,map);
2864             emit_movzwl_indexed(x,tl,tl);
2865           }else
2866             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2867           if(jaddr)
2868             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2869         }
2870       }
2871       else
2872         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2873     }
2874     if (opcode[i]==0x27) { // LWU
2875       assert(th>=0);
2876       if(!c||memtarget) {
2877         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2878         #ifdef HOST_IMM_ADDR32
2879         if(c)
2880           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2881         else
2882         #endif
2883         emit_readword_indexed_tlb(0,addr,map,tl);
2884         if(jaddr)
2885           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2886       }
2887       else {
2888         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2889       }
2890       emit_zeroreg(th);
2891     }
2892     if (opcode[i]==0x37) { // LD
2893       if(!c||memtarget) {
2894         //gen_tlb_addr_r(tl,map);
2895         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2896         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2897         #ifdef HOST_IMM_ADDR32
2898         if(c)
2899           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2900         else
2901         #endif
2902         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2903         if(jaddr)
2904           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2905       }
2906       else
2907         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2908     }
2909     //emit_storereg(rt1[i],tl); // DEBUG
2910   }
2911   //if(opcode[i]==0x23)
2912   //if(opcode[i]==0x24)
2913   //if(opcode[i]==0x23||opcode[i]==0x24)
2914   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2915   {
2916     //emit_pusha();
2917     save_regs(0x100f);
2918         emit_readword((int)&last_count,ECX);
2919         #ifdef __i386__
2920         if(get_reg(i_regs->regmap,CCREG)<0)
2921           emit_loadreg(CCREG,HOST_CCREG);
2922         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2923         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2924         emit_writeword(HOST_CCREG,(int)&Count);
2925         #endif
2926         #ifdef __arm__
2927         if(get_reg(i_regs->regmap,CCREG)<0)
2928           emit_loadreg(CCREG,0);
2929         else
2930           emit_mov(HOST_CCREG,0);
2931         emit_add(0,ECX,0);
2932         emit_addimm(0,2*ccadj[i],0);
2933         emit_writeword(0,(int)&Count);
2934         #endif
2935     emit_call((int)memdebug);
2936     //emit_popa();
2937     restore_regs(0x100f);
2938   }/**/
2939 }
2940
2941 #ifndef loadlr_assemble
2942 void loadlr_assemble(int i,struct regstat *i_regs)
2943 {
2944   printf("Need loadlr_assemble for this architecture.\n");
2945   exit(1);
2946 }
2947 #endif
2948
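// Emit code for a store.  The structure mirrors load_assemble(); in
// addition, when not using the TLB the target address is checked against
// the invalid_code table (indexed by the address shifted right by 12, as
// the *_indexedsr12 helper names suggest) and an INVCODE_STUB is queued so
// that writes into previously translated code can invalidate those blocks.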
2949 void store_assemble(int i,struct regstat *i_regs)
2950 {
2951   int s,th,tl,map=-1;
2952   int addr,temp;
2953   int offset;
2954   int jaddr=0,jaddr2,type;
2955   int memtarget,c=0;
2956   int agr=AGEN1+(i&1);
2957   u_int hr,reglist=0;
2958   th=get_reg(i_regs->regmap,rs2[i]|64);
2959   tl=get_reg(i_regs->regmap,rs2[i]);
2960   s=get_reg(i_regs->regmap,rs1[i]);
2961   temp=get_reg(i_regs->regmap,agr);
2962   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2963   offset=imm[i];
2964   if(s>=0) {
2965     c=(i_regs->wasconst>>s)&1;
2966     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2967     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2968   }
2969   assert(tl>=0);
2970   assert(temp>=0);
2971   for(hr=0;hr<HOST_REGS;hr++) {
2972     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2973   }
2974   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2975   if(offset||s<0||c) addr=temp;
2976   else addr=s;
2977   if(!using_tlb) {
2978     if(!c) {
2979       #ifdef R29_HACK
2980       // Strmnnrmn's speed hack
2981       memtarget=1;
2982       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2983       #endif
2984       emit_cmpimm(addr,0x800000);
2985       #ifdef DESTRUCTIVE_SHIFT
2986       if(s==addr) emit_mov(s,temp);
2987       #endif
2988       #ifdef R29_HACK
2989       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2990       #endif
2991       {
2992         jaddr=(int)out;
2993         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2994         // Hint to branch predictor that the branch is unlikely to be taken
2995         if(rs1[i]>=28)
2996           emit_jno_unlikely(0);
2997         else
2998         #endif
2999         emit_jno(0);
3000       }
3001     }
3002   }else{ // using tlb
3003     int x=0;
3004     if (opcode[i]==0x28) x=3; // SB
3005     if (opcode[i]==0x29) x=2; // SH
3006     map=get_reg(i_regs->regmap,TLREG);
3007     assert(map>=0);
3008     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3009     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3010   }
3011
3012   if (opcode[i]==0x28) { // SB
3013     if(!c||memtarget) {
3014       int x=0;
3015 #ifdef BIG_ENDIAN_MIPS
3016       if(!c) emit_xorimm(addr,3,temp);
3017       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3018 #else
3019       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3020       else if (addr!=temp) emit_mov(addr,temp);
3021 #endif
3022       //gen_tlb_addr_w(temp,map);
3023       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3024       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
3025     }
3026     type=STOREB_STUB;
3027   }
3028   if (opcode[i]==0x29) { // SH
3029     if(!c||memtarget) {
3030       int x=0;
3031 #ifdef BIG_ENDIAN_MIPS
3032       if(!c) emit_xorimm(addr,2,temp);
3033       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3034 #else
3035       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3036       else if (addr!=temp) emit_mov(addr,temp);
3037 #endif
3038       //#ifdef
3039       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3040       //#else
3041       if(map>=0) {
3042         gen_tlb_addr_w(temp,map);
3043         emit_writehword_indexed(tl,x,temp);
3044       }else
3045         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3046     }
3047     type=STOREH_STUB;
3048   }
3049   if (opcode[i]==0x2B) { // SW
3050     if(!c||memtarget)
3051       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3052       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3053     type=STOREW_STUB;
3054   }
3055   if (opcode[i]==0x3F) { // SD
3056     if(!c||memtarget) {
3057       if(rs2[i]) {
3058         assert(th>=0);
3059         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3060         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3061         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3062       }else{
3063         // Store zero
3064         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3065         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3066         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3067       }
3068     }
3069     type=STORED_STUB;
3070   }
3071   if(jaddr) {
3072     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3073   } else if(!memtarget) {
3074     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3075   }
3076   if(!using_tlb) {
3077     if(!c||memtarget) {
3078       #ifdef DESTRUCTIVE_SHIFT
3079       // The x86 shift operation is 'destructive'; it overwrites the
3080       // source register, so we need to make a copy first and use that.
3081       addr=temp;
3082       #endif
3083       #if defined(HOST_IMM8)
3084       int ir=get_reg(i_regs->regmap,INVCP);
3085       assert(ir>=0);
3086       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3087       #else
3088       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3089       #endif
3090       jaddr2=(int)out;
3091       emit_jne(0);
3092       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3093     }
3094   }
3095   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3096   //if(opcode[i]==0x2B || opcode[i]==0x28)
3097   //if(opcode[i]==0x2B || opcode[i]==0x29)
3098   //if(opcode[i]==0x2B)
3099   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3100   {
3101     //emit_pusha();
3102     save_regs(0x100f);
3103         emit_readword((int)&last_count,ECX);
3104         #ifdef __i386__
3105         if(get_reg(i_regs->regmap,CCREG)<0)
3106           emit_loadreg(CCREG,HOST_CCREG);
3107         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3108         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3109         emit_writeword(HOST_CCREG,(int)&Count);
3110         #endif
3111         #ifdef __arm__
3112         if(get_reg(i_regs->regmap,CCREG)<0)
3113           emit_loadreg(CCREG,0);
3114         else
3115           emit_mov(HOST_CCREG,0);
3116         emit_add(0,ECX,0);
3117         emit_addimm(0,2*ccadj[i],0);
3118         emit_writeword(0,(int)&Count);
3119         #endif
3120     emit_call((int)memdebug);
3121     //emit_popa();
3122     restore_regs(0x100f);
3123   }/**/
3124 }
3125
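// Unaligned stores (SWL/SWR/SDL/SDR).  The two low bits of the address are
// tested with emit_testimm() to select one of four byte-offset cases; each
// case writes the appropriate mix of bytes and halfwords, and the SDL/SDR
// forms additionally assemble the second word of the doubleword in temp2.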
3126 void storelr_assemble(int i,struct regstat *i_regs)
3127 {
3128   int s,th,tl;
3129   int temp;
3130   int temp2;
3131   int offset;
3132   int jaddr=0,jaddr2;
3133   int case1,case2,case3;
3134   int done0,done1,done2;
3135   int memtarget,c=0;
3136   u_int hr,reglist=0;
3137   th=get_reg(i_regs->regmap,rs2[i]|64);
3138   tl=get_reg(i_regs->regmap,rs2[i]);
3139   s=get_reg(i_regs->regmap,rs1[i]);
3140   temp=get_reg(i_regs->regmap,-1);
3141   offset=imm[i];
3142   if(s>=0) {
3143     c=(i_regs->isconst>>s)&1;
3144     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3145     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3146   }
3147   assert(tl>=0);
3148   for(hr=0;hr<HOST_REGS;hr++) {
3149     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3150   }
3151   if(tl>=0) {
3152     assert(temp>=0);
3153     if(!using_tlb) {
3154       if(!c) {
3155         emit_cmpimm(s<0||offset?temp:s,0x800000);
3156         if(!offset&&s!=temp) emit_mov(s,temp);
3157         jaddr=(int)out;
3158         emit_jno(0);
3159       }
3160       else
3161       {
3162         if(!memtarget||!rs1[i]) {
3163           jaddr=(int)out;
3164           emit_jmp(0);
3165         }
3166       }
3167       if((u_int)rdram!=0x80000000) 
3168         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3169     }else{ // using tlb
3170       int map=get_reg(i_regs->regmap,TLREG);
3171       assert(map>=0);
3172       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3173       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3174       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3175       if(!jaddr&&!memtarget) {
3176         jaddr=(int)out;
3177         emit_jmp(0);
3178       }
3179       gen_tlb_addr_w(temp,map);
3180     }
3181
3182     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3183       temp2=get_reg(i_regs->regmap,FTEMP);
3184       if(!rs2[i]) temp2=th=tl;
3185     }
3186
3187 #ifndef BIG_ENDIAN_MIPS
3188     emit_xorimm(temp,3,temp);
3189 #endif
3190     emit_testimm(temp,2);
3191     case2=(int)out;
3192     emit_jne(0);
3193     emit_testimm(temp,1);
3194     case1=(int)out;
3195     emit_jne(0);
3196     // 0
3197     if (opcode[i]==0x2A) { // SWL
3198       emit_writeword_indexed(tl,0,temp);
3199     }
3200     if (opcode[i]==0x2E) { // SWR
3201       emit_writebyte_indexed(tl,3,temp);
3202     }
3203     if (opcode[i]==0x2C) { // SDL
3204       emit_writeword_indexed(th,0,temp);
3205       if(rs2[i]) emit_mov(tl,temp2);
3206     }
3207     if (opcode[i]==0x2D) { // SDR
3208       emit_writebyte_indexed(tl,3,temp);
3209       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3210     }
3211     done0=(int)out;
3212     emit_jmp(0);
3213     // 1
3214     set_jump_target(case1,(int)out);
3215     if (opcode[i]==0x2A) { // SWL
3216       // Write 3 msb into three least significant bytes
3217       if(rs2[i]) emit_rorimm(tl,8,tl);
3218       emit_writehword_indexed(tl,-1,temp);
3219       if(rs2[i]) emit_rorimm(tl,16,tl);
3220       emit_writebyte_indexed(tl,1,temp);
3221       if(rs2[i]) emit_rorimm(tl,8,tl);
3222     }
3223     if (opcode[i]==0x2E) { // SWR
3224       // Write two lsb into two most significant bytes
3225       emit_writehword_indexed(tl,1,temp);
3226     }
3227     if (opcode[i]==0x2C) { // SDL
3228       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3229       // Write 3 msb into three least significant bytes
3230       if(rs2[i]) emit_rorimm(th,8,th);
3231       emit_writehword_indexed(th,-1,temp);
3232       if(rs2[i]) emit_rorimm(th,16,th);
3233       emit_writebyte_indexed(th,1,temp);
3234       if(rs2[i]) emit_rorimm(th,8,th);
3235     }
3236     if (opcode[i]==0x2D) { // SDR
3237       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3238       // Write two lsb into two most significant bytes
3239       emit_writehword_indexed(tl,1,temp);
3240     }
3241     done1=(int)out;
3242     emit_jmp(0);
3243     // 2
3244     set_jump_target(case2,(int)out);
3245     emit_testimm(temp,1);
3246     case3=(int)out;
3247     emit_jne(0);
3248     if (opcode[i]==0x2A) { // SWL
3249       // Write two msb into two least significant bytes
3250       if(rs2[i]) emit_rorimm(tl,16,tl);
3251       emit_writehword_indexed(tl,-2,temp);
3252       if(rs2[i]) emit_rorimm(tl,16,tl);
3253     }
3254     if (opcode[i]==0x2E) { // SWR
3255       // Write 3 lsb into three most significant bytes
3256       emit_writebyte_indexed(tl,-1,temp);
3257       if(rs2[i]) emit_rorimm(tl,8,tl);
3258       emit_writehword_indexed(tl,0,temp);
3259       if(rs2[i]) emit_rorimm(tl,24,tl);
3260     }
3261     if (opcode[i]==0x2C) { // SDL
3262       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3263       // Write two msb into two least significant bytes
3264       if(rs2[i]) emit_rorimm(th,16,th);
3265       emit_writehword_indexed(th,-2,temp);
3266       if(rs2[i]) emit_rorimm(th,16,th);
3267     }
3268     if (opcode[i]==0x2D) { // SDR
3269       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3270       // Write 3 lsb into three most significant bytes
3271       emit_writebyte_indexed(tl,-1,temp);
3272       if(rs2[i]) emit_rorimm(tl,8,tl);
3273       emit_writehword_indexed(tl,0,temp);
3274       if(rs2[i]) emit_rorimm(tl,24,tl);
3275     }
3276     done2=(int)out;
3277     emit_jmp(0);
3278     // 3
3279     set_jump_target(case3,(int)out);
3280     if (opcode[i]==0x2A) { // SWL
3281       // Write msb into least significant byte
3282       if(rs2[i]) emit_rorimm(tl,24,tl);
3283       emit_writebyte_indexed(tl,-3,temp);
3284       if(rs2[i]) emit_rorimm(tl,8,tl);
3285     }
3286     if (opcode[i]==0x2E) { // SWR
3287       // Write entire word
3288       emit_writeword_indexed(tl,-3,temp);
3289     }
3290     if (opcode[i]==0x2C) { // SDL
3291       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3292       // Write msb into least significant byte
3293       if(rs2[i]) emit_rorimm(th,24,th);
3294       emit_writebyte_indexed(th,-3,temp);
3295       if(rs2[i]) emit_rorimm(th,8,th);
3296     }
3297     if (opcode[i]==0x2D) { // SDR
3298       if(rs2[i]) emit_mov(th,temp2);
3299       // Write entire word
3300       emit_writeword_indexed(tl,-3,temp);
3301     }
3302     set_jump_target(done0,(int)out);
3303     set_jump_target(done1,(int)out);
3304     set_jump_target(done2,(int)out);
3305     if (opcode[i]==0x2C) { // SDL
3306       emit_testimm(temp,4);
3307       done0=(int)out;
3308       emit_jne(0);
3309       emit_andimm(temp,~3,temp);
3310       emit_writeword_indexed(temp2,4,temp);
3311       set_jump_target(done0,(int)out);
3312     }
3313     if (opcode[i]==0x2D) { // SDR
3314       emit_testimm(temp,4);
3315       done0=(int)out;
3316       emit_jeq(0);
3317       emit_andimm(temp,~3,temp);
3318       emit_writeword_indexed(temp2,-4,temp);
3319       set_jump_target(done0,(int)out);
3320     }
3321     if(!c||!memtarget)
3322       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3323   }
3324   if(!using_tlb) {
3325     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3326     #if defined(HOST_IMM8)
3327     int ir=get_reg(i_regs->regmap,INVCP);
3328     assert(ir>=0);
3329     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3330     #else
3331     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3332     #endif
3333     jaddr2=(int)out;
3334     emit_jne(0);
3335     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3336   }
3337   /*
3338     emit_pusha();
3339     //save_regs(0x100f);
3340         emit_readword((int)&last_count,ECX);
3341         if(get_reg(i_regs->regmap,CCREG)<0)
3342           emit_loadreg(CCREG,HOST_CCREG);
3343         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3344         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3345         emit_writeword(HOST_CCREG,(int)&Count);
3346     emit_call((int)memdebug);
3347     emit_popa();
3348     //restore_regs(0x100f);
3349   /**/
3350 }
3351
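// FPU loads and stores (LWC1/LDC1/SWC1/SDC1).  If the FPU has not yet been
// used in this block (cop1_usable==0), Status bit 0x20000000 (CU1) is
// tested and an FP_STUB is queued to raise the coprocessor unusable
// exception.  Otherwise the address is generated as for an ordinary
// load/store and the value is moved via the reg_cop1_simple /
// reg_cop1_double pointers.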
3352 void c1ls_assemble(int i,struct regstat *i_regs)
3353 {
3354 #ifndef DISABLE_COP1
3355   int s,th,tl;
3356   int temp,ar;
3357   int map=-1;
3358   int offset;
3359   int c=0;
3360   int jaddr,jaddr2=0,jaddr3,type;
3361   int agr=AGEN1+(i&1);
3362   u_int hr,reglist=0;
3363   th=get_reg(i_regs->regmap,FTEMP|64);
3364   tl=get_reg(i_regs->regmap,FTEMP);
3365   s=get_reg(i_regs->regmap,rs1[i]);
3366   temp=get_reg(i_regs->regmap,agr);
3367   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3368   offset=imm[i];
3369   assert(tl>=0);
3370   assert(rs1[i]>0);
3371   assert(temp>=0);
3372   for(hr=0;hr<HOST_REGS;hr++) {
3373     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3374   }
3375   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3376   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3377   {
3378     // Loads use a temporary register which we need to save
3379     reglist|=1<<temp;
3380   }
3381   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3382     ar=temp;
3383   else // LWC1/LDC1
3384     ar=tl;
3385   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3386   //else c=(i_regs->wasconst>>s)&1;
3387   if(s>=0) c=(i_regs->wasconst>>s)&1;
3388   // Check whether cop1 is usable; branch to the FP exception stub if not
3389   if(!cop1_usable) {
3390     signed char rs=get_reg(i_regs->regmap,CSREG);
3391     assert(rs>=0);
3392     emit_testimm(rs,0x20000000);
3393     jaddr=(int)out;
3394     emit_jeq(0);
3395     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3396     cop1_usable=1;
3397   }
3398   if (opcode[i]==0x39) { // SWC1 (get float address)
3399     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3400   }
3401   if (opcode[i]==0x3D) { // SDC1 (get double address)
3402     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3403   }
3404   // Generate address + offset
3405   if(!using_tlb) {
3406     if(!c)
3407       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3408   }
3409   else
3410   {
3411     map=get_reg(i_regs->regmap,TLREG);
3412     assert(map>=0);
3413     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3414       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3415     }
3416     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3417       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3418     }
3419   }
3420   if (opcode[i]==0x39) { // SWC1 (read float)
3421     emit_readword_indexed(0,tl,tl);
3422   }
3423   if (opcode[i]==0x3D) { // SDC1 (read double)
3424     emit_readword_indexed(4,tl,th);
3425     emit_readword_indexed(0,tl,tl);
3426   }
3427   if (opcode[i]==0x31) { // LWC1 (get target address)
3428     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3429   }
3430   if (opcode[i]==0x35) { // LDC1 (get target address)
3431     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3432   }
3433   if(!using_tlb) {
3434     if(!c) {
3435       jaddr2=(int)out;
3436       emit_jno(0);
3437     }
3438     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3439       jaddr2=(int)out;
3440       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3441     }
3442     #ifdef DESTRUCTIVE_SHIFT
3443     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3444       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3445     }
3446     #endif
3447   }else{
3448     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3449       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3450     }
3451     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3452       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3453     }
3454   }
3455   if (opcode[i]==0x31) { // LWC1
3456     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3457     //gen_tlb_addr_r(ar,map);
3458     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3459     #ifdef HOST_IMM_ADDR32
3460     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3461     else
3462     #endif
3463     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3464     type=LOADW_STUB;
3465   }
3466   if (opcode[i]==0x35) { // LDC1
3467     assert(th>=0);
3468     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3469     //gen_tlb_addr_r(ar,map);
3470     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3471     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3472     #ifdef HOST_IMM_ADDR32
3473     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3474     else
3475     #endif
3476     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3477     type=LOADD_STUB;
3478   }
3479   if (opcode[i]==0x39) { // SWC1
3480     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3481     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3482     type=STOREW_STUB;
3483   }
3484   if (opcode[i]==0x3D) { // SDC1
3485     assert(th>=0);
3486     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3487     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3488     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3489     type=STORED_STUB;
3490   }
3491   if(!using_tlb) {
3492     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3493       #ifndef DESTRUCTIVE_SHIFT
3494       temp=offset||c||s<0?ar:s;
3495       #endif
3496       #if defined(HOST_IMM8)
3497       int ir=get_reg(i_regs->regmap,INVCP);
3498       assert(ir>=0);
3499       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3500       #else
3501       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3502       #endif
3503       jaddr3=(int)out;
3504       emit_jne(0);
3505       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3506     }
3507   }
3508   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3509   if (opcode[i]==0x31) { // LWC1 (write float)
3510     emit_writeword_indexed(tl,0,temp);
3511   }
3512   if (opcode[i]==0x35) { // LDC1 (write double)
3513     emit_writeword_indexed(th,4,temp);
3514     emit_writeword_indexed(tl,0,temp);
3515   }
3516   //if(opcode[i]==0x39)
3517   /*if(opcode[i]==0x39||opcode[i]==0x31)
3518   {
3519     emit_pusha();
3520         emit_readword((int)&last_count,ECX);
3521         if(get_reg(i_regs->regmap,CCREG)<0)
3522           emit_loadreg(CCREG,HOST_CCREG);
3523         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3524         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3525         emit_writeword(HOST_CCREG,(int)&Count);
3526     emit_call((int)memdebug);
3527     emit_popa();
3528   }/**/
3529 #else
3530   cop1_unusable(i, i_regs);
3531 #endif
3532 }
3533
3534 #ifndef multdiv_assemble
3535 void multdiv_assemble(int i,struct regstat *i_regs)
3536 {
3537   printf("Need multdiv_assemble for this architecture.\n");
3538   exit(1);
3539 }
3540 #endif
3541
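// Assemble register-to-register moves (MFHI/MFLO and similar): copy the source
// register pair into the destination, loading from memory when the source is
// not currently held in a host register.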
3542 void mov_assemble(int i,struct regstat *i_regs)
3543 {
3544   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3545   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3546   assert(rt1[i]>0);
3547   if(rt1[i]) {
3548     signed char sh,sl,th,tl;
3549     th=get_reg(i_regs->regmap,rt1[i]|64);
3550     tl=get_reg(i_regs->regmap,rt1[i]);
3551     //assert(tl>=0);
3552     if(tl>=0) {
3553       sh=get_reg(i_regs->regmap,rs1[i]|64);
3554       sl=get_reg(i_regs->regmap,rs1[i]);
3555       if(sl>=0) emit_mov(sl,tl);
3556       else emit_loadreg(rs1[i],tl);
3557       if(th>=0) {
3558         if(sh>=0) emit_mov(sh,th);
3559         else emit_loadreg(rs1[i]|64,th);
3560       }
3561     }
3562   }
3563 }
3564
3565 #ifndef fconv_assemble
3566 void fconv_assemble(int i,struct regstat *i_regs)
3567 {
3568   printf("Need fconv_assemble for this architecture.\n");
3569   exit(1);
3570 }
3571 #endif
3572
3573 #if 0
3574 void float_assemble(int i,struct regstat *i_regs)
3575 {
3576   printf("Need float_assemble for this architecture.\n");
3577   exit(1);
3578 }
3579 #endif
3580
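// Assemble SYSCALL: load the PC into EAX, add the accumulated cycle count, and
// jump to the HLE syscall handler.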
3581 void syscall_assemble(int i,struct regstat *i_regs)
3582 {
3583   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3584   assert(ccreg==HOST_CCREG);
3585   assert(!is_delayslot);
3586   emit_movimm(start+i*4,EAX); // Get PC
3587   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3588   emit_jmp((int)jump_syscall_hle); // XXX
3589 }
3590
3591 void hlecall_assemble(int i,struct regstat *i_regs)
3592 {
3593   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3594   assert(ccreg==HOST_CCREG);
3595   assert(!is_delayslot);
3596   emit_movimm(start+i*4+4,0); // PC of next instruction
3597   emit_movimm(source[i],1); // opcode
3598   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3599   emit_jmp((int)jump_hlecall); // XXX
3600 }
3601
3602 void ds_assemble(int i,struct regstat *i_regs)
3603 {
3604   is_delayslot=1;
3605   switch(itype[i]) {
3606     case ALU:
3607       alu_assemble(i,i_regs);break;
3608     case IMM16:
3609       imm16_assemble(i,i_regs);break;
3610     case SHIFT:
3611       shift_assemble(i,i_regs);break;
3612     case SHIFTIMM:
3613       shiftimm_assemble(i,i_regs);break;
3614     case LOAD:
3615       load_assemble(i,i_regs);break;
3616     case LOADLR:
3617       loadlr_assemble(i,i_regs);break;
3618     case STORE:
3619       store_assemble(i,i_regs);break;
3620     case STORELR:
3621       storelr_assemble(i,i_regs);break;
3622     case COP0:
3623       cop0_assemble(i,i_regs);break;
3624     case COP1:
3625       cop1_assemble(i,i_regs);break;
3626     case C1LS:
3627       c1ls_assemble(i,i_regs);break;
3628     case FCONV:
3629       fconv_assemble(i,i_regs);break;
3630     case FLOAT:
3631       float_assemble(i,i_regs);break;
3632     case FCOMP:
3633       fcomp_assemble(i,i_regs);break;
3634     case MULTDIV:
3635       multdiv_assemble(i,i_regs);break;
3636     case MOV:
3637       mov_assemble(i,i_regs);break;
3638     case SYSCALL:
3639     case HLECALL:
3640     case SPAN:
3641     case UJUMP:
3642     case RJUMP:
3643     case CJUMP:
3644     case SJUMP:
3645     case FJUMP:
3646       printf("Jump in the delay slot.  This is probably a bug.\n");
3647   }
3648   is_delayslot=0;
3649 }
3650
3651 // Is the branch target a valid internal jump?
3652 int internal_branch(uint64_t i_is32,int addr)
3653 {
3654   if(addr&1) return 0; // Indirect (register) jump
3655   if(addr>=start && addr<start+slen*4-4)
3656   {
3657     int t=(addr-start)>>2;
3658     // Delay slots are not valid branch targets
3659     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3660     // 64 -> 32 bit transition requires a recompile
3661     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3662     {
3663       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3664       else printf("optimizable: yes\n");
3665     }*/
3666     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3667     if(requires_32bit[t]&~i_is32) return 0;
3668     else return 1;
3669   }
3670   return 0;
3671 }
3672
3673 #ifndef wb_invalidate
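// Write back registers whose mapping changes between the current state and the
// new (branch/entry) state: dirty values that are leaving the register file and
// are still needed are stored to memory, then values that simply move to a
// different host register are copied without a writeback.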
3674 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3675   uint64_t u,uint64_t uu)
3676 {
3677   int hr;
3678   for(hr=0;hr<HOST_REGS;hr++) {
3679     if(hr!=EXCLUDE_REG) {
3680       if(pre[hr]!=entry[hr]) {
3681         if(pre[hr]>=0) {
3682           if((dirty>>hr)&1) {
3683             if(get_reg(entry,pre[hr])<0) {
3684               if(pre[hr]<64) {
3685                 if(!((u>>pre[hr])&1)) {
3686                   emit_storereg(pre[hr],hr);
3687                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3688                     emit_sarimm(hr,31,hr);
3689                     emit_storereg(pre[hr]|64,hr);
3690                   }
3691                 }
3692               }else{
3693                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3694                   emit_storereg(pre[hr],hr);
3695                 }
3696               }
3697             }
3698           }
3699         }
3700       }
3701     }
3702   }
3703   // Move from one register to another (no writeback)
3704   for(hr=0;hr<HOST_REGS;hr++) {
3705     if(hr!=EXCLUDE_REG) {
3706       if(pre[hr]!=entry[hr]) {
3707         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3708           int nr;
3709           if((nr=get_reg(entry,pre[hr]))>=0) {
3710             emit_mov(hr,nr);
3711           }
3712         }
3713       }
3714     }
3715   }
3716 }
3717 #endif
3718
3719 // Load the specified registers
3720 // This only loads the registers given as arguments because
3721 // we don't want to load things that will be overwritten
3722 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3723 {
3724   int hr;
3725   // Load 32-bit regs
3726   for(hr=0;hr<HOST_REGS;hr++) {
3727     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3728       if(entry[hr]!=regmap[hr]) {
3729         if(regmap[hr]==rs1||regmap[hr]==rs2)
3730         {
3731           if(regmap[hr]==0) {
3732             emit_zeroreg(hr);
3733           }
3734           else
3735           {
3736             emit_loadreg(regmap[hr],hr);
3737           }
3738         }
3739       }
3740     }
3741   }
3742   // Load 64-bit regs
3743   for(hr=0;hr<HOST_REGS;hr++) {
3744     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3745       if(entry[hr]!=regmap[hr]) {
3746         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3747         {
3748           assert(regmap[hr]!=64);
3749           if((is32>>(regmap[hr]&63))&1) {
3750             int lr=get_reg(regmap,regmap[hr]-64);
3751             if(lr>=0)
3752               emit_sarimm(lr,31,hr);
3753             else
3754               emit_loadreg(regmap[hr],hr);
3755           }
3756           else
3757           {
3758             emit_loadreg(regmap[hr],hr);
3759           }
3760         }
3761       }
3762     }
3763   }
3764 }
3765
3766 // Load registers prior to the start of a loop
3767 // so that they are not loaded within the loop
3768 static void loop_preload(signed char pre[],signed char entry[])
3769 {
3770   int hr;
3771   for(hr=0;hr<HOST_REGS;hr++) {
3772     if(hr!=EXCLUDE_REG) {
3773       if(pre[hr]!=entry[hr]) {
3774         if(entry[hr]>=0) {
3775           if(get_reg(pre,entry[hr])<0) {
3776             assem_debug("loop preload:\n");
3777             //printf("loop preload: %d\n",hr);
3778             if(entry[hr]==0) {
3779               emit_zeroreg(hr);
3780             }
3781             else if(entry[hr]<TEMPREG)
3782             {
3783               emit_loadreg(entry[hr],hr);
3784             }
3785             else if(entry[hr]-64<TEMPREG)
3786             {
3787               emit_loadreg(entry[hr],hr);
3788             }
3789           }
3790         }
3791       }
3792     }
3793   }
3794 }
3795
3796 // Generate address for load/store instruction
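// The effective address (base register + immediate) is computed into the AGEN
// temporary for this instruction, constant addresses are materialized directly,
// and the mapper constant / address for the following instruction is preloaded
// so that the work overlaps.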
3797 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3798 {
3799   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3800     int ra;
3801     int agr=AGEN1+(i&1);
3802     int mgr=MGEN1+(i&1);
3803     if(itype[i]==LOAD) {
3804       ra=get_reg(i_regs->regmap,rt1[i]);
3805       //if(rt1[i]) assert(ra>=0);
3806     }
3807     if(itype[i]==LOADLR) {
3808       ra=get_reg(i_regs->regmap,FTEMP);
3809     }
3810     if(itype[i]==STORE||itype[i]==STORELR) {
3811       ra=get_reg(i_regs->regmap,agr);
3812       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3813     }
3814     if(itype[i]==C1LS) {
3815       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3816         ra=get_reg(i_regs->regmap,FTEMP);
3817       else { // SWC1/SDC1
3818         ra=get_reg(i_regs->regmap,agr);
3819         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3820       }
3821     }
3822     int rs=get_reg(i_regs->regmap,rs1[i]);
3823     int rm=get_reg(i_regs->regmap,TLREG);
3824     if(ra>=0) {
3825       int offset=imm[i];
3826       int c=(i_regs->wasconst>>rs)&1;
3827       if(rs1[i]==0) {
3828         // Using r0 as a base address
3829         /*if(rm>=0) {
3830           if(!entry||entry[rm]!=mgr) {
3831             generate_map_const(offset,rm);
3832           } // else did it in the previous cycle
3833         }*/
3834         if(!entry||entry[ra]!=agr) {
3835           if (opcode[i]==0x22||opcode[i]==0x26) {
3836             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3837           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3838             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3839           }else{
3840             emit_movimm(offset,ra);
3841           }
3842         } // else did it in the previous cycle
3843       }
3844       else if(rs<0) {
3845         if(!entry||entry[ra]!=rs1[i])
3846           emit_loadreg(rs1[i],ra);
3847         //if(!entry||entry[ra]!=rs1[i])
3848         //  printf("poor load scheduling!\n");
3849       }
3850       else if(c) {
3851         if(rm>=0) {
3852           if(!entry||entry[rm]!=mgr) {
3853             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3854               // Stores to memory go through the mapper to detect self-modifying
3855               // code; loads don't.
3856               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3857                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3858                 generate_map_const(constmap[i][rs]+offset,rm);
3859             }else{
3860               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3861                 generate_map_const(constmap[i][rs]+offset,rm);
3862             }
3863           }
3864         }
3865         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3866           if(!entry||entry[ra]!=agr) {
3867             if (opcode[i]==0x22||opcode[i]==0x26) {
3868               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3869             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3870               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3871             }else{
3872               #ifdef HOST_IMM_ADDR32
3873               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3874                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3875               #endif
3876               emit_movimm(constmap[i][rs]+offset,ra);
3877             }
3878           } // else did it in the previous cycle
3879         } // else load_consts already did it
3880       }
3881       if(offset&&!c&&rs1[i]) {
3882         if(rs>=0) {
3883           emit_addimm(rs,offset,ra);
3884         }else{
3885           emit_addimm(ra,offset,ra);
3886         }
3887       }
3888     }
3889   }
3890   // Preload constants for next instruction
3891   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3892     int agr,ra;
3893     #ifndef HOST_IMM_ADDR32
3894     // Mapper entry
3895     agr=MGEN1+((i+1)&1);
3896     ra=get_reg(i_regs->regmap,agr);
3897     if(ra>=0) {
3898       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3899       int offset=imm[i+1];
3900       int c=(regs[i+1].wasconst>>rs)&1;
3901       if(c) {
3902         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3903           // Stores to memory go through the mapper to detect self-modifying
3904           // code; loads don't.
3905           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3906              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3907             generate_map_const(constmap[i+1][rs]+offset,ra);
3908         }else{
3909           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3910             generate_map_const(constmap[i+1][rs]+offset,ra);
3911         }
3912       }
3913       /*else if(rs1[i]==0) {
3914         generate_map_const(offset,ra);
3915       }*/
3916     }
3917     #endif
3918     // Actual address
3919     agr=AGEN1+((i+1)&1);
3920     ra=get_reg(i_regs->regmap,agr);
3921     if(ra>=0) {
3922       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3923       int offset=imm[i+1];
3924       int c=(regs[i+1].wasconst>>rs)&1;
3925       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3926         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3927           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3928         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3929           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3930         }else{
3931           #ifdef HOST_IMM_ADDR32
3932           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3933              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3934           #endif
3935           emit_movimm(constmap[i+1][rs]+offset,ra);
3936         }
3937       }
3938       else if(rs1[i+1]==0) {
3939         // Using r0 as a base address
3940         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3941           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3942         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3943           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3944         }else{
3945           emit_movimm(offset,ra);
3946         }
3947       }
3948     }
3949   }
3950 }
3951
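// Scan forward to find the final constant value a host register will hold while
// it keeps the same mapping (skipping intermediate constants that are
// immediately overwritten).  Returns 0 when the constant does not actually need
// to be loaded, so the caller can skip emitting it.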
3952 int get_final_value(int hr, int i, int *value)
3953 {
3954   int reg=regs[i].regmap[hr];
3955   while(i<slen-1) {
3956     if(regs[i+1].regmap[hr]!=reg) break;
3957     if(!((regs[i+1].isconst>>hr)&1)) break;
3958     if(bt[i+1]) break;
3959     i++;
3960   }
3961   if(i<slen-1) {
3962     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3963       *value=constmap[i][hr];
3964       return 1;
3965     }
3966     if(!bt[i+1]) {
3967       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3968         // Load in delay slot, out-of-order execution
3969         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3970         {
3971           #ifdef HOST_IMM_ADDR32
3972           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3973           #endif
3974           // Precompute load address
3975           *value=constmap[i][hr]+imm[i+2];
3976           return 1;
3977         }
3978       }
3979       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3980       {
3981         #ifdef HOST_IMM_ADDR32
3982         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3983         #endif
3984         // Precompute load address
3985         *value=constmap[i][hr]+imm[i+1];
3986         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3987         return 1;
3988       }
3989     }
3990   }
3991   *value=constmap[i][hr];
3992   //printf("c=%x\n",(int)constmap[i][hr]);
3993   if(i==slen-1) return 1;
3994   if(reg<64) {
3995     return !((unneeded_reg[i+1]>>reg)&1);
3996   }else{
3997     return !((unneeded_reg_upper[i+1]>>reg)&1);
3998   }
3999 }
4000
4001 // Load registers with known constants
4002 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4003 {
4004   int hr;
4005   // Load 32-bit regs
4006   for(hr=0;hr<HOST_REGS;hr++) {
4007     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4008       //if(entry[hr]!=regmap[hr]) {
4009       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4010         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4011           int value;
4012           if(get_final_value(hr,i,&value)) {
4013             if(value==0) {
4014               emit_zeroreg(hr);
4015             }
4016             else {
4017               emit_movimm(value,hr);
4018             }
4019           }
4020         }
4021       }
4022     }
4023   }
4024   // Load 64-bit regs
4025   for(hr=0;hr<HOST_REGS;hr++) {
4026     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4027       //if(entry[hr]!=regmap[hr]) {
4028       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4029         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4030           if((is32>>(regmap[hr]&63))&1) {
4031             int lr=get_reg(regmap,regmap[hr]-64);
4032             assert(lr>=0);
4033             emit_sarimm(lr,31,hr);
4034           }
4035           else
4036           {
4037             int value;
4038             if(get_final_value(hr,i,&value)) {
4039               if(value==0) {
4040                 emit_zeroreg(hr);
4041               }
4042               else {
4043                 emit_movimm(value,hr);
4044               }
4045             }
4046           }
4047         }
4048       }
4049     }
4050   }
4051 }
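// Like load_consts(), but materializes the current constant for every dirty
// constant-mapped register instead of scanning ahead for a final value.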
4052 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4053 {
4054   int hr;
4055   // Load 32-bit regs
4056   for(hr=0;hr<HOST_REGS;hr++) {
4057     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4058       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4059         int value=constmap[i][hr];
4060         if(value==0) {
4061           emit_zeroreg(hr);
4062         }
4063         else {
4064           emit_movimm(value,hr);
4065         }
4066       }
4067     }
4068   }
4069   // Load 64-bit regs
4070   for(hr=0;hr<HOST_REGS;hr++) {
4071     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4072       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4073         if((is32>>(regmap[hr]&63))&1) {
4074           int lr=get_reg(regmap,regmap[hr]-64);
4075           assert(lr>=0);
4076           emit_sarimm(lr,31,hr);
4077         }
4078         else
4079         {
4080           int value=constmap[i][hr];
4081           if(value==0) {
4082             emit_zeroreg(hr);
4083           }
4084           else {
4085             emit_movimm(value,hr);
4086           }
4087         }
4088       }
4089     }
4090   }
4091 }
4092
4093 // Write out all dirty registers (except cycle count)
4094 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4095 {
4096   int hr;
4097   for(hr=0;hr<HOST_REGS;hr++) {
4098     if(hr!=EXCLUDE_REG) {
4099       if(i_regmap[hr]>0) {
4100         if(i_regmap[hr]!=CCREG) {
4101           if((i_dirty>>hr)&1) {
4102             if(i_regmap[hr]<64) {
4103               emit_storereg(i_regmap[hr],hr);
4104 #ifndef FORCE32
4105               if( ((i_is32>>i_regmap[hr])&1) ) {
4106                 #ifdef DESTRUCTIVE_WRITEBACK
4107                 emit_sarimm(hr,31,hr);
4108                 emit_storereg(i_regmap[hr]|64,hr);
4109                 #else
4110                 emit_sarimm(hr,31,HOST_TEMPREG);
4111                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4112                 #endif
4113               }
4114 #endif
4115             }else{
4116               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4117                 emit_storereg(i_regmap[hr],hr);
4118               }
4119             }
4120           }
4121         }
4122       }
4123     }
4124   }
4125 }
4126 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4127 // This writes the registers not written by store_regs_bt
4128 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4129 {
4130   int hr;
4131   int t=(addr-start)>>2;
4132   for(hr=0;hr<HOST_REGS;hr++) {
4133     if(hr!=EXCLUDE_REG) {
4134       if(i_regmap[hr]>0) {
4135         if(i_regmap[hr]!=CCREG) {
4136           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4137             if((i_dirty>>hr)&1) {
4138               if(i_regmap[hr]<64) {
4139                 emit_storereg(i_regmap[hr],hr);
4140 #ifndef FORCE32
4141                 if( ((i_is32>>i_regmap[hr])&1) ) {
4142                   #ifdef DESTRUCTIVE_WRITEBACK
4143                   emit_sarimm(hr,31,hr);
4144                   emit_storereg(i_regmap[hr]|64,hr);
4145                   #else
4146                   emit_sarimm(hr,31,HOST_TEMPREG);
4147                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4148                   #endif
4149                 }
4150 #endif
4151               }else{
4152                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4153                   emit_storereg(i_regmap[hr],hr);
4154                 }
4155               }
4156             }
4157           }
4158         }
4159       }
4160     }
4161   }
4162 }
4163
4164 // Load all registers (except cycle count)
4165 void load_all_regs(signed char i_regmap[])
4166 {
4167   int hr;
4168   for(hr=0;hr<HOST_REGS;hr++) {
4169     if(hr!=EXCLUDE_REG) {
4170       if(i_regmap[hr]==0) {
4171         emit_zeroreg(hr);
4172       }
4173       else
4174       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4175       {
4176         emit_loadreg(i_regmap[hr],hr);
4177       }
4178     }
4179   }
4180 }
4181
4182 // Load registers from the current map that are also needed by the next instruction
4183 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4184 {
4185   int hr;
4186   for(hr=0;hr<HOST_REGS;hr++) {
4187     if(hr!=EXCLUDE_REG) {
4188       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4189         if(i_regmap[hr]==0) {
4190           emit_zeroreg(hr);
4191         }
4192         else
4193         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4194         {
4195           emit_loadreg(i_regmap[hr],hr);
4196         }
4197       }
4198     }
4199   }
4200 }
4201
4202 // Load all regs, storing cycle count if necessary
4203 void load_regs_entry(int t)
4204 {
4205   int hr;
4206   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4207   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4208   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4209     emit_storereg(CCREG,HOST_CCREG);
4210   }
4211   // Load 32-bit regs
4212   for(hr=0;hr<HOST_REGS;hr++) {
4213     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4214       if(regs[t].regmap_entry[hr]==0) {
4215         emit_zeroreg(hr);
4216       }
4217       else if(regs[t].regmap_entry[hr]!=CCREG)
4218       {
4219         emit_loadreg(regs[t].regmap_entry[hr],hr);
4220       }
4221     }
4222   }
4223   // Load 64-bit regs
4224   for(hr=0;hr<HOST_REGS;hr++) {
4225     if(regs[t].regmap_entry[hr]>=64) {
4226       assert(regs[t].regmap_entry[hr]!=64);
4227       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4228         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4229         if(lr<0) {
4230           emit_loadreg(regs[t].regmap_entry[hr],hr);
4231         }
4232         else
4233         {
4234           emit_sarimm(lr,31,hr);
4235         }
4236       }
4237       else
4238       {
4239         emit_loadreg(regs[t].regmap_entry[hr],hr);
4240       }
4241     }
4242   }
4243 }
4244
4245 // Store dirty registers prior to branch
4246 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4247 {
4248   if(internal_branch(i_is32,addr))
4249   {
4250     int t=(addr-start)>>2;
4251     int hr;
4252     for(hr=0;hr<HOST_REGS;hr++) {
4253       if(hr!=EXCLUDE_REG) {
4254         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4255           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4256             if((i_dirty>>hr)&1) {
4257               if(i_regmap[hr]<64) {
4258                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4259                   emit_storereg(i_regmap[hr],hr);
4260                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4261                     #ifdef DESTRUCTIVE_WRITEBACK
4262                     emit_sarimm(hr,31,hr);
4263                     emit_storereg(i_regmap[hr]|64,hr);
4264                     #else
4265                     emit_sarimm(hr,31,HOST_TEMPREG);
4266                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4267                     #endif
4268                   }
4269                 }
4270               }else{
4271                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4272                   emit_storereg(i_regmap[hr],hr);
4273                 }
4274               }
4275             }
4276           }
4277         }
4278       }
4279     }
4280   }
4281   else
4282   {
4283     // Branch out of this block, write out all dirty regs
4284     wb_dirtys(i_regmap,i_is32,i_dirty);
4285   }
4286 }
4287
4288 // Load all needed registers for branch target
4289 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4290 {
4291   //if(addr>=start && addr<(start+slen*4))
4292   if(internal_branch(i_is32,addr))
4293   {
4294     int t=(addr-start)>>2;
4295     int hr;
4296     // Store the cycle count before loading something else
4297     if(i_regmap[HOST_CCREG]!=CCREG) {
4298       assert(i_regmap[HOST_CCREG]==-1);
4299     }
4300     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4301       emit_storereg(CCREG,HOST_CCREG);
4302     }
4303     // Load 32-bit regs
4304     for(hr=0;hr<HOST_REGS;hr++) {
4305       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4306         #ifdef DESTRUCTIVE_WRITEBACK
4307         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4308         #else
4309         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4310         #endif
4311           if(regs[t].regmap_entry[hr]==0) {
4312             emit_zeroreg(hr);
4313           }
4314           else if(regs[t].regmap_entry[hr]!=CCREG)
4315           {
4316             emit_loadreg(regs[t].regmap_entry[hr],hr);
4317           }
4318         }
4319       }
4320     }
4321     // Load 64-bit regs
4322     for(hr=0;hr<HOST_REGS;hr++) {
4323       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4324         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4325           assert(regs[t].regmap_entry[hr]!=64);
4326           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4327             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4328             if(lr<0) {
4329               emit_loadreg(regs[t].regmap_entry[hr],hr);
4330             }
4331             else
4332             {
4333               emit_sarimm(lr,31,hr);
4334             }
4335           }
4336           else
4337           {
4338             emit_loadreg(regs[t].regmap_entry[hr],hr);
4339           }
4340         }
4341         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4342           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4343           assert(lr>=0);
4344           emit_sarimm(lr,31,hr);
4345         }
4346       }
4347     }
4348   }
4349 }
4350
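// Return 1 if the register state at a branch (mapping, dirty bits, 32-bit bits)
// is compatible with the recorded entry state of the target, so the branch can
// jump there directly without extra writeback or reload code.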
4351 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4352 {
4353   if(addr>=start && addr<start+slen*4-4)
4354   {
4355     int t=(addr-start)>>2;
4356     int hr;
4357     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4358     for(hr=0;hr<HOST_REGS;hr++)
4359     {
4360       if(hr!=EXCLUDE_REG)
4361       {
4362         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4363         {
4364           if(regs[t].regmap_entry[hr]!=-1)
4365           {
4366             return 0;
4367           }
4368           else 
4369           if((i_dirty>>hr)&1)
4370           {
4371             if(i_regmap[hr]<64)
4372             {
4373               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4374                 return 0;
4375             }
4376             else
4377             {
4378               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4379                 return 0;
4380             }
4381           }
4382         }
4383         else // Same register but is it 32-bit or dirty?
4384         if(i_regmap[hr]>=0)
4385         {
4386           if(!((regs[t].dirty>>hr)&1))
4387           {
4388             if((i_dirty>>hr)&1)
4389             {
4390               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4391               {
4392                 //printf("%x: dirty no match\n",addr);
4393                 return 0;
4394               }
4395             }
4396           }
4397           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4398           {
4399             //printf("%x: is32 no match\n",addr);
4400             return 0;
4401           }
4402         }
4403       }
4404     }
4405     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4406     if(requires_32bit[t]&~i_is32) return 0;
4407     // Delay slots are not valid branch targets
4408     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4409     // Delay slots require additional processing, so do not match
4410     if(is_ds[t]) return 0;
4411   }
4412   else
4413   {
4414     int hr;
4415     for(hr=0;hr<HOST_REGS;hr++)
4416     {
4417       if(hr!=EXCLUDE_REG)
4418       {
4419         if(i_regmap[hr]>=0)
4420         {
4421           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4422           {
4423             if((i_dirty>>hr)&1)
4424             {
4425               return 0;
4426             }
4427           }
4428         }
4429       }
4430     }
4431   }
4432   return 1;
4433 }
4434
4435 // Used when a branch jumps into the delay slot of another branch
4436 void ds_assemble_entry(int i)
4437 {
4438   int t=(ba[i]-start)>>2;
4439   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4440   assem_debug("Assemble delay slot at %x\n",ba[i]);
4441   assem_debug("<->\n");
4442   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4443     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4444   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4445   address_generation(t,&regs[t],regs[t].regmap_entry);
4446   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4447     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4448   cop1_usable=0;
4449   is_delayslot=0;
4450   switch(itype[t]) {
4451     case ALU:
4452       alu_assemble(t,&regs[t]);break;
4453     case IMM16:
4454       imm16_assemble(t,&regs[t]);break;
4455     case SHIFT:
4456       shift_assemble(t,&regs[t]);break;
4457     case SHIFTIMM:
4458       shiftimm_assemble(t,&regs[t]);break;
4459     case LOAD:
4460       load_assemble(t,&regs[t]);break;
4461     case LOADLR:
4462       loadlr_assemble(t,&regs[t]);break;
4463     case STORE:
4464       store_assemble(t,&regs[t]);break;
4465     case STORELR:
4466       storelr_assemble(t,&regs[t]);break;
4467     case COP0:
4468       cop0_assemble(t,&regs[t]);break;
4469     case COP1:
4470       cop1_assemble(t,&regs[t]);break;
4471     case C1LS:
4472       c1ls_assemble(t,&regs[t]);break;
4473     case FCONV:
4474       fconv_assemble(t,&regs[t]);break;
4475     case FLOAT:
4476       float_assemble(t,&regs[t]);break;
4477     case FCOMP:
4478       fcomp_assemble(t,&regs[t]);break;
4479     case MULTDIV:
4480       multdiv_assemble(t,&regs[t]);break;
4481     case MOV:
4482       mov_assemble(t,&regs[t]);break;
4483     case SYSCALL:
4484     case HLECALL:
4485     case SPAN:
4486     case UJUMP:
4487     case RJUMP:
4488     case CJUMP:
4489     case SJUMP:
4490     case FJUMP:
4491       printf("Jump in the delay slot.  This is probably a bug.\n");
4492   }
4493   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4494   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4495   if(internal_branch(regs[t].is32,ba[i]+4))
4496     assem_debug("branch: internal\n");
4497   else
4498     assem_debug("branch: external\n");
4499   assert(internal_branch(regs[t].is32,ba[i]+4));
4500   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4501   emit_jmp(0);
4502 }
4503
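// Emit the cycle-count check for a branch: update or test the cycle counter
// (ccadj[i]+2 cycles) and register a CC_STUB that calls cc_interrupt() once the
// count expires.  Simple idle loops (a branch to itself with a NOP in the delay
// slot) are detected and handled specially.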
4504 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4505 {
4506   int count;
4507   int jaddr;
4508   int idle=0;
4509   if(itype[i]==RJUMP)
4510   {
4511     *adj=0;
4512   }
4513   //if(ba[i]>=start && ba[i]<(start+slen*4))
4514   if(internal_branch(branch_regs[i].is32,ba[i]))
4515   {
4516     int t=(ba[i]-start)>>2;
4517     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4518     else *adj=ccadj[t];
4519   }
4520   else
4521   {
4522     *adj=0;
4523   }
4524   count=ccadj[i];
4525   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4526     // Idle loop
4527     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4528     idle=(int)out;
4529     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4530     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4531     jaddr=(int)out;
4532     emit_jmp(0);
4533   }
4534   else if(*adj==0||invert) {
4535     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4536     jaddr=(int)out;
4537     emit_jns(0);
4538   }
4539   else
4540   {
4541     emit_cmpimm(HOST_CCREG,-2*(count+2));
4542     jaddr=(int)out;
4543     emit_jns(0);
4544   }
4545   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4546 }
4547
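// Out-of-line handler for an expired cycle count: write back dirty registers,
// store the return PC (recomputing the branch condition when the outcome
// determines it), call cc_interrupt(), then reload the needed registers and
// jump back into the compiled code.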
4548 void do_ccstub(int n)
4549 {
4550   literal_pool(256);
4551   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4552   set_jump_target(stubs[n][1],(int)out);
4553   int i=stubs[n][4];
4554   if(stubs[n][6]==NULLDS) {
4555     // Delay slot instruction is nullified ("likely" branch)
4556     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4557   }
4558   else if(stubs[n][6]!=TAKEN) {
4559     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4560   }
4561   else {
4562     if(internal_branch(branch_regs[i].is32,ba[i]))
4563       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4564   }
4565   if(stubs[n][5]!=-1)
4566   {
4567     // Save PC as return address
4568     emit_movimm(stubs[n][5],EAX);
4569     emit_writeword(EAX,(int)&pcaddr);
4570   }
4571   else
4572   {
4573     // Return address depends on which way the branch goes
4574     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4575     {
4576       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4577       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4578       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4579       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4580       if(rs1[i]==0)
4581       {
4582         s1l=s2l;s1h=s2h;
4583         s2l=s2h=-1;
4584       }
4585       else if(rs2[i]==0)
4586       {
4587         s2l=s2h=-1;
4588       }
4589       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4590         s1h=s2h=-1;
4591       }
4592       assert(s1l>=0);
4593       #ifdef DESTRUCTIVE_WRITEBACK
4594       if(rs1[i]) {
4595         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4596           emit_loadreg(rs1[i],s1l);
4597       } 
4598       else {
4599         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4600           emit_loadreg(rs2[i],s1l);
4601       }
4602       if(s2l>=0)
4603         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4604           emit_loadreg(rs2[i],s2l);
4605       #endif
4606       int hr=0;
4607       int addr,alt,ntaddr;
4608       while(hr<HOST_REGS)
4609       {
4610         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4611            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4612            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4613         {
4614           addr=hr++;break;
4615         }
4616         hr++;
4617       }
4618       while(hr<HOST_REGS)
4619       {
4620         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4621            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4622            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4623         {
4624           alt=hr++;break;
4625         }
4626         hr++;
4627       }
4628       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4629       {
4630         while(hr<HOST_REGS)
4631         {
4632           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4633              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4634              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4635           {
4636             ntaddr=hr;break;
4637           }
4638           hr++;
4639         }
4640         assert(hr<HOST_REGS);
4641       }
4642       if((opcode[i]&0x2f)==4) // BEQ
4643       {
4644         #ifdef HAVE_CMOV_IMM
4645         if(s1h<0) {
4646           if(s2l>=0) emit_cmp(s1l,s2l);
4647           else emit_test(s1l,s1l);
4648           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4649         }
4650         else
4651         #endif
4652         {
4653           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4654           if(s1h>=0) {
4655             if(s2h>=0) emit_cmp(s1h,s2h);
4656             else emit_test(s1h,s1h);
4657             emit_cmovne_reg(alt,addr);
4658           }
4659           if(s2l>=0) emit_cmp(s1l,s2l);
4660           else emit_test(s1l,s1l);
4661           emit_cmovne_reg(alt,addr);
4662         }
4663       }
4664       if((opcode[i]&0x2f)==5) // BNE
4665       {
4666         #ifdef HAVE_CMOV_IMM
4667         if(s1h<0) {
4668           if(s2l>=0) emit_cmp(s1l,s2l);
4669           else emit_test(s1l,s1l);
4670           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4671         }
4672         else
4673         #endif
4674         {
4675           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4676           if(s1h>=0) {
4677             if(s2h>=0) emit_cmp(s1h,s2h);
4678             else emit_test(s1h,s1h);
4679             emit_cmovne_reg(alt,addr);
4680           }
4681           if(s2l>=0) emit_cmp(s1l,s2l);
4682           else emit_test(s1l,s1l);
4683           emit_cmovne_reg(alt,addr);
4684         }
4685       }
4686       if((opcode[i]&0x2f)==6) // BLEZ
4687       {
4688         //emit_movimm(ba[i],alt);
4689         //emit_movimm(start+i*4+8,addr);
4690         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4691         emit_cmpimm(s1l,1);
4692         if(s1h>=0) emit_mov(addr,ntaddr);
4693         emit_cmovl_reg(alt,addr);
4694         if(s1h>=0) {
4695           emit_test(s1h,s1h);
4696           emit_cmovne_reg(ntaddr,addr);
4697           emit_cmovs_reg(alt,addr);
4698         }
4699       }
4700       if((opcode[i]&0x2f)==7) // BGTZ
4701       {
4702         //emit_movimm(ba[i],addr);
4703         //emit_movimm(start+i*4+8,ntaddr);
4704         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4705         emit_cmpimm(s1l,1);
4706         if(s1h>=0) emit_mov(addr,alt);
4707         emit_cmovl_reg(ntaddr,addr);
4708         if(s1h>=0) {
4709           emit_test(s1h,s1h);
4710           emit_cmovne_reg(alt,addr);
4711           emit_cmovs_reg(ntaddr,addr);
4712         }
4713       }
4714       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4715       {
4716         //emit_movimm(ba[i],alt);
4717         //emit_movimm(start+i*4+8,addr);
4718         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4719         if(s1h>=0) emit_test(s1h,s1h);
4720         else emit_test(s1l,s1l);
4721         emit_cmovs_reg(alt,addr);
4722       }
4723       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4724       {
4725         //emit_movimm(ba[i],addr);
4726         //emit_movimm(start+i*4+8,alt);
4727         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4728         if(s1h>=0) emit_test(s1h,s1h);
4729         else emit_test(s1l,s1l);
4730         emit_cmovs_reg(alt,addr);
4731       }
4732       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4733         if(source[i]&0x10000) // BC1T
4734         {
4735           //emit_movimm(ba[i],alt);
4736           //emit_movimm(start+i*4+8,addr);
4737           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4738           emit_testimm(s1l,0x800000);
4739           emit_cmovne_reg(alt,addr);
4740         }
4741         else // BC1F
4742         {
4743           //emit_movimm(ba[i],addr);
4744           //emit_movimm(start+i*4+8,alt);
4745           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4746           emit_testimm(s1l,0x800000);
4747           emit_cmovne_reg(alt,addr);
4748         }
4749       }
4750       emit_writeword(addr,(int)&pcaddr);
4751     }
4752     else
4753     if(itype[i]==RJUMP)
4754     {
4755       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4756       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4757         r=get_reg(branch_regs[i].regmap,RTEMP);
4758       }
4759       emit_writeword(r,(int)&pcaddr);
4760     }
4761     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4762   }
4763   // Update cycle count
4764   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4765   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4766   emit_call((int)cc_interrupt);
4767   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4768   if(stubs[n][6]==TAKEN) {
4769     if(internal_branch(branch_regs[i].is32,ba[i]))
4770       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4771     else if(itype[i]==RJUMP) {
4772       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4773         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4774       else
4775         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4776     }
4777   }else if(stubs[n][6]==NOTTAKEN) {
4778     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4779     else load_all_regs(branch_regs[i].regmap);
4780   }else if(stubs[n][6]==NULLDS) {
4781     // Delay slot instruction is nullified ("likely" branch)
4782     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4783     else load_all_regs(regs[i].regmap);
4784   }else{
4785     load_all_regs(branch_regs[i].regmap);
4786   }
4787   emit_jmp(stubs[n][2]); // return address
4788   
4789   /* This works but uses a lot of memory...
4790   emit_readword((int)&last_count,ECX);
4791   emit_add(HOST_CCREG,ECX,EAX);
4792   emit_writeword(EAX,(int)&Count);
4793   emit_call((int)gen_interupt);
4794   emit_readword((int)&Count,HOST_CCREG);
4795   emit_readword((int)&next_interupt,EAX);
4796   emit_readword((int)&pending_exception,EBX);
4797   emit_writeword(EAX,(int)&last_count);
4798   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4799   emit_test(EBX,EBX);
4800   int jne_instr=(int)out;
4801   emit_jne(0);
4802   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4803   load_all_regs(branch_regs[i].regmap);
4804   emit_jmp(stubs[n][2]); // return address
4805   set_jump_target(jne_instr,(int)out);
4806   emit_readword((int)&pcaddr,EAX);
4807   // Call get_addr_ht instead of doing the hash table here.
4808   // This code is executed infrequently and takes up a lot of space
4809   // so smaller is better.
4810   emit_storereg(CCREG,HOST_CCREG);
4811   emit_pushreg(EAX);
4812   emit_call((int)get_addr_ht);
4813   emit_loadreg(CCREG,HOST_CCREG);
4814   emit_addimm(ESP,4,ESP);
4815   emit_jmpreg(EAX);*/
4816 }
4817
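// Record a pending direct jump so the link pass at the end of block compilation
// can patch it to point at its target block.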
4818 void add_to_linker(int addr,int target,int ext)
4819 {
4820   link_addr[linkcount][0]=addr;
4821   link_addr[linkcount][1]=target;
4822   link_addr[linkcount][2]=ext;  
4823   linkcount++;
4824 }
4825
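// Assemble J/JAL: the delay slot is assembled first, the link register (for
// JAL) is set to PC+8, dirty registers are written back for the target, the
// cycle count is checked, and the jump is either emitted inline for an internal
// target or recorded with add_to_linker().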
4826 void ujump_assemble(int i,struct regstat *i_regs)
4827 {
4828   signed char *i_regmap=i_regs->regmap;
4829   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4830   address_generation(i+1,i_regs,regs[i].regmap_entry);
4831   #ifdef REG_PREFETCH
4832   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4833   if(rt1[i]==31&&temp>=0) 
4834   {
4835     int return_address=start+i*4+8;
4836     if(get_reg(branch_regs[i].regmap,31)>0) 
4837     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4838   }
4839   #endif
4840   ds_assemble(i+1,i_regs);
4841   uint64_t bc_unneeded=branch_regs[i].u;
4842   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4843   bc_unneeded|=1|(1LL<<rt1[i]);
4844   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4845   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4846                 bc_unneeded,bc_unneeded_upper);
4847   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4848   if(rt1[i]==31) {
4849     int rt;
4850     unsigned int return_address;
4851     assert(rt1[i+1]!=31);
4852     assert(rt2[i+1]!=31);
4853     rt=get_reg(branch_regs[i].regmap,31);
4854     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4855     //assert(rt>=0);
4856     return_address=start+i*4+8;
4857     if(rt>=0) {
4858       #ifdef USE_MINI_HT
4859       if(internal_branch(branch_regs[i].is32,return_address)) {
4860         int temp=rt+1;
4861         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4862            branch_regs[i].regmap[temp]>=0)
4863         {
4864           temp=get_reg(branch_regs[i].regmap,-1);
4865         }
4866         #ifdef HOST_TEMPREG
4867         if(temp<0) temp=HOST_TEMPREG;
4868         #endif
4869         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4870         else emit_movimm(return_address,rt);
4871       }
4872       else
4873       #endif
4874       {
4875         #ifdef REG_PREFETCH
4876         if(temp>=0) 
4877         {
4878           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4879         }
4880         #endif
4881         emit_movimm(return_address,rt); // PC into link register
4882         #ifdef IMM_PREFETCH
4883         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4884         #endif
4885       }
4886     }
4887   }
4888   int cc,adj;
4889   cc=get_reg(branch_regs[i].regmap,CCREG);
4890   assert(cc==HOST_CCREG);
4891   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4892   #ifdef REG_PREFETCH
4893   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4894   #endif
4895   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4896   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4897   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4898   if(internal_branch(branch_regs[i].is32,ba[i]))
4899     assem_debug("branch: internal\n");
4900   else
4901     assem_debug("branch: external\n");
4902   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4903     ds_assemble_entry(i);
4904   }
4905   else {
4906     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4907     emit_jmp(0);
4908   }
4909 }
4910
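// Assemble JR/JALR: the target address comes from a register, so after the
// delay slot and the cycle check, control transfers through jump_vaddr_reg
// (or through the mini hash table for JR $ra when USE_MINI_HT is defined).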
4911 void rjump_assemble(int i,struct regstat *i_regs)
4912 {
4913   signed char *i_regmap=i_regs->regmap;
4914   int temp;
4915   int rs,cc,adj;
4916   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4917   assert(rs>=0);
4918   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4919     // Delay slot abuse, make a copy of the branch address register
4920     temp=get_reg(branch_regs[i].regmap,RTEMP);
4921     assert(temp>=0);
4922     assert(regs[i].regmap[temp]==RTEMP);
4923     emit_mov(rs,temp);
4924     rs=temp;
4925   }
4926   address_generation(i+1,i_regs,regs[i].regmap_entry);
4927   #ifdef REG_PREFETCH
4928   if(rt1[i]==31) 
4929   {
4930     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4931       int return_address=start+i*4+8;
4932       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4933     }
4934   }
4935   #endif
4936   #ifdef USE_MINI_HT
4937   if(rs1[i]==31) {
4938     int rh=get_reg(regs[i].regmap,RHASH);
4939     if(rh>=0) do_preload_rhash(rh);
4940   }
4941   #endif
4942   ds_assemble(i+1,i_regs);
4943   uint64_t bc_unneeded=branch_regs[i].u;
4944   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4945   bc_unneeded|=1|(1LL<<rt1[i]);
4946   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4947   bc_unneeded&=~(1LL<<rs1[i]);
4948   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4949                 bc_unneeded,bc_unneeded_upper);
4950   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4951   if(rt1[i]==31) {
4952     int rt,return_address;
4953     assert(rt1[i+1]!=31);
4954     assert(rt2[i+1]!=31);
4955     rt=get_reg(branch_regs[i].regmap,31);
4956     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4957     assert(rt>=0);
4958     return_address=start+i*4+8;
4959     #ifdef REG_PREFETCH
4960     if(temp>=0) 
4961     {
4962       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4963     }
4964     #endif
4965     emit_movimm(return_address,rt); // PC into link register
4966     #ifdef IMM_PREFETCH
4967     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4968     #endif
4969   }
4970   cc=get_reg(branch_regs[i].regmap,CCREG);
4971   assert(cc==HOST_CCREG);
4972   #ifdef USE_MINI_HT
4973   int rh=get_reg(branch_regs[i].regmap,RHASH);
4974   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4975   if(rs1[i]==31) {
4976     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4977     do_preload_rhtbl(ht);
4978     do_rhash(rs,rh);
4979   }
4980   #endif
4981   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4982   #ifdef DESTRUCTIVE_WRITEBACK
4983   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4984     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4985       emit_loadreg(rs1[i],rs);
4986     }
4987   }
4988   #endif
4989   #ifdef REG_PREFETCH
4990   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4991   #endif
4992   #ifdef USE_MINI_HT
4993   if(rs1[i]==31) {
4994     do_miniht_load(ht,rh);
4995   }
4996   #endif
4997   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4998   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4999   //assert(adj==0);
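  // Charge this block's cycles.  The cycle counter stays negative until the
  // next scheduled event, so a non-negative result here takes the CC_STUB
  // to service the event before dispatching the jump.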
5000   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5001   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5002   emit_jns(0);
5003   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5004   #ifdef USE_MINI_HT
5005   if(rs1[i]==31) {
5006     do_miniht_jump(rs,rh,ht);
5007   }
5008   else
5009   #endif
5010   {
5011     //if(rs!=EAX) emit_mov(rs,EAX);
5012     //emit_jmp((int)jump_vaddr_eax);
5013     emit_jmp(jump_vaddr_reg[rs]);
5014   }
5015   /* Check hash table
5016   temp=!rs;
5017   emit_mov(rs,temp);
5018   emit_shrimm(rs,16,rs);
5019   emit_xor(temp,rs,rs);
5020   emit_movzwl_reg(rs,rs);
5021   emit_shlimm(rs,4,rs);
5022   emit_cmpmem_indexed((int)hash_table,rs,temp);
5023   emit_jne((int)out+14);
5024   emit_readword_indexed((int)hash_table+4,rs,rs);
5025   emit_jmpreg(rs);
5026   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5027   emit_addimm_no_flags(8,rs);
5028   emit_jeq((int)out-17);
5029   // No hit on hash table, call compiler
5030   emit_pushreg(temp);
5031 //DEBUG >
5032 #ifdef DEBUG_CYCLE_COUNT
5033   emit_readword((int)&last_count,ECX);
5034   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5035   emit_readword((int)&next_interupt,ECX);
5036   emit_writeword(HOST_CCREG,(int)&Count);
5037   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5038   emit_writeword(ECX,(int)&last_count);
5039 #endif
5040 //DEBUG <
5041   emit_storereg(CCREG,HOST_CCREG);
5042   emit_call((int)get_addr);
5043   emit_loadreg(CCREG,HOST_CCREG);
5044   emit_addimm(ESP,4,ESP);
5045   emit_jmpreg(EAX);*/
5046   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5047   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5048   #endif
5049 }
5050
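// Assemble a conditional branch comparing two registers (BEQ/BNE/BLEZ/BGTZ
// and their "likely" forms).  Depending on delay slot dependencies this
// either assembles the delay slot first (out of order) or tests the
// condition first (in order).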
5051 void cjump_assemble(int i,struct regstat *i_regs)
5052 {
5053   signed char *i_regmap=i_regs->regmap;
5054   int cc;
5055   int match;
5056   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5057   assem_debug("match=%d\n",match);
5058   int s1h,s1l,s2h,s2l;
5059   int prev_cop1_usable=cop1_usable;
5060   int unconditional=0,nop=0;
5061   int only32=0;
5062   int ooo=1;
5063   int invert=0;
5064   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5065   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5066   if(likely[i]) ooo=0;
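  // If the target block expects a different register allocation, invert the
  // test: the inverted branch skips over writeback/reload code emitted
  // inline for the taken path.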
5067   if(!match) invert=1;
5068   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5069   if(i>(ba[i]-start)>>2) invert=1;
5070   #endif
5071     
5072   if(ooo)
5073     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5074        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5075   {
5076     // Write-after-read dependency prevents out of order execution
5077     // First test branch condition, then execute delay slot, then branch
5078     ooo=0;
5079   }
5080
5081   if(ooo) {
5082     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5083     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5084     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5085     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5086   }
5087   else {
5088     s1l=get_reg(i_regmap,rs1[i]);
5089     s1h=get_reg(i_regmap,rs1[i]|64);
5090     s2l=get_reg(i_regmap,rs2[i]);
5091     s2h=get_reg(i_regmap,rs2[i]|64);
5092   }
5093   if(rs1[i]==0&&rs2[i]==0)
5094   {
5095     if(opcode[i]&1) nop=1;
5096     else unconditional=1;
5097     //assert(opcode[i]!=5);
5098     //assert(opcode[i]!=7);
5099     //assert(opcode[i]!=0x15);
5100     //assert(opcode[i]!=0x17);
5101   }
5102   else if(rs1[i]==0)
5103   {
5104     s1l=s2l;s1h=s2h;
5105     s2l=s2h=-1;
5106     only32=(regs[i].was32>>rs2[i])&1;
5107   }
5108   else if(rs2[i]==0)
5109   {
5110     s2l=s2h=-1;
5111     only32=(regs[i].was32>>rs1[i])&1;
5112   }
5113   else {
5114     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5115   }
5116
5117   if(ooo) {
5118     // Out of order execution (delay slot first)
5119     //printf("OOOE\n");
5120     address_generation(i+1,i_regs,regs[i].regmap_entry);
5121     ds_assemble(i+1,i_regs);
5122     int adj;
5123     uint64_t bc_unneeded=branch_regs[i].u;
5124     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5125     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5126     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5127     bc_unneeded|=1;
5128     bc_unneeded_upper|=1;
5129     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5130                   bc_unneeded,bc_unneeded_upper);
5131     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5132     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5133     cc=get_reg(branch_regs[i].regmap,CCREG);
5134     assert(cc==HOST_CCREG);
5135     if(unconditional) 
5136       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5137     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5138     //assem_debug("cycle count (adj)\n");
5139     if(unconditional) {
5140       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5141       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5142         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5143         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5144         if(internal)
5145           assem_debug("branch: internal\n");
5146         else
5147           assem_debug("branch: external\n");
5148         if(internal&&is_ds[(ba[i]-start)>>2]) {
5149           ds_assemble_entry(i);
5150         }
5151         else {
5152           add_to_linker((int)out,ba[i],internal);
5153           emit_jmp(0);
5154         }
5155         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5156         if(((u_int)out)&7) emit_addnop(0);
5157         #endif
5158       }
5159     }
5160     else if(nop) {
5161       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5162       int jaddr=(int)out;
5163       emit_jns(0);
5164       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5165     }
5166     else {
5167       int taken=0,nottaken=0,nottaken1=0;
5168       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5169       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5170       if(!only32)
5171       {
5172         assert(s1h>=0);
5173         if(opcode[i]==4) // BEQ
5174         {
5175           if(s2h>=0) emit_cmp(s1h,s2h);
5176           else emit_test(s1h,s1h);
5177           nottaken1=(int)out;
5178           emit_jne(1);
5179         }
5180         if(opcode[i]==5) // BNE
5181         {
5182           if(s2h>=0) emit_cmp(s1h,s2h);
5183           else emit_test(s1h,s1h);
5184           if(invert) taken=(int)out;
5185           else add_to_linker((int)out,ba[i],internal);
5186           emit_jne(0);
5187         }
5188         if(opcode[i]==6) // BLEZ
5189         {
5190           emit_test(s1h,s1h);
5191           if(invert) taken=(int)out;
5192           else add_to_linker((int)out,ba[i],internal);
5193           emit_js(0);
5194           nottaken1=(int)out;
5195           emit_jne(1);
5196         }
5197         if(opcode[i]==7) // BGTZ
5198         {
5199           emit_test(s1h,s1h);
5200           nottaken1=(int)out;
5201           emit_js(1);
5202           if(invert) taken=(int)out;
5203           else add_to_linker((int)out,ba[i],internal);
5204           emit_jne(0);
5205         }
5206       } // if(!only32)
5207           
5208       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5209       assert(s1l>=0);
5210       if(opcode[i]==4) // BEQ
5211       {
5212         if(s2l>=0) emit_cmp(s1l,s2l);
5213         else emit_test(s1l,s1l);
5214         if(invert){
5215           nottaken=(int)out;
5216           emit_jne(1);
5217         }else{
5218           add_to_linker((int)out,ba[i],internal);
5219           emit_jeq(0);
5220         }
5221       }
5222       if(opcode[i]==5) // BNE
5223       {
5224         if(s2l>=0) emit_cmp(s1l,s2l);
5225         else emit_test(s1l,s1l);
5226         if(invert){
5227           nottaken=(int)out;
5228           emit_jeq(1);
5229         }else{
5230           add_to_linker((int)out,ba[i],internal);
5231           emit_jne(0);
5232         }
5233       }
5234       if(opcode[i]==6) // BLEZ
5235       {
5236         emit_cmpimm(s1l,1);
5237         if(invert){
5238           nottaken=(int)out;
5239           emit_jge(1);
5240         }else{
5241           add_to_linker((int)out,ba[i],internal);
5242           emit_jl(0);
5243         }
5244       }
5245       if(opcode[i]==7) // BGTZ
5246       {
5247         emit_cmpimm(s1l,1);
5248         if(invert){
5249           nottaken=(int)out;
5250           emit_jl(1);
5251         }else{
5252           add_to_linker((int)out,ba[i],internal);
5253           emit_jge(0);
5254         }
5255       }
5256       if(invert) {
5257         if(taken) set_jump_target(taken,(int)out);
5258         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5259         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5260           if(adj) {
5261             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5262             add_to_linker((int)out,ba[i],internal);
5263           }else{
5264             emit_addnop(13);
5265             add_to_linker((int)out,ba[i],internal*2);
5266           }
5267           emit_jmp(0);
5268         }else
5269         #endif
5270         {
5271           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5272           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5273           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5274           if(internal)
5275             assem_debug("branch: internal\n");
5276           else
5277             assem_debug("branch: external\n");
5278           if(internal&&is_ds[(ba[i]-start)>>2]) {
5279             ds_assemble_entry(i);
5280           }
5281           else {
5282             add_to_linker((int)out,ba[i],internal);
5283             emit_jmp(0);
5284           }
5285         }
5286         set_jump_target(nottaken,(int)out);
5287       }
5288
5289       if(nottaken1) set_jump_target(nottaken1,(int)out);
5290       if(adj) {
5291         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5292       }
5293     } // (!unconditional)
5294   } // if(ooo)
5295   else
5296   {
5297     // In-order execution (branch first)
5298     //if(likely[i]) printf("IOL\n");
5299     //else
5300     //printf("IOE\n");
5301     int taken=0,nottaken=0,nottaken1=0;
5302     if(!unconditional&&!nop) {
5303       if(!only32)
5304       {
5305         assert(s1h>=0);
5306         if((opcode[i]&0x2f)==4) // BEQ
5307         {
5308           if(s2h>=0) emit_cmp(s1h,s2h);
5309           else emit_test(s1h,s1h);
5310           nottaken1=(int)out;
5311           emit_jne(2);
5312         }
5313         if((opcode[i]&0x2f)==5) // BNE
5314         {
5315           if(s2h>=0) emit_cmp(s1h,s2h);
5316           else emit_test(s1h,s1h);
5317           taken=(int)out;
5318           emit_jne(1);
5319         }
5320         if((opcode[i]&0x2f)==6) // BLEZ
5321         {
5322           emit_test(s1h,s1h);
5323           taken=(int)out;
5324           emit_js(1);
5325           nottaken1=(int)out;
5326           emit_jne(2);
5327         }
5328         if((opcode[i]&0x2f)==7) // BGTZ
5329         {
5330           emit_test(s1h,s1h);
5331           nottaken1=(int)out;
5332           emit_js(2);
5333           taken=(int)out;
5334           emit_jne(1);
5335         }
5336       } // if(!only32)
5337           
5338       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5339       assert(s1l>=0);
5340       if((opcode[i]&0x2f)==4) // BEQ
5341       {
5342         if(s2l>=0) emit_cmp(s1l,s2l);
5343         else emit_test(s1l,s1l);
5344         nottaken=(int)out;
5345         emit_jne(2);
5346       }
5347       if((opcode[i]&0x2f)==5) // BNE
5348       {
5349         if(s2l>=0) emit_cmp(s1l,s2l);
5350         else emit_test(s1l,s1l);
5351         nottaken=(int)out;
5352         emit_jeq(2);
5353       }
5354       if((opcode[i]&0x2f)==6) // BLEZ
5355       {
5356         emit_cmpimm(s1l,1);
5357         nottaken=(int)out;
5358         emit_jge(2);
5359       }
5360       if((opcode[i]&0x2f)==7) // BGTZ
5361       {
5362         emit_cmpimm(s1l,1);
5363         nottaken=(int)out;
5364         emit_jl(2);
5365       }
5366     } // if(!unconditional)
5367     int adj;
5368     uint64_t ds_unneeded=branch_regs[i].u;
5369     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5370     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5371     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5372     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5373     ds_unneeded|=1;
5374     ds_unneeded_upper|=1;
5375     // branch taken
5376     if(!nop) {
5377       if(taken) set_jump_target(taken,(int)out);
5378       assem_debug("1:\n");
5379       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5380                     ds_unneeded,ds_unneeded_upper);
5381       // load regs
5382       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5383       address_generation(i+1,&branch_regs[i],0);
5384       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5385       ds_assemble(i+1,&branch_regs[i]);
5386       cc=get_reg(branch_regs[i].regmap,CCREG);
5387       if(cc==-1) {
5388         emit_loadreg(CCREG,cc=HOST_CCREG);
5389         // CHECK: Is the following instruction (fall thru) allocated ok?
5390       }
5391       assert(cc==HOST_CCREG);
5392       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5393       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5394       assem_debug("cycle count (adj)\n");
5395       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5396       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5397       if(internal)
5398         assem_debug("branch: internal\n");
5399       else
5400         assem_debug("branch: external\n");
5401       if(internal&&is_ds[(ba[i]-start)>>2]) {
5402         ds_assemble_entry(i);
5403       }
5404       else {
5405         add_to_linker((int)out,ba[i],internal);
5406         emit_jmp(0);
5407       }
5408     }
5409     // branch not taken
5410     cop1_usable=prev_cop1_usable;
5411     if(!unconditional) {
5412       if(nottaken1) set_jump_target(nottaken1,(int)out);
5413       set_jump_target(nottaken,(int)out);
5414       assem_debug("2:\n");
5415       if(!likely[i]) {
5416         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5417                       ds_unneeded,ds_unneeded_upper);
5418         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5419         address_generation(i+1,&branch_regs[i],0);
5420         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5421         ds_assemble(i+1,&branch_regs[i]);
5422       }
5423       cc=get_reg(branch_regs[i].regmap,CCREG);
5424       if(cc==-1&&!likely[i]) {
5425         // Cycle count isn't in a register, temporarily load it then write it out
5426         emit_loadreg(CCREG,HOST_CCREG);
5427         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5428         int jaddr=(int)out;
5429         emit_jns(0);
5430         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5431         emit_storereg(CCREG,HOST_CCREG);
5432       }
5433       else{
5434         cc=get_reg(i_regmap,CCREG);
5435         assert(cc==HOST_CCREG);
5436         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5437         int jaddr=(int)out;
5438         emit_jns(0);
5439         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5440       }
5441     }
5442   }
5443 }
5444
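// Assemble a REGIMM branch comparing one register against zero (BLTZ/BGEZ,
// their "likely" forms, and BLTZAL/BGEZAL which also write r31).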
5445 void sjump_assemble(int i,struct regstat *i_regs)
5446 {
5447   signed char *i_regmap=i_regs->regmap;
5448   int cc;
5449   int match;
5450   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5451   assem_debug("smatch=%d\n",match);
5452   int s1h,s1l;
5453   int prev_cop1_usable=cop1_usable;
5454   int unconditional=0,nevertaken=0;
5455   int only32=0;
5456   int ooo=1;
5457   int invert=0;
5458   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5459   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5460   if(likely[i]) ooo=0;
5461   if(!match) invert=1;
5462   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5463   if(i>(ba[i]-start)>>2) invert=1;
5464   #endif
5465
5466   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5467   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5468
5469   if(ooo)
5470     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5471   {
5472     // Write-after-read dependency prevents out of order execution
5473     // First test branch condition, then execute delay slot, then branch
5474     ooo=0;
5475   }
5476   // TODO: Conditional branches w/link must execute in-order so that
5477   // condition test and write to r31 occur before cycle count test
5478
5479   if(ooo) {
5480     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5481     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5482   }
5483   else {
5484     s1l=get_reg(i_regmap,rs1[i]);
5485     s1h=get_reg(i_regmap,rs1[i]|64);
5486   }
5487   if(rs1[i]==0)
5488   {
5489     if(opcode2[i]&1) unconditional=1;
5490     else nevertaken=1;
5491     // These are never taken (r0 is never less than zero)
5492     //assert(opcode2[i]!=0);
5493     //assert(opcode2[i]!=2);
5494     //assert(opcode2[i]!=0x10);
5495     //assert(opcode2[i]!=0x12);
5496   }
5497   else {
5498     only32=(regs[i].was32>>rs1[i])&1;
5499   }
5500
5501   if(ooo) {
5502     // Out of order execution (delay slot first)
5503     //printf("OOOE\n");
5504     address_generation(i+1,i_regs,regs[i].regmap_entry);
5505     ds_assemble(i+1,i_regs);
5506     int adj;
5507     uint64_t bc_unneeded=branch_regs[i].u;
5508     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5509     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5510     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5511     bc_unneeded|=1;
5512     bc_unneeded_upper|=1;
5513     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5514                   bc_unneeded,bc_unneeded_upper);
5515     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5516     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5517     if(rt1[i]==31) {
5518       int rt,return_address;
5519       assert(rt1[i+1]!=31);
5520       assert(rt2[i+1]!=31);
5521       rt=get_reg(branch_regs[i].regmap,31);
5522       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5523       if(rt>=0) {
5524         // Save the PC even if the branch is not taken
5525         return_address=start+i*4+8;
5526         emit_movimm(return_address,rt); // PC into link register
5527         #ifdef IMM_PREFETCH
5528         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5529         #endif
5530       }
5531     }
5532     cc=get_reg(branch_regs[i].regmap,CCREG);
5533     assert(cc==HOST_CCREG);
5534     if(unconditional) 
5535       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5536     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5537     assem_debug("cycle count (adj)\n");
5538     if(unconditional) {
5539       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5540       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5541         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5542         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5543         if(internal)
5544           assem_debug("branch: internal\n");
5545         else
5546           assem_debug("branch: external\n");
5547         if(internal&&is_ds[(ba[i]-start)>>2]) {
5548           ds_assemble_entry(i);
5549         }
5550         else {
5551           add_to_linker((int)out,ba[i],internal);
5552           emit_jmp(0);
5553         }
5554         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5555         if(((u_int)out)&7) emit_addnop(0);
5556         #endif
5557       }
5558     }
5559     else if(nevertaken) {
5560       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5561       int jaddr=(int)out;
5562       emit_jns(0);
5563       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5564     }
5565     else {
5566       int nottaken=0;
5567       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5568       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5569       if(!only32)
5570       {
5571         assert(s1h>=0);
5572         if(opcode2[i]==0) // BLTZ
5573         {
5574           emit_test(s1h,s1h);
5575           if(invert){
5576             nottaken=(int)out;
5577             emit_jns(1);
5578           }else{
5579             add_to_linker((int)out,ba[i],internal);
5580             emit_js(0);
5581           }
5582         }
5583         if(opcode2[i]==1) // BGEZ
5584         {
5585           emit_test(s1h,s1h);
5586           if(invert){
5587             nottaken=(int)out;
5588             emit_js(1);
5589           }else{
5590             add_to_linker((int)out,ba[i],internal);
5591             emit_jns(0);
5592           }
5593         }
5594       } // if(!only32)
5595       else
5596       {
5597         assert(s1l>=0);
5598         if(opcode2[i]==0) // BLTZ
5599         {
5600           emit_test(s1l,s1l);
5601           if(invert){
5602             nottaken=(int)out;
5603             emit_jns(1);
5604           }else{
5605             add_to_linker((int)out,ba[i],internal);
5606             emit_js(0);
5607           }
5608         }
5609         if(opcode2[i]==1) // BGEZ
5610         {
5611           emit_test(s1l,s1l);
5612           if(invert){
5613             nottaken=(int)out;
5614             emit_js(1);
5615           }else{
5616             add_to_linker((int)out,ba[i],internal);
5617             emit_jns(0);
5618           }
5619         }
5620       } // if(!only32)
5621           
5622       if(invert) {
5623         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5624         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5625           if(adj) {
5626             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5627             add_to_linker((int)out,ba[i],internal);
5628           }else{
5629             emit_addnop(13);
5630             add_to_linker((int)out,ba[i],internal*2);
5631           }
5632           emit_jmp(0);
5633         }else
5634         #endif
5635         {
5636           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5637           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5638           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5639           if(internal)
5640             assem_debug("branch: internal\n");
5641           else
5642             assem_debug("branch: external\n");
5643           if(internal&&is_ds[(ba[i]-start)>>2]) {
5644             ds_assemble_entry(i);
5645           }
5646           else {
5647             add_to_linker((int)out,ba[i],internal);
5648             emit_jmp(0);
5649           }
5650         }
5651         set_jump_target(nottaken,(int)out);
5652       }
5653
5654       if(adj) {
5655         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5656       }
5657     } // (!unconditional)
5658   } // if(ooo)
5659   else
5660   {
5661     // In-order execution (branch first)
5662     //printf("IOE\n");
5663     int nottaken=0;
5664     if(!unconditional) {
5665       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5666       if(!only32)
5667       {
5668         assert(s1h>=0);
5669         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5670         {
5671           emit_test(s1h,s1h);
5672           nottaken=(int)out;
5673           emit_jns(1);
5674         }
5675         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5676         {
5677           emit_test(s1h,s1h);
5678           nottaken=(int)out;
5679           emit_js(1);
5680         }
5681       } // if(!only32)
5682       else
5683       {
5684         assert(s1l>=0);
5685         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5686         {
5687           emit_test(s1l,s1l);
5688           nottaken=(int)out;
5689           emit_jns(1);
5690         }
5691         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5692         {
5693           emit_test(s1l,s1l);
5694           nottaken=(int)out;
5695           emit_js(1);
5696         }
5697       }
5698     } // if(!unconditional)
5699     int adj;
5700     uint64_t ds_unneeded=branch_regs[i].u;
5701     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5702     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5703     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5704     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5705     ds_unneeded|=1;
5706     ds_unneeded_upper|=1;
5707     // branch taken
5708     if(!nevertaken) {
5709       //assem_debug("1:\n");
5710       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5711                     ds_unneeded,ds_unneeded_upper);
5712       // load regs
5713       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5714       address_generation(i+1,&branch_regs[i],0);
5715       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5716       ds_assemble(i+1,&branch_regs[i]);
5717       cc=get_reg(branch_regs[i].regmap,CCREG);
5718       if(cc==-1) {
5719         emit_loadreg(CCREG,cc=HOST_CCREG);
5720         // CHECK: Is the following instruction (fall thru) allocated ok?
5721       }
5722       assert(cc==HOST_CCREG);
5723       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5724       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5725       assem_debug("cycle count (adj)\n");
5726       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5727       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5728       if(internal)
5729         assem_debug("branch: internal\n");
5730       else
5731         assem_debug("branch: external\n");
5732       if(internal&&is_ds[(ba[i]-start)>>2]) {
5733         ds_assemble_entry(i);
5734       }
5735       else {
5736         add_to_linker((int)out,ba[i],internal);
5737         emit_jmp(0);
5738       }
5739     }
5740     // branch not taken
5741     cop1_usable=prev_cop1_usable;
5742     if(!unconditional) {
5743       set_jump_target(nottaken,(int)out);
5744       assem_debug("1:\n");
5745       if(!likely[i]) {
5746         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5747                       ds_unneeded,ds_unneeded_upper);
5748         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5749         address_generation(i+1,&branch_regs[i],0);
5750         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5751         ds_assemble(i+1,&branch_regs[i]);
5752       }
5753       cc=get_reg(branch_regs[i].regmap,CCREG);
5754       if(cc==-1&&!likely[i]) {
5755         // Cycle count isn't in a register, temporarily load it then write it out
5756         emit_loadreg(CCREG,HOST_CCREG);
5757         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5758         int jaddr=(int)out;
5759         emit_jns(0);
5760         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5761         emit_storereg(CCREG,HOST_CCREG);
5762       }
5763       else{
5764         cc=get_reg(i_regmap,CCREG);
5765         assert(cc==HOST_CCREG);
5766         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5767         int jaddr=(int)out;
5768         emit_jns(0);
5769         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5770       }
5771     }
5772   }
5773 }
5774
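// Assemble a COP1 condition branch (BC1F/BC1T and their "likely" forms),
// emitting a coprocessor-unusable check first if COP1 isn't enabled yet.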
5775 void fjump_assemble(int i,struct regstat *i_regs)
5776 {
5777   signed char *i_regmap=i_regs->regmap;
5778   int cc;
5779   int match;
5780   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5781   assem_debug("fmatch=%d\n",match);
5782   int fs,cs;
5783   int eaddr;
5784   int ooo=1;
5785   int invert=0;
5786   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5787   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5788   if(likely[i]) ooo=0;
5789   if(!match) invert=1;
5790   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5791   if(i>(ba[i]-start)>>2) invert=1;
5792   #endif
5793
5794   if(ooo)
5795     if(itype[i+1]==FCOMP)
5796   {
5797     // Write-after-read dependency prevents out of order execution
5798     // First test branch condition, then execute delay slot, then branch
5799     ooo=0;
5800   }
5801
5802   if(ooo) {
5803     fs=get_reg(branch_regs[i].regmap,FSREG);
5804     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5805   }
5806   else {
5807     fs=get_reg(i_regmap,FSREG);
5808   }
5809
5810   // Check cop1 unusable
5811   if(!cop1_usable) {
5812     cs=get_reg(i_regmap,CSREG);
5813     assert(cs>=0);
5814     emit_testimm(cs,0x20000000);
5815     eaddr=(int)out;
5816     emit_jeq(0);
5817     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5818     cop1_usable=1;
5819   }
5820
5821   if(ooo) {
5822     // Out of order execution (delay slot first)
5823     //printf("OOOE\n");
5824     ds_assemble(i+1,i_regs);
5825     int adj;
5826     uint64_t bc_unneeded=branch_regs[i].u;
5827     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5828     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5829     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5830     bc_unneeded|=1;
5831     bc_unneeded_upper|=1;
5832     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5833                   bc_unneeded,bc_unneeded_upper);
5834     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5835     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5836     cc=get_reg(branch_regs[i].regmap,CCREG);
5837     assert(cc==HOST_CCREG);
5838     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5839     assem_debug("cycle count (adj)\n");
5840     if(1) {
5841       int nottaken=0;
5842       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5843       if(1) {
5844         assert(fs>=0);
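        // Bit 23 of the FP status word (FSREG) is the compare condition flag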
5845         emit_testimm(fs,0x800000);
5846         if(source[i]&0x10000) // BC1T
5847         {
5848           if(invert){
5849             nottaken=(int)out;
5850             emit_jeq(1);
5851           }else{
5852             add_to_linker((int)out,ba[i],internal);
5853             emit_jne(0);
5854           }
5855         }
5856         else // BC1F
5857           if(invert){
5858             nottaken=(int)out;
5859             emit_jne(1);
5860           }else{
5861             add_to_linker((int)out,ba[i],internal);
5862             emit_jeq(0);
5863           }
5866       }
5867           
5868       if(invert) {
5869         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5870         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5871         else if(match) emit_addnop(13);
5872         #endif
5873         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5874         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5875         if(internal)
5876           assem_debug("branch: internal\n");
5877         else
5878           assem_debug("branch: external\n");
5879         if(internal&&is_ds[(ba[i]-start)>>2]) {
5880           ds_assemble_entry(i);
5881         }
5882         else {
5883           add_to_linker((int)out,ba[i],internal);
5884           emit_jmp(0);
5885         }
5886         set_jump_target(nottaken,(int)out);
5887       }
5888
5889       if(adj) {
5890         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5891       }
5892     } // (!unconditional)
5893   } // if(ooo)
5894   else
5895   {
5896     // In-order execution (branch first)
5897     //printf("IOE\n");
5898     int nottaken=0;
5899     if(1) {
5900       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5901       if(1) {
5902         assert(fs>=0);
5903         emit_testimm(fs,0x800000);
5904         if(source[i]&0x10000) // BC1T
5905         {
5906           nottaken=(int)out;
5907           emit_jeq(1);
5908         }
5909         else // BC1F
5910         {
5911           nottaken=(int)out;
5912           emit_jne(1);
5913         }
5914       }
5915     } // if(!unconditional)
5916     int adj;
5917     uint64_t ds_unneeded=branch_regs[i].u;
5918     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5919     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5920     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5921     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5922     ds_unneeded|=1;
5923     ds_unneeded_upper|=1;
5924     // branch taken
5925     //assem_debug("1:\n");
5926     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5927                   ds_unneeded,ds_unneeded_upper);
5928     // load regs
5929     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5930     address_generation(i+1,&branch_regs[i],0);
5931     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5932     ds_assemble(i+1,&branch_regs[i]);
5933     cc=get_reg(branch_regs[i].regmap,CCREG);
5934     if(cc==-1) {
5935       emit_loadreg(CCREG,cc=HOST_CCREG);
5936       // CHECK: Is the following instruction (fall thru) allocated ok?
5937     }
5938     assert(cc==HOST_CCREG);
5939     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5940     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5941     assem_debug("cycle count (adj)\n");
5942     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5943     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5944     if(internal)
5945       assem_debug("branch: internal\n");
5946     else
5947       assem_debug("branch: external\n");
5948     if(internal&&is_ds[(ba[i]-start)>>2]) {
5949       ds_assemble_entry(i);
5950     }
5951     else {
5952       add_to_linker((int)out,ba[i],internal);
5953       emit_jmp(0);
5954     }
5955
5956     // branch not taken
5957     if(1) { // <- FIXME (don't need this)
5958       set_jump_target(nottaken,(int)out);
5959       assem_debug("1:\n");
5960       if(!likely[i]) {
5961         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5962                       ds_unneeded,ds_unneeded_upper);
5963         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5964         address_generation(i+1,&branch_regs[i],0);
5965         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5966         ds_assemble(i+1,&branch_regs[i]);
5967       }
5968       cc=get_reg(branch_regs[i].regmap,CCREG);
5969       if(cc==-1&&!likely[i]) {
5970         // Cycle count isn't in a register, temporarily load it then write it out
5971         emit_loadreg(CCREG,HOST_CCREG);
5972         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5973         int jaddr=(int)out;
5974         emit_jns(0);
5975         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5976         emit_storereg(CCREG,HOST_CCREG);
5977       }
5978       else{
5979         cc=get_reg(i_regmap,CCREG);
5980         assert(cc==HOST_CCREG);
5981         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5982         int jaddr=(int)out;
5983         emit_jns(0);
5984         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5985       }
5986     }
5987   }
5988 }
5989
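// Assemble a branch whose delay slot lies on the next virtual page.  The
// target address is computed at run time into HOST_BTREG and the delay slot
// is assembled as the entry point of the next block (see pagespan_ds below).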
5990 static void pagespan_assemble(int i,struct regstat *i_regs)
5991 {
5992   int s1l=get_reg(i_regs->regmap,rs1[i]);
5993   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5994   int s2l=get_reg(i_regs->regmap,rs2[i]);
5995   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5996   void *nt_branch=NULL;
5997   int taken=0;
5998   int nottaken=0;
5999   int unconditional=0;
6000   if(rs1[i]==0)
6001   {
6002     s1l=s2l;s1h=s2h;
6003     s2l=s2h=-1;
6004   }
6005   else if(rs2[i]==0)
6006   {
6007     s2l=s2h=-1;
6008   }
6009   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6010     s1h=s2h=-1;
6011   }
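  // Pick scratch host registers that don't hold the cycle count or the branch
  // source registers; they receive the taken and not-taken target addresses
  // (addr, alt, and ntaddr for BLEZ/BGTZ).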
6012   int hr=0;
6013   int addr,alt,ntaddr;
6014   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6015   else {
6016     while(hr<HOST_REGS)
6017     {
6018       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6019          (i_regs->regmap[hr]&63)!=rs1[i] &&
6020          (i_regs->regmap[hr]&63)!=rs2[i] )
6021       {
6022         addr=hr++;break;
6023       }
6024       hr++;
6025     }
6026   }
6027   while(hr<HOST_REGS)
6028   {
6029     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6030        (i_regs->regmap[hr]&63)!=rs1[i] &&
6031        (i_regs->regmap[hr]&63)!=rs2[i] )
6032     {
6033       alt=hr++;break;
6034     }
6035     hr++;
6036   }
6037   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6038   {
6039     while(hr<HOST_REGS)
6040     {
6041       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6042          (i_regs->regmap[hr]&63)!=rs1[i] &&
6043          (i_regs->regmap[hr]&63)!=rs2[i] )
6044       {
6045         ntaddr=hr;break;
6046       }
6047       hr++;
6048     }
6049   }
6050   assert(hr<HOST_REGS);
6051   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6052     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6053   }
6054   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6055   if(opcode[i]==2) // J
6056   {
6057     unconditional=1;
6058   }
6059   if(opcode[i]==3) // JAL
6060   {
6061     // TODO: mini_ht
6062     int rt=get_reg(i_regs->regmap,31);
6063     emit_movimm(start+i*4+8,rt);
6064     unconditional=1;
6065   }
6066   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6067   {
6068     emit_mov(s1l,addr);
6069     if(opcode2[i]==9) // JALR
6070     {
6071       int rt=get_reg(i_regs->regmap,31);
6072       emit_movimm(start+i*4+8,rt);
6073     }
6074   }
6075   if((opcode[i]&0x3f)==4) // BEQ
6076   {
6077     if(rs1[i]==rs2[i])
6078     {
6079       unconditional=1;
6080     }
6081     else
6082     #ifdef HAVE_CMOV_IMM
6083     if(s1h<0) {
6084       if(s2l>=0) emit_cmp(s1l,s2l);
6085       else emit_test(s1l,s1l);
6086       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6087     }
6088     else
6089     #endif
6090     {
6091       assert(s1l>=0);
6092       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6093       if(s1h>=0) {
6094         if(s2h>=0) emit_cmp(s1h,s2h);
6095         else emit_test(s1h,s1h);
6096         emit_cmovne_reg(alt,addr);
6097       }
6098       if(s2l>=0) emit_cmp(s1l,s2l);
6099       else emit_test(s1l,s1l);
6100       emit_cmovne_reg(alt,addr);
6101     }
6102   }
6103   if((opcode[i]&0x3f)==5) // BNE
6104   {
6105     #ifdef HAVE_CMOV_IMM
6106     if(s1h<0) {
6107       if(s2l>=0) emit_cmp(s1l,s2l);
6108       else emit_test(s1l,s1l);
6109       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6110     }
6111     else
6112     #endif
6113     {
6114       assert(s1l>=0);
6115       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6116       if(s1h>=0) {
6117         if(s2h>=0) emit_cmp(s1h,s2h);
6118         else emit_test(s1h,s1h);
6119         emit_cmovne_reg(alt,addr);
6120       }
6121       if(s2l>=0) emit_cmp(s1l,s2l);
6122       else emit_test(s1l,s1l);
6123       emit_cmovne_reg(alt,addr);
6124     }
6125   }
6126   if((opcode[i]&0x3f)==0x14) // BEQL
6127   {
6128     if(s1h>=0) {
6129       if(s2h>=0) emit_cmp(s1h,s2h);
6130       else emit_test(s1h,s1h);
6131       nottaken=(int)out;
6132       emit_jne(0);
6133     }
6134     if(s2l>=0) emit_cmp(s1l,s2l);
6135     else emit_test(s1l,s1l);
6136     if(nottaken) set_jump_target(nottaken,(int)out);
6137     nottaken=(int)out;
6138     emit_jne(0);
6139   }
6140   if((opcode[i]&0x3f)==0x15) // BNEL
6141   {
6142     if(s1h>=0) {
6143       if(s2h>=0) emit_cmp(s1h,s2h);
6144       else emit_test(s1h,s1h);
6145       taken=(int)out;
6146       emit_jne(0);
6147     }
6148     if(s2l>=0) emit_cmp(s1l,s2l);
6149     else emit_test(s1l,s1l);
6150     nottaken=(int)out;
6151     emit_jeq(0);
6152     if(taken) set_jump_target(taken,(int)out);
6153   }
6154   if((opcode[i]&0x3f)==6) // BLEZ
6155   {
6156     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6157     emit_cmpimm(s1l,1);
6158     if(s1h>=0) emit_mov(addr,ntaddr);
6159     emit_cmovl_reg(alt,addr);
6160     if(s1h>=0) {
6161       emit_test(s1h,s1h);
6162       emit_cmovne_reg(ntaddr,addr);
6163       emit_cmovs_reg(alt,addr);
6164     }
6165   }
6166   if((opcode[i]&0x3f)==7) // BGTZ
6167   {
6168     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6169     emit_cmpimm(s1l,1);
6170     if(s1h>=0) emit_mov(addr,alt);
6171     emit_cmovl_reg(ntaddr,addr);
6172     if(s1h>=0) {
6173       emit_test(s1h,s1h);
6174       emit_cmovne_reg(alt,addr);
6175       emit_cmovs_reg(ntaddr,addr);
6176     }
6177   }
6178   if((opcode[i]&0x3f)==0x16) // BLEZL
6179   {
6180     assert((opcode[i]&0x3f)!=0x16);
6181   }
6182   if((opcode[i]&0x3f)==0x17) // BGTZL
6183   {
6184     assert((opcode[i]&0x3f)!=0x17);
6185   }
6186   assert(opcode[i]!=1); // BLTZ/BGEZ
6187
6188   //FIXME: Check CSREG
6189   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6190     if((source[i]&0x30000)==0) // BC1F
6191     {
6192       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6193       emit_testimm(s1l,0x800000);
6194       emit_cmovne_reg(alt,addr);
6195     }
6196     if((source[i]&0x30000)==0x10000) // BC1T
6197     {
6198       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6199       emit_testimm(s1l,0x800000);
6200       emit_cmovne_reg(alt,addr);
6201     }
6202     if((source[i]&0x30000)==0x20000) // BC1FL
6203     {
6204       emit_testimm(s1l,0x800000);
6205       nottaken=(int)out;
6206       emit_jne(0);
6207     }
6208     if((source[i]&0x30000)==0x30000) // BC1TL
6209     {
6210       emit_testimm(s1l,0x800000);
6211       nottaken=(int)out;
6212       emit_jeq(0);
6213     }
6214   }
6215
6216   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6217   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6218   if(likely[i]||unconditional)
6219   {
6220     emit_movimm(ba[i],HOST_BTREG);
6221   }
6222   else if(addr!=HOST_BTREG)
6223   {
6224     emit_mov(addr,HOST_BTREG);
6225   }
6226   void *branch_addr=out;
6227   emit_jmp(0);
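  // start+i*4+5 is the delay slot address (start+i*4+4) with bit 0 set,
  // matching the vaddr used for delay-slot entry points in pagespan_ds.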
6228   int target_addr=start+i*4+5;
6229   void *stub=out;
6230   void *compiled_target_addr=check_addr(target_addr);
6231   emit_extjump_ds((int)branch_addr,target_addr);
6232   if(compiled_target_addr) {
6233     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6234     add_link(target_addr,stub);
6235   }
6236   else set_jump_target((int)branch_addr,(int)stub);
6237   if(likely[i]) {
6238     // Not-taken path
6239     set_jump_target((int)nottaken,(int)out);
6240     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6241     void *branch_addr=out;
6242     emit_jmp(0);
6243     int target_addr=start+i*4+8;
6244     void *stub=out;
6245     void *compiled_target_addr=check_addr(target_addr);
6246     emit_extjump_ds((int)branch_addr,target_addr);
6247     if(compiled_target_addr) {
6248       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6249       add_link(target_addr,stub);
6250     }
6251     else set_jump_target((int)branch_addr,(int)stub);
6252   }
6253 }
6254
6255 // Assemble the delay slot for the above
6256 static void pagespan_ds()
6257 {
6258   assem_debug("initial delay slot:\n");
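  // This entry point is registered at start+1: the low bit marks it as a
  // delay-slot entry, distinct from the normal entry at start.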
6259   u_int vaddr=start+1;
6260   u_int page=get_page(vaddr);
6261   u_int vpage=get_vpage(vaddr);
6262   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6263   do_dirty_stub_ds();
6264   ll_add(jump_in+page,vaddr,(void *)out);
6265   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6266   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6267     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6268   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6269     emit_writeword(HOST_BTREG,(int)&branch_target);
6270   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6271   address_generation(0,&regs[0],regs[0].regmap_entry);
6272   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6273     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6274   cop1_usable=0;
6275   is_delayslot=0;
6276   switch(itype[0]) {
6277     case ALU:
6278       alu_assemble(0,&regs[0]);break;
6279     case IMM16:
6280       imm16_assemble(0,&regs[0]);break;
6281     case SHIFT:
6282       shift_assemble(0,&regs[0]);break;
6283     case SHIFTIMM:
6284       shiftimm_assemble(0,&regs[0]);break;
6285     case LOAD:
6286       load_assemble(0,&regs[0]);break;
6287     case LOADLR:
6288       loadlr_assemble(0,&regs[0]);break;
6289     case STORE:
6290       store_assemble(0,&regs[0]);break;
6291     case STORELR:
6292       storelr_assemble(0,&regs[0]);break;
6293     case COP0:
6294       cop0_assemble(0,&regs[0]);break;
6295     case COP1:
6296       cop1_assemble(0,&regs[0]);break;
6297     case C1LS:
6298       c1ls_assemble(0,&regs[0]);break;
6299     case FCONV:
6300       fconv_assemble(0,&regs[0]);break;
6301     case FLOAT:
6302       float_assemble(0,&regs[0]);break;
6303     case FCOMP:
6304       fcomp_assemble(0,&regs[0]);break;
6305     case MULTDIV:
6306       multdiv_assemble(0,&regs[0]);break;
6307     case MOV:
6308       mov_assemble(0,&regs[0]);break;
6309     case SYSCALL:
6310     case HLECALL:
6311     case SPAN:
6312     case UJUMP:
6313     case RJUMP:
6314     case CJUMP:
6315     case SJUMP:
6316     case FJUMP:
6317       printf("Jump in the delay slot.  This is probably a bug.\n");
6318   }
6319   int btaddr=get_reg(regs[0].regmap,BTREG);
6320   if(btaddr<0) {
6321     btaddr=get_reg(regs[0].regmap,-1);
6322     emit_readword((int)&branch_target,btaddr);
6323   }
6324   assert(btaddr!=HOST_CCREG);
6325   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6326 #ifdef HOST_IMM8
6327   emit_movimm(start+4,HOST_TEMPREG);
6328   emit_cmp(btaddr,HOST_TEMPREG);
6329 #else
6330   emit_cmpimm(btaddr,start+4);
6331 #endif
6332   int branch=(int)out;
6333   emit_jeq(0);
6334   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6335   emit_jmp(jump_vaddr_reg[btaddr]);
6336   set_jump_target(branch,(int)out);
6337   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6338   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6339 }
6340
6341 // Basic liveness analysis for MIPS registers
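// Walks backwards from iend to istart, maintaining bitmasks of GPRs whose
// current values will never be read again (u for the lower 32 bits, uu for
// the upper halves).  The r argument limits recursion depth when following
// backward branches.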
6342 void unneeded_registers(int istart,int iend,int r)
6343 {
6344   int i;
6345   uint64_t u,uu,b,bu;
6346   uint64_t temp_u,temp_uu;
6347   uint64_t tdep;
6348   if(iend==slen-1) {
6349     u=1;uu=1;
6350   }else{
6351     u=unneeded_reg[iend+1];
6352     uu=unneeded_reg_upper[iend+1];
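    // The values loaded above are immediately overridden: only r0 (bit 0) is
    // assumed unneeded at the end of this range, so every other register is
    // kept live.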
6353     u=1;uu=1;
6354   }
6355   for (i=iend;i>=istart;i--)
6356   {
6357     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6358     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6359     {
6360       // If subroutine call, flag return address as a possible branch target
6361       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6362       
6363       if(ba[i]<start || ba[i]>=(start+slen*4))
6364       {
6365         // Branch out of this block, flush all regs
6366         u=1;
6367         uu=1;
6368         /* Hexagon hack 
6369         if(itype[i]==UJUMP&&rt1[i]==31)
6370         {
6371           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6372         }
6373         if(itype[i]==RJUMP&&rs1[i]==31)
6374         {
6375           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6376         }
6377         if(start>0x80000400&&start<0x80800000) {
6378           if(itype[i]==UJUMP&&rt1[i]==31)
6379           {
6380             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6381             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6382           }
6383           if(itype[i]==RJUMP&&rs1[i]==31)
6384           {
6385             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6386             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6387           }
6388         }*/
6389         branch_unneeded_reg[i]=u;
6390         branch_unneeded_reg_upper[i]=uu;
6391         // Merge in delay slot
6392         tdep=(~uu>>rt1[i+1])&1;
6393         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6394         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6395         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6396         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6397         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6398         u|=1;uu|=1;
6399         // If branch is "likely" (and conditional)
6400         // then we skip the delay slot on the fall-thru path
6401         if(likely[i]) {
6402           if(i<slen-1) {
6403             u&=unneeded_reg[i+2];
6404             uu&=unneeded_reg_upper[i+2];
6405           }
6406           else
6407           {
6408             u=1;
6409             uu=1;
6410           }
6411         }
6412       }
6413       else
6414       {
6415         // Internal branch, flag target
6416         bt[(ba[i]-start)>>2]=1;
6417         if(ba[i]<=start+i*4) {
6418           // Backward branch
6419           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6420           {
6421             // Unconditional branch
6422             temp_u=1;temp_uu=1;
6423           } else {
6424             // Conditional branch (not taken case)
6425             temp_u=unneeded_reg[i+2];
6426             temp_uu=unneeded_reg_upper[i+2];
6427           }
6428           // Merge in delay slot
6429           tdep=(~temp_uu>>rt1[i+1])&1;
6430           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6431           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6432           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6433           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6434           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6435           temp_u|=1;temp_uu|=1;
6436           // If branch is "likely" (and conditional)
6437           // then we skip the delay slot on the fall-thru path
6438           if(likely[i]) {
6439             if(i<slen-1) {
6440               temp_u&=unneeded_reg[i+2];
6441               temp_uu&=unneeded_reg_upper[i+2];
6442             }
6443             else
6444             {
6445               temp_u=1;
6446               temp_uu=1;
6447             }
6448           }
6449           tdep=(~temp_uu>>rt1[i])&1;
6450           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6451           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6452           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6453           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6454           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6455           temp_u|=1;temp_uu|=1;
6456           unneeded_reg[i]=temp_u;
6457           unneeded_reg_upper[i]=temp_uu;
6458           // Only go three levels deep.  This recursion can take an
6459           // excessive amount of time if there are a lot of nested loops.
6460           if(r<2) {
6461             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6462           }else{
6463             unneeded_reg[(ba[i]-start)>>2]=1;
6464             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6465           }
6466         } /*else*/ if(1) {
6467           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6468           {
6469             // Unconditional branch
6470             u=unneeded_reg[(ba[i]-start)>>2];
6471             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6472             branch_unneeded_reg[i]=u;
6473             branch_unneeded_reg_upper[i]=uu;
6474         //u=1;
6475         //uu=1;
6476         //branch_unneeded_reg[i]=u;
6477         //branch_unneeded_reg_upper[i]=uu;
6478             // Merge in delay slot
6479             tdep=(~uu>>rt1[i+1])&1;
6480             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6481             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6482             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6483             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6484             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6485             u|=1;uu|=1;
6486           } else {
6487             // Conditional branch
6488             b=unneeded_reg[(ba[i]-start)>>2];
6489             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6490             branch_unneeded_reg[i]=b;
6491             branch_unneeded_reg_upper[i]=bu;
6492         //b=1;
6493         //bu=1;
6494         //branch_unneeded_reg[i]=b;
6495         //branch_unneeded_reg_upper[i]=bu;
6496             // Branch delay slot
6497             tdep=(~uu>>rt1[i+1])&1;
6498             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6499             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6500             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6501             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6502             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6503             b|=1;bu|=1;
6504             // If branch is "likely" then we skip the
6505             // delay slot on the fall-thru path
6506             if(likely[i]) {
6507               u=b;
6508               uu=bu;
6509               if(i<slen-1) {
6510                 u&=unneeded_reg[i+2];
6511                 uu&=unneeded_reg_upper[i+2];
6512         //u=1;
6513         //uu=1;
6514               }
6515             } else {
6516               u&=b;
6517               uu&=bu;
6518         //u=1;
6519         //uu=1;
6520             }
6521             if(i<slen-1) {
6522               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6523               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6524         //branch_unneeded_reg[i]=1;
6525         //branch_unneeded_reg_upper[i]=1;
6526             } else {
6527               branch_unneeded_reg[i]=1;
6528               branch_unneeded_reg_upper[i]=1;
6529             }
6530           }
6531         }
6532       }
6533     }
6534     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
6535     {
6536       // SYSCALL instruction (software interrupt)
6537       u=1;
6538       uu=1;
6539     }
6540     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6541     {
6542       // ERET instruction (return from interrupt)
6543       u=1;
6544       uu=1;
6545     }
6546     //u=uu=1; // DEBUG
6547     tdep=(~uu>>rt1[i])&1;
6548     // Written registers are unneeded
6549     u|=1LL<<rt1[i];
6550     u|=1LL<<rt2[i];
6551     uu|=1LL<<rt1[i];
6552     uu|=1LL<<rt2[i];
6553     // Accessed registers are needed
6554     u&=~(1LL<<rs1[i]);
6555     u&=~(1LL<<rs2[i]);
6556     uu&=~(1LL<<us1[i]);
6557     uu&=~(1LL<<us2[i]);
6558     // Source-target dependencies
6559     uu&=~(tdep<<dep1[i]);
6560     uu&=~(tdep<<dep2[i]);
6561     // R0 is always unneeded
6562     u|=1;uu|=1;
6563     // Save it
6564     unneeded_reg[i]=u;
6565     unneeded_reg_upper[i]=uu;
6566 #ifdef FORCE32
6567     unneeded_reg_upper[i]=-1LL;
6568 #endif
6569     /*
6570     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6571     printf("U:");
6572     int r;
6573     for(r=1;r<=CCREG;r++) {
6574       if((unneeded_reg[i]>>r)&1) {
6575         if(r==HIREG) printf(" HI");
6576         else if(r==LOREG) printf(" LO");
6577         else printf(" r%d",r);
6578       }
6579     }
6580     printf(" UU:");
6581     for(r=1;r<=CCREG;r++) {
6582       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6583         if(r==HIREG) printf(" HI");
6584         else if(r==LOREG) printf(" LO");
6585         else printf(" r%d",r);
6586       }
6587     }
6588     printf("\n");*/
6589   }
6590 }
6591
6592 // Identify registers which are likely to contain 32-bit values
6593 // This is used to predict whether any branches will jump to a
6594 // location with 64-bit values in registers.
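     // As a rough sketch of the representation: is32 is a bitmask indexed by
     // guest register number; bit N set means register N is assumed to hold a
     // sign-extended 32-bit value at that point.  32-bit producers (ADDU, SLT,
     // LUI, ...) set the destination's bit, 64-bit producers (DADDU, DSLL, LD,
     // ...) clear it, and moves/logical ops propagate the source bits.  Bit 0
     // (r0) is always set.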
6595 static void provisional_32bit()
6596 {
6597   int i,j;
6598   uint64_t is32=1;
6599   uint64_t lastbranch=1;
6600   
6601   for(i=0;i<slen;i++)
6602   {
6603     if(i>0) {
6604       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6605         if(i>1) is32=lastbranch;
6606         else is32=1;
6607       }
6608     }
6609     if(i>1)
6610     {
6611       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6612         if(likely[i-2]) {
6613           if(i>2) is32=lastbranch;
6614           else is32=1;
6615         }
6616       }
6617       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6618       {
6619         if(rs1[i-2]==0||rs2[i-2]==0)
6620         {
6621           if(rs1[i-2]) {
6622             is32|=1LL<<rs1[i-2];
6623           }
6624           if(rs2[i-2]) {
6625             is32|=1LL<<rs2[i-2];
6626           }
6627         }
6628       }
6629     }
6630     // If something jumps here with 64-bit values
6631     // then promote those registers to 64 bits
6632     if(bt[i])
6633     {
6634       uint64_t temp_is32=is32;
6635       for(j=i-1;j>=0;j--)
6636       {
6637         if(ba[j]==start+i*4) 
6638           //temp_is32&=branch_regs[j].is32;
6639           temp_is32&=p32[j];
6640       }
6641       for(j=i;j<slen;j++)
6642       {
6643         if(ba[j]==start+i*4) 
6644           temp_is32=1;
6645       }
6646       is32=temp_is32;
6647     }
6648     int type=itype[i];
6649     int op=opcode[i];
6650     int op2=opcode2[i];
6651     int rt=rt1[i];
6652     int s1=rs1[i];
6653     int s2=rs2[i];
6654     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6655       // Branches don't write registers, consider the delay slot instead.
6656       type=itype[i+1];
6657       op=opcode[i+1];
6658       op2=opcode2[i+1];
6659       rt=rt1[i+1];
6660       s1=rs1[i+1];
6661       s2=rs2[i+1];
6662       lastbranch=is32;
6663     }
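         // Update is32 for the instruction selected above (the delay slot when
         // sitting on a branch).  A worked example of the propagation rules in
         // this switch: after "or r3,r1,r2" the bit for r3 ends up set only if
         // the bits for both r1 and r2 were set, since OR of two sign-extended
         // 32-bit values is again a sign-extended 32-bit value.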
6664     switch(type) {
6665       case LOAD:
6666         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6667            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6668           is32&=~(1LL<<rt);
6669         else
6670           is32|=1LL<<rt;
6671         break;
6672       case STORE:
6673       case STORELR:
6674         break;
6675       case LOADLR:
6676         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDL/LDR
6677         if(op==0x22) is32|=1LL<<rt; // LWL
6678         break;
6679       case IMM16:
6680         if (op==0x08||op==0x09|| // ADDI/ADDIU
6681             op==0x0a||op==0x0b|| // SLTI/SLTIU
6682             op==0x0c|| // ANDI
6683             op==0x0f)  // LUI
6684         {
6685           is32|=1LL<<rt;
6686         }
6687         if(op==0x18||op==0x19) { // DADDI/DADDIU
6688           is32&=~(1LL<<rt);
6689           //if(imm[i]==0)
6690           //  is32|=((is32>>s1)&1LL)<<rt;
6691         }
6692         if(op==0x0d||op==0x0e) { // ORI/XORI
6693           uint64_t sr=((is32>>s1)&1LL);
6694           is32&=~(1LL<<rt);
6695           is32|=sr<<rt;
6696         }
6697         break;
6698       case UJUMP:
6699         break;
6700       case RJUMP:
6701         break;
6702       case CJUMP:
6703         break;
6704       case SJUMP:
6705         break;
6706       case FJUMP:
6707         break;
6708       case ALU:
6709         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6710           is32|=1LL<<rt;
6711         }
6712         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6713           is32|=1LL<<rt;
6714         }
6715         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6716           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6717           is32&=~(1LL<<rt);
6718           is32|=sr<<rt;
6719         }
6720         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6721           if(s1==0&&s2==0) {
6722             is32|=1LL<<rt;
6723           }
6724           else if(s2==0) {
6725             uint64_t sr=((is32>>s1)&1LL);
6726             is32&=~(1LL<<rt);
6727             is32|=sr<<rt;
6728           }
6729           else if(s1==0) {
6730             uint64_t sr=((is32>>s2)&1LL);
6731             is32&=~(1LL<<rt);
6732             is32|=sr<<rt;
6733           }
6734           else {
6735             is32&=~(1LL<<rt);
6736           }
6737         }
6738         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6739           if(s1==0&&s2==0) {
6740             is32|=1LL<<rt;
6741           }
6742           else if(s2==0) {
6743             uint64_t sr=((is32>>s1)&1LL);
6744             is32&=~(1LL<<rt);
6745             is32|=sr<<rt;
6746           }
6747           else {
6748             is32&=~(1LL<<rt);
6749           }
6750         }
6751         break;
6752       case MULTDIV:
6753         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6754           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6755         }
6756         else {
6757           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6758         }
6759         break;
6760       case MOV:
6761         {
6762           uint64_t sr=((is32>>s1)&1LL);
6763           is32&=~(1LL<<rt);
6764           is32|=sr<<rt;
6765         }
6766         break;
6767       case SHIFT:
6768         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6769         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6770         break;
6771       case SHIFTIMM:
6772         is32|=1LL<<rt;
6773         // DSLL/DSRL/DSRA/DSLL32/DSRL32 (but not DSRA32) can produce a 64-bit result
6774         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6775         break;
6776       case COP0:
6777         if(op2==0) is32|=1LL<<rt; // MFC0
6778         break;
6779       case COP1:
6780         if(op2==0) is32|=1LL<<rt; // MFC1
6781         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6782         if(op2==2) is32|=1LL<<rt; // CFC1
6783         break;
6784       case C1LS:
6785         break;
6786       case FLOAT:
6787       case FCONV:
6788         break;
6789       case FCOMP:
6790         break;
6791       case SYSCALL:
6792       case HLECALL:
6793         break;
6794       default:
6795         break;
6796     }
6797     is32|=1;
6798     p32[i]=is32;
6799
6800     if(i>0)
6801     {
6802       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6803       {
6804         if(rt1[i-1]==31) // JAL/JALR
6805         {
6806           // Subroutine call will return here, don't alloc any registers
6807           is32=1;
6808         }
6809         else if(i+1<slen)
6810         {
6811           // Internal branch will jump here, match registers to caller
6812           is32=0x3FFFFFFFFLL;
6813         }
6814       }
6815     }
6816   }
6817 }
6818
6819 // Identify registers which may be assumed to contain 32-bit values
6820 // and where optimizations will rely on this.
6821 // This is used to determine whether backward branches can safely
6822 // jump to a location with 64-bit values in registers.
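     // Informally, pr32[i] is a provisional "requires_32bit" mask: registers
     // that must hold proper sign-extended 32-bit values at instruction i
     // because later code relies on it.  It is propagated backwards through
     // the block and merged across branch targets and delay slots.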
6823 static void provisional_r32()
6824 {
6825   u_int r32=0;
6826   int i;
6827   
6828   for (i=slen-1;i>=0;i--)
6829   {
6830     int hr;
6831     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6832     {
6833       if(ba[i]<start || ba[i]>=(start+slen*4))
6834       {
6835         // Branch out of this block, don't need anything
6836         r32=0;
6837       }
6838       else
6839       {
6840         // Internal branch
6841         // Need whatever matches the target
6842         // (and doesn't get overwritten by the delay slot instruction)
6843         r32=0;
6844         int t=(ba[i]-start)>>2;
6845         if(ba[i]>start+i*4) {
6846           // Forward branch
6847           //if(!(requires_32bit[t]&~regs[i].was32))
6848           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6849           if(!(pr32[t]&~regs[i].was32))
6850             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6851         }else{
6852           // Backward branch
6853           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6854             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6855         }
6856       }
6857       // Conditional branch may need registers for following instructions
6858       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6859       {
6860         if(i<slen-2) {
6861           //r32|=requires_32bit[i+2];
6862           r32|=pr32[i+2];
6863           r32&=regs[i].was32;
6864           // Mark this address as a branch target since it may be called
6865           // upon return from interrupt
6866           //bt[i+2]=1;
6867         }
6868       }
6869       // Merge in delay slot
6870       if(!likely[i]) {
6871         // These are overwritten unless the branch is "likely"
6872         // and the delay slot is nullified if not taken
6873         r32&=~(1LL<<rt1[i+1]);
6874         r32&=~(1LL<<rt2[i+1]);
6875       }
6876       // Assume these are needed (delay slot)
6877       if(us1[i+1]>0)
6878       {
6879         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6880       }
6881       if(us2[i+1]>0)
6882       {
6883         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6884       }
6885       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6886       {
6887         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6888       }
6889       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6890       {
6891         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6892       }
6893     }
6894     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
6895     {
6896       // SYSCALL instruction (software interrupt)
6897       r32=0;
6898     }
6899     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6900     {
6901       // ERET instruction (return from interrupt)
6902       r32=0;
6903     }
6904     // Check 32 bits
6905     r32&=~(1LL<<rt1[i]);
6906     r32&=~(1LL<<rt2[i]);
6907     if(us1[i]>0)
6908     {
6909       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6910     }
6911     if(us2[i]>0)
6912     {
6913       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6914     }
6915     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6916     {
6917       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6918     }
6919     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6920     {
6921       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6922     }
6923     //requires_32bit[i]=r32;
6924     pr32[i]=r32;
6925     
6926     // Dirty registers which are 32-bit, require 32-bit input
6927     // as they will be written as 32-bit values
6928     for(hr=0;hr<HOST_REGS;hr++)
6929     {
6930       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6931         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6932           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6933             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6934           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6935         }
6936       }
6937     }
6938   }
6939 }
6940
6941 // Write back dirty registers as soon as we will no longer modify them,
6942 // so that we don't end up with lots of writes at the branches.
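     // A note on the bookkeeping (informal): will_dirty[] and wont_dirty[] are
     // bitmasks over host registers, filled in by the backwards scan below.
     // When wr is nonzero the results are folded into regs[i].dirty and
     // regs[i].wasdirty so the assembler knows which cached values still need
     // writing back at each point; with wr==0 only the masks are recorded
     // (used when recursing into loop bodies).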
6943 void clean_registers(int istart,int iend,int wr)
6944 {
6945   int i;
6946   int r;
6947   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6948   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6949   if(iend==slen-1) {
6950     will_dirty_i=will_dirty_next=0;
6951     wont_dirty_i=wont_dirty_next=0;
6952   }else{
6953     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6954     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6955   }
6956   for (i=iend;i>=istart;i--)
6957   {
6958     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6959     {
6960       if(ba[i]<start || ba[i]>=(start+slen*4))
6961       {
6962         // Branch out of this block, flush all regs
6963         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6964         {
6965           // Unconditional branch
6966           will_dirty_i=0;
6967           wont_dirty_i=0;
6968           // Merge in delay slot (will dirty)
6969           for(r=0;r<HOST_REGS;r++) {
6970             if(r!=EXCLUDE_REG) {
6971               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6972               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6973               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6974               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6975               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6976               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6977               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6978               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6979               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6980               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6981               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6982               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6983               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6984               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6985             }
6986           }
6987         }
6988         else
6989         {
6990           // Conditional branch
6991           will_dirty_i=0;
6992           wont_dirty_i=wont_dirty_next;
6993           // Merge in delay slot (will dirty)
6994           for(r=0;r<HOST_REGS;r++) {
6995             if(r!=EXCLUDE_REG) {
6996               if(!likely[i]) {
6997                 // Might not dirty if likely branch is not taken
6998                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6999                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7000                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7001                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7002                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7003                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7004                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7005                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7006                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7007                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7008                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7009                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7010                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7011                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7012               }
7013             }
7014           }
7015         }
7016         // Merge in delay slot (wont dirty)
7017         for(r=0;r<HOST_REGS;r++) {
7018           if(r!=EXCLUDE_REG) {
7019             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7020             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7021             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7022             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7023             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7024             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7025             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7026             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7027             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7028             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7029           }
7030         }
7031         if(wr) {
7032           #ifndef DESTRUCTIVE_WRITEBACK
7033           branch_regs[i].dirty&=wont_dirty_i;
7034           #endif
7035           branch_regs[i].dirty|=will_dirty_i;
7036         }
7037       }
7038       else
7039       {
7040         // Internal branch
7041         if(ba[i]<=start+i*4) {
7042           // Backward branch
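               // Roughly: compute the dirty state at this branch into the
               // temp_* masks (starting from the fall-through state for a
               // conditional branch).  When wr is set the masks are recorded
               // and the loop body (branch target up to this branch) is
               // re-scanned with wr==0; otherwise the recursion is skipped to
               // bound compile time, as noted below.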
7043           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7044           {
7045             // Unconditional branch
7046             temp_will_dirty=0;
7047             temp_wont_dirty=0;
7048             // Merge in delay slot (will dirty)
7049             for(r=0;r<HOST_REGS;r++) {
7050               if(r!=EXCLUDE_REG) {
7051                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7052                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7053                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7054                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7055                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7056                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7057                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7058                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7059                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7060                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7061                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7062                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7063                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7064                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7065               }
7066             }
7067           } else {
7068             // Conditional branch (not taken case)
7069             temp_will_dirty=will_dirty_next;
7070             temp_wont_dirty=wont_dirty_next;
7071             // Merge in delay slot (will dirty)
7072             for(r=0;r<HOST_REGS;r++) {
7073               if(r!=EXCLUDE_REG) {
7074                 if(!likely[i]) {
7075                   // Will not dirty if likely branch is not taken
7076                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7077                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7078                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7079                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7080                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7081                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7082                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7083                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7084                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7085                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7086                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7087                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7088                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7089                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7090                 }
7091               }
7092             }
7093           }
7094           // Merge in delay slot (wont dirty)
7095           for(r=0;r<HOST_REGS;r++) {
7096             if(r!=EXCLUDE_REG) {
7097               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7098               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7099               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7100               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7101               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7102               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7103               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7104               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7105               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7106               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7107             }
7108           }
7109           // Deal with changed mappings
7110           if(i<iend) {
7111             for(r=0;r<HOST_REGS;r++) {
7112               if(r!=EXCLUDE_REG) {
7113                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7114                   temp_will_dirty&=~(1<<r);
7115                   temp_wont_dirty&=~(1<<r);
7116                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7117                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7118                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7119                   } else {
7120                     temp_will_dirty|=1<<r;
7121                     temp_wont_dirty|=1<<r;
7122                   }
7123                 }
7124               }
7125             }
7126           }
7127           if(wr) {
7128             will_dirty[i]=temp_will_dirty;
7129             wont_dirty[i]=temp_wont_dirty;
7130             clean_registers((ba[i]-start)>>2,i-1,0);
7131           }else{
7132             // Limit recursion.  It can take an excessive amount
7133             // of time if there are a lot of nested loops.
7134             will_dirty[(ba[i]-start)>>2]=0;
7135             wont_dirty[(ba[i]-start)>>2]=-1;
7136           }
7137         }
7138         /*else*/ if(1)
7139         {
7140           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7141           {
7142             // Unconditional branch
7143             will_dirty_i=0;
7144             wont_dirty_i=0;
7145           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7146             for(r=0;r<HOST_REGS;r++) {
7147               if(r!=EXCLUDE_REG) {
7148                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7149                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7150                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7151                 }
7152               }
7153             }
7154           //}
7155             // Merge in delay slot
7156             for(r=0;r<HOST_REGS;r++) {
7157               if(r!=EXCLUDE_REG) {
7158                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7159                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7160                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7161                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7162                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7163                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7164                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7165                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7166                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7167                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7168                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7169                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7170                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7171                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7172               }
7173             }
7174           } else {
7175             // Conditional branch
7176             will_dirty_i=will_dirty_next;
7177             wont_dirty_i=wont_dirty_next;
7178           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7179             for(r=0;r<HOST_REGS;r++) {
7180               if(r!=EXCLUDE_REG) {
7181                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7182                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7183                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7184                 }
7185                 else
7186                 {
7187                   will_dirty_i&=~(1<<r);
7188                 }
7189                 // Treat delay slot as part of branch too
7190                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7191                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7192                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7193                 }
7194                 else
7195                 {
7196                   will_dirty[i+1]&=~(1<<r);
7197                 }*/
7198               }
7199             }
7200           //}
7201             // Merge in delay slot
7202             for(r=0;r<HOST_REGS;r++) {
7203               if(r!=EXCLUDE_REG) {
7204                 if(!likely[i]) {
7205                   // Might not dirty if likely branch is not taken
7206                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7207                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7208                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7209                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7210                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7211                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7212                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7213                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7214                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7215                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7216                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7217                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7218                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7219                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7220                 }
7221               }
7222             }
7223           }
7224           // Merge in delay slot
7225           for(r=0;r<HOST_REGS;r++) {
7226             if(r!=EXCLUDE_REG) {
7227               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7228               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7229               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7230               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7231               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7232               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7233               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7234               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7235               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7236               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7237             }
7238           }
7239           if(wr) {
7240             #ifndef DESTRUCTIVE_WRITEBACK
7241             branch_regs[i].dirty&=wont_dirty_i;
7242             #endif
7243             branch_regs[i].dirty|=will_dirty_i;
7244           }
7245         }
7246       }
7247     }
7248     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
7249     {
7250       // SYSCALL instruction (software interrupt)
7251       will_dirty_i=0;
7252       wont_dirty_i=0;
7253     }
7254     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7255     {
7256       // ERET instruction (return from interrupt)
7257       will_dirty_i=0;
7258       wont_dirty_i=0;
7259     }
7260     will_dirty_next=will_dirty_i;
7261     wont_dirty_next=wont_dirty_i;
7262     for(r=0;r<HOST_REGS;r++) {
7263       if(r!=EXCLUDE_REG) {
7264         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7265         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7266         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7267         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7268         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7269         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7270         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7271         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7272         if(i>istart) {
7273           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7274           {
7275             // Don't store a register immediately after writing it,
7276             // may prevent dual-issue.
7277             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7278             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7279           }
7280         }
7281       }
7282     }
7283     // Save it
7284     will_dirty[i]=will_dirty_i;
7285     wont_dirty[i]=wont_dirty_i;
7286     // Mark registers that won't be dirtied as not dirty
7287     if(wr) {
7288       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7289       for(r=0;r<HOST_REGS;r++) {
7290         if((will_dirty_i>>r)&1) {
7291           printf(" r%d",r);
7292         }
7293       }
7294       printf("\n");*/
7295
7296       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7297         regs[i].dirty|=will_dirty_i;
7298         #ifndef DESTRUCTIVE_WRITEBACK
7299         regs[i].dirty&=wont_dirty_i;
7300         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7301         {
7302           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7303             for(r=0;r<HOST_REGS;r++) {
7304               if(r!=EXCLUDE_REG) {
7305                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7306                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7307                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7308               }
7309             }
7310           }
7311         }
7312         else
7313         {
7314           if(i<iend) {
7315             for(r=0;r<HOST_REGS;r++) {
7316               if(r!=EXCLUDE_REG) {
7317                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7318                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7319                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7320               }
7321             }
7322           }
7323         }
7324         #endif
7325       //}
7326     }
7327     // Deal with changed mappings
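         // Roughly three cases in the loop below: the host register still
         // holds the same guest register (carry the dirty info through), the
         // guest register moved to a different host register (take the bits
         // from its new home), or the old mapping is gone (drop the bits; if
         // the old guest register is unneeded at this point its bits are set
         // instead, since the value no longer has to be preserved).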
7328     temp_will_dirty=will_dirty_i;
7329     temp_wont_dirty=wont_dirty_i;
7330     for(r=0;r<HOST_REGS;r++) {
7331       if(r!=EXCLUDE_REG) {
7332         int nr;
7333         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7334           if(wr) {
7335             #ifndef DESTRUCTIVE_WRITEBACK
7336             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7337             #endif
7338             regs[i].wasdirty|=will_dirty_i&(1<<r);
7339           }
7340         }
7341         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7342           // Register moved to a different register
7343           will_dirty_i&=~(1<<r);
7344           wont_dirty_i&=~(1<<r);
7345           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7346           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7347           if(wr) {
7348             #ifndef DESTRUCTIVE_WRITEBACK
7349             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7350             #endif
7351             regs[i].wasdirty|=will_dirty_i&(1<<r);
7352           }
7353         }
7354         else {
7355           will_dirty_i&=~(1<<r);
7356           wont_dirty_i&=~(1<<r);
7357           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7358             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7359             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7360           } else {
7361             wont_dirty_i|=1<<r;
7362             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7363           }
7364         }
7365       }
7366     }
7367   }
7368 }
7369
7370   /* disassembly */
7371 void disassemble_inst(int i)
7372 {
7373     if (bt[i]) printf("*"); else printf(" ");
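         // Branch targets below are reconstructed from the instruction word:
         // ((signed)(source[i]<<16))>>14 sign-extends the 16-bit offset and
         // multiplies it by 4, relative to the delay-slot address
         // (start+i*4+4), matching the MIPS branch encoding.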
7374     switch(itype[i]) {
7375       case UJUMP:
7376         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7377       case CJUMP:
7378         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7379       case SJUMP:
7380         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7381       case FJUMP:
7382         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7383       case RJUMP:
7384         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7385       case SPAN:
7386         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7387       case IMM16:
7388         if(opcode[i]==0xf) //LUI
7389           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7390         else
7391           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7392         break;
7393       case LOAD:
7394       case LOADLR:
7395         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7396         break;
7397       case STORE:
7398       case STORELR:
7399         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7400         break;
7401       case ALU:
7402       case SHIFT:
7403         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7404         break;
7405       case MULTDIV:
7406         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7407         break;
7408       case SHIFTIMM:
7409         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7410         break;
7411       case MOV:
7412         if((opcode2[i]&0x1d)==0x10)
7413           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7414         else if((opcode2[i]&0x1d)==0x11)
7415           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7416         else
7417           printf (" %x: %s\n",start+i*4,insn[i]);
7418         break;
7419       case COP0:
7420         if(opcode2[i]==0)
7421           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7422         else if(opcode2[i]==4)
7423           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7424         else printf (" %x: %s\n",start+i*4,insn[i]);
7425         break;
7426       case COP1:
7427         if(opcode2[i]<3)
7428           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7429         else if(opcode2[i]>3)
7430           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7431         else printf (" %x: %s\n",start+i*4,insn[i]);
7432         break;
7433       case C1LS:
7434         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7435         break;
7436       default:
7437         //printf (" %s %8x\n",insn[i],source[i]);
7438         printf (" %x: %s\n",start+i*4,insn[i]);
7439     }
7440 }
7441
7442 void new_dynarec_init()
7443 {
7444   printf("Init new dynarec\n");
7445   out=(u_char *)BASE_ADDR;
7446   if (mmap (out, 1<<TARGET_SIZE_2,
7447             PROT_READ | PROT_WRITE | PROT_EXEC,
7448             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7449             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7450 #ifdef MUPEN64
7451   rdword=&readmem_dword;
7452   fake_pc.f.r.rs=&readmem_dword;
7453   fake_pc.f.r.rt=&readmem_dword;
7454   fake_pc.f.r.rd=&readmem_dword;
7455 #endif
7456   int n;
7457   for(n=0x80000;n<0x80800;n++)
7458     invalid_code[n]=1;
7459   for(n=0;n<65536;n++)
7460     hash_table[n][0]=hash_table[n][2]=-1;
7461   memset(mini_ht,-1,sizeof(mini_ht));
7462   memset(restore_candidate,0,sizeof(restore_candidate));
7463   copy=shadow;
7464   expirep=16384; // Expiry pointer, +2 blocks
7465   pending_exception=0;
7466   literalcount=0;
7467 #ifdef HOST_IMM8
7468   // Copy this into local area so we don't have to put it in every literal pool
7469   invc_ptr=invalid_code;
7470 #endif
7471   stop_after_jal=0;
7472   // TLB
7473   using_tlb=0;
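       // memory_map[] has one entry per 4KB guest page; a non-negative entry
       // holds (host_base - guest_base)>>2, so guest_addr+(memory_map[page]<<2)
       // gives the host address, and -1 marks an unmapped page.  By default
       // only the 8MB of RDRAM at 0x80000000 is mapped here.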
7474   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7475     memory_map[n]=-1;
7476   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7477     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7478   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7479     memory_map[n]=-1;
7480 #ifdef MUPEN64
7481   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7482     writemem[n] = write_nomem_new;
7483     writememb[n] = write_nomemb_new;
7484     writememh[n] = write_nomemh_new;
7485 #ifndef FORCE32
7486     writememd[n] = write_nomemd_new;
7487 #endif
7488     readmem[n] = read_nomem_new;
7489     readmemb[n] = read_nomemb_new;
7490     readmemh[n] = read_nomemh_new;
7491 #ifndef FORCE32
7492     readmemd[n] = read_nomemd_new;
7493 #endif
7494   }
7495   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7496     writemem[n] = write_rdram_new;
7497     writememb[n] = write_rdramb_new;
7498     writememh[n] = write_rdramh_new;
7499 #ifndef FORCE32
7500     writememd[n] = write_rdramd_new;
7501 #endif
7502   }
7503   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7504     writemem[n] = write_nomem_new;
7505     writememb[n] = write_nomemb_new;
7506     writememh[n] = write_nomemh_new;
7507 #ifndef FORCE32
7508     writememd[n] = write_nomemd_new;
7509 #endif
7510     readmem[n] = read_nomem_new;
7511     readmemb[n] = read_nomemb_new;
7512     readmemh[n] = read_nomemh_new;
7513 #ifndef FORCE32
7514     readmemd[n] = read_nomemd_new;
7515 #endif
7516   }
7517 #endif
7518   tlb_hacks();
7519   arch_init();
7520 }
7521
7522 void new_dynarec_cleanup()
7523 {
7524   int n;
7525   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7526   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7527   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7528   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7529   #ifdef ROM_COPY
7530   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7531   #endif
7532 }
7533
7534 int new_recompile_block(int addr)
7535 {
7536 /*
7537   if(addr==0x800cd050) {
7538     int block;
7539     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7540     int n;
7541     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7542   }
7543 */
7544   //if(Count==365117028) tracedebug=1;
7545   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7546   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7547   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7548   //if(debug) 
7549   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7550   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7551   /*if(Count>=312978186) {
7552     rlist();
7553   }*/
7554   //rlist();
7555   start = (u_int)addr&~3;
7556   //assert(((u_int)addr&1)==0);
7557 #ifdef PCSX
7558   if (Config.HLE && start == 0x80001000) {
7559     // XXX: is this enough? Maybe check hleSoftCall?
7560     u_int page=get_page(start);
7561     ll_add(jump_in+page,start,out);
7562     invalid_code[start>>12]=0;
7563     emit_movimm(start,0);
7564     emit_writeword(0,(int)&pcaddr);
7565     emit_jmp((int)new_dyna_leave); // enough??
7566     return 0;
7567   }
7568   else if ((u_int)addr < 0x00200000) {
7569     // used for BIOS calls mostly?
7570     source = (u_int *)((u_int)rdram+start-0);
7571     pagelimit = 0x00200000;
7572   }
7573   else
7574 #endif
7575 #ifdef MUPEN64
7576   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7577     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7578     pagelimit = 0xa4001000;
7579   }
7580   else
7581 #endif
7582   if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7583     source = (u_int *)((u_int)rdram+start-0x80000000);
7584     pagelimit = 0x80800000;
7585   }
7586 #ifndef DISABLE_TLB
7587   else if ((signed int)addr >= (signed int)0xC0000000) {
7588     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7589     //if(tlb_LUT_r[start>>12])
7590       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7591     if((signed int)memory_map[start>>12]>=0) {
7592       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7593       pagelimit=(start+4096)&0xFFFFF000;
7594       int map=memory_map[start>>12];
7595       int i;
7596       for(i=0;i<5;i++) {
7597         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7598         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7599       }
7600       assem_debug("pagelimit=%x\n",pagelimit);
7601       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7602     }
7603     else {
7604       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7605       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7606       return 1; // Caller will invoke exception handler
7607     }
7608     //printf("source= %x\n",(int)source);
7609   }
7610 #endif
7611   else {
7612     printf("Compile at bogus memory address: %x \n", (int)addr);
7613     exit(1);
7614   }
7615
7616   /* Pass 1: disassemble */
7617   /* Pass 2: register dependencies, branch targets */
7618   /* Pass 3: register allocation */
7619   /* Pass 4: branch dependencies */
7620   /* Pass 5: pre-alloc */
7621   /* Pass 6: optimize clean/dirty state */
7622   /* Pass 7: flag 32-bit registers */
7623   /* Pass 8: assembly */
7624   /* Pass 9: linker */
7625   /* Pass 10: garbage collection / free memory */
7626
7627   int i,j;
7628   int done=0;
7629   unsigned int type,op,op2;
7630
7631   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7632   
7633   /* Pass 1 disassembly */
7634
7635   for(i=0;!done;i++) {
7636     bt[i]=0;likely[i]=0;op2=0;
7637     opcode[i]=op=source[i]>>26;
7638     switch(op)
7639     {
7640       case 0x00: strcpy(insn[i],"special"); type=NI;
7641         op2=source[i]&0x3f;
7642         switch(op2)
7643         {
7644           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7645           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7646           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7647           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7648           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7649           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7650           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7651           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7652           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7653           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7654           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7655           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7656           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7657           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7658           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7659           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7660           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7661           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7662           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7663           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7664           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7665           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7666           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7667           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7668           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7669           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7670           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7671           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7672           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7673           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7674           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7675           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7676           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7677           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7678           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7679           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7680           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7681           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7682           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7683           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7684           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7685           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7686           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7687           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7688           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7689           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7690           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7691           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7692           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7693           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7694           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7695           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7696         }
7697         break;
7698       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7699         op2=(source[i]>>16)&0x1f;
7700         switch(op2)
7701         {
7702           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7703           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7704           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7705           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7706           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7707           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7708           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7709           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7710           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7711           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7712           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7713           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7714           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7715           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7716         }
7717         break;
7718       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7719       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7720       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7721       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7722       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7723       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7724       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7725       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7726       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7727       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7728       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7729       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7730       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7731       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7732       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7733         op2=(source[i]>>21)&0x1f;
7734         switch(op2)
7735         {
7736           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7737           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7738           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7739           switch(source[i]&0x3f)
7740           {
7741             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7742             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7743             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7744             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7745             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7746           }
7747         }
7748         break;
7749       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7750         op2=(source[i]>>21)&0x1f;
7751         switch(op2)
7752         {
7753           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7754           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7755           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7756           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7757           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7758           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7759           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7760           switch((source[i]>>16)&0x3)
7761           {
7762             case 0x00: strcpy(insn[i],"BC1F"); break;
7763             case 0x01: strcpy(insn[i],"BC1T"); break;
7764             case 0x02: strcpy(insn[i],"BC1FL"); break;
7765             case 0x03: strcpy(insn[i],"BC1TL"); break;
7766           }
7767           break;
7768           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7769           switch(source[i]&0x3f)
7770           {
7771             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7772             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7773             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7774             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7775             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7776             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7777             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7778             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7779             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7780             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7781             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7782             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7783             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7784             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7785             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7786             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7787             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7788             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7789             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7790             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7791             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7792             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7793             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7794             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7795             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7796             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7797             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7798             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7799             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7800             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7801             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7802             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7803             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7804             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7805             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7806           }
7807           break;
7808           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7809           switch(source[i]&0x3f)
7810           {
7811             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7812             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7813             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7814             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7815             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7816             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7817             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7818             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7819             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7820             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7821             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7822             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7823             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7824             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7825             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7826             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7827             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7828             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7829             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7830             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7831             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7832             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7833             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7834             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7835             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7836             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7837             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7838             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7839             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7840             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7841             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7842             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7843             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7844             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7845             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7846           }
7847           break;
7848           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7849           switch(source[i]&0x3f)
7850           {
7851             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7852             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7853           }
7854           break;
7855           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7856           switch(source[i]&0x3f)
7857           {
7858             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7859             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7860           }
7861           break;
7862         }
7863         break;
7864       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7865       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7866       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7867       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7868       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7869       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7870       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7871       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7872       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7873       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7874       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7875       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7876       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7877       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7878       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7879       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7880       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7881       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7882       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7883       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7884       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7885       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7886       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7887       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7888       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7889       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7890       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7891       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7892       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7893       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7894       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7895 #ifdef PCSX
7896       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7897 #endif
7898       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7899       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7900       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7901       default: strcpy(insn[i],"???"); type=NI;
7902         printf("NI %08x @%08x\n", source[i], addr + i*4);
7903         break;
7904     }
7905     itype[i]=type;
7906     opcode2[i]=op2;
7907     /* Get registers/immediates */
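    // rs1/rs2 hold the guest source register numbers, rt1/rt2 the
    // destination(s), and imm the decoded immediate.  us1/us2 mark sources
    // read as full 64-bit values, dep1/dep2 mark sources whose upper half
    // the result's upper half depends on; both feed the 32/64-bit tracking
    // in the later passes.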
7908     lt1[i]=0;
7909     us1[i]=0;
7910     us2[i]=0;
7911     dep1[i]=0;
7912     dep2[i]=0;
7913     switch(type) {
7914       case LOAD:
7915         rs1[i]=(source[i]>>21)&0x1f;
7916         rs2[i]=0;
7917         rt1[i]=(source[i]>>16)&0x1f;
7918         rt2[i]=0;
7919         imm[i]=(short)source[i];
7920         break;
7921       case STORE:
7922       case STORELR:
7923         rs1[i]=(source[i]>>21)&0x1f;
7924         rs2[i]=(source[i]>>16)&0x1f;
7925         rt1[i]=0;
7926         rt2[i]=0;
7927         imm[i]=(short)source[i];
7928         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7929         break;
7930       case LOADLR:
7931         // LWL/LWR only load part of the register,
7932         // therefore the target register must be treated as a source too
7933         rs1[i]=(source[i]>>21)&0x1f;
7934         rs2[i]=(source[i]>>16)&0x1f;
7935         rt1[i]=(source[i]>>16)&0x1f;
7936         rt2[i]=0;
7937         imm[i]=(short)source[i];
7938         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7939         if(op==0x26) dep1[i]=rt1[i]; // LWR
7940         break;
7941       case IMM16:
7942         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7943         else rs1[i]=(source[i]>>21)&0x1f;
7944         rs2[i]=0;
7945         rt1[i]=(source[i]>>16)&0x1f;
7946         rt2[i]=0;
7947         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7948           imm[i]=(unsigned short)source[i];
7949         }else{
7950           imm[i]=(short)source[i];
7951         }
7952         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7953         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7954         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7955         break;
7956       case UJUMP:
7957         rs1[i]=0;
7958         rs2[i]=0;
7959         rt1[i]=0;
7960         rt2[i]=0;
7961         // The JAL instruction writes to r31.
7962         if (op&1) {
7963           rt1[i]=31;
7964         }
7965         rs2[i]=CCREG;
7966         break;
7967       case RJUMP:
7968         rs1[i]=(source[i]>>21)&0x1f;
7969         rs2[i]=0;
7970         rt1[i]=0;
7971         rt2[i]=0;
7972         // The JALR instruction writes to r31.
7973         if (op2&1) {
7974           rt1[i]=31;   
7975         }
7976         rs2[i]=CCREG;
7977         break;
7978       case CJUMP:
7979         rs1[i]=(source[i]>>21)&0x1f;
7980         rs2[i]=(source[i]>>16)&0x1f;
7981         rt1[i]=0;
7982         rt2[i]=0;
7983         if(op&2) { // BGTZ/BLEZ
7984           rs2[i]=0;
7985         }
7986         us1[i]=rs1[i];
7987         us2[i]=rs2[i];
7988         likely[i]=op>>4;
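        // Branch-likely opcodes (BEQL/BNEL/BLEZL/BGTZL, 0x14-0x17) differ
        // from the plain branches (0x04-0x07) only in bit 4, so op>>4 is 1
        // exactly for the "likely" forms.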
7989         break;
7990       case SJUMP:
7991         rs1[i]=(source[i]>>21)&0x1f;
7992         rs2[i]=CCREG;
7993         rt1[i]=0;
7994         rt2[i]=0;
7995         us1[i]=rs1[i];
7996         if(op2&0x10) { // BxxAL
7997           rt1[i]=31;
7998           // NOTE: If the branch is not taken, r31 is still overwritten
7999         }
8000         likely[i]=(op2&2)>>1;
8001         break;
8002       case FJUMP:
8003         rs1[i]=FSREG;
8004         rs2[i]=CSREG;
8005         rt1[i]=0;
8006         rt2[i]=0;
8007         likely[i]=((source[i])>>17)&1;
8008         break;
8009       case ALU:
8010         rs1[i]=(source[i]>>21)&0x1f; // source
8011         rs2[i]=(source[i]>>16)&0x1f; // second operand (rt field)
8012         rt1[i]=(source[i]>>11)&0x1f; // destination
8013         rt2[i]=0;
8014         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8015           us1[i]=rs1[i];us2[i]=rs2[i];
8016         }
8017         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8018           dep1[i]=rs1[i];dep2[i]=rs2[i];
8019         }
8020         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8021           dep1[i]=rs1[i];dep2[i]=rs2[i];
8022         }
8023         break;
8024       case MULTDIV:
8025         rs1[i]=(source[i]>>21)&0x1f; // source
8026         rs2[i]=(source[i]>>16)&0x1f; // divisor
8027         rt1[i]=HIREG;
8028         rt2[i]=LOREG;
8029         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8030           us1[i]=rs1[i];us2[i]=rs2[i];
8031         }
8032         break;
8033       case MOV:
8034         rs1[i]=0;
8035         rs2[i]=0;
8036         rt1[i]=0;
8037         rt2[i]=0;
8038         if(op2==0x10) rs1[i]=HIREG; // MFHI
8039         if(op2==0x11) rt1[i]=HIREG; // MTHI
8040         if(op2==0x12) rs1[i]=LOREG; // MFLO
8041         if(op2==0x13) rt1[i]=LOREG; // MTLO
8042         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8043         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8044         dep1[i]=rs1[i];
8045         break;
8046       case SHIFT:
8047         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8048         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8049         rt1[i]=(source[i]>>11)&0x1f; // destination
8050         rt2[i]=0;
8051         // DSLLV/DSRLV/DSRAV are 64-bit
8052         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8053         break;
8054       case SHIFTIMM:
8055         rs1[i]=(source[i]>>16)&0x1f;
8056         rs2[i]=0;
8057         rt1[i]=(source[i]>>11)&0x1f;
8058         rt2[i]=0;
8059         imm[i]=(source[i]>>6)&0x1f;
8060         // DSxx32 instructions
8061         if(op2>=0x3c) imm[i]|=0x20;
8062         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8063         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8064         break;
8065       case COP0:
8066         rs1[i]=0;
8067         rs2[i]=0;
8068         rt1[i]=0;
8069         rt2[i]=0;
8070         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8071         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8072         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8073         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8074         break;
8075       case COP1:
8076         rs1[i]=0;
8077         rs2[i]=0;
8078         rt1[i]=0;
8079         rt2[i]=0;
8080         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8081         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8082         if(op2==5) us1[i]=rs1[i]; // DMTC1
8083         rs2[i]=CSREG;
8084         break;
8085       case C1LS:
8086         rs1[i]=(source[i]>>21)&0x1F;
8087         rs2[i]=CSREG;
8088         rt1[i]=0;
8089         rt2[i]=0;
8090         imm[i]=(short)source[i];
8091         break;
8092       case FLOAT:
8093       case FCONV:
8094         rs1[i]=0;
8095         rs2[i]=CSREG;
8096         rt1[i]=0;
8097         rt2[i]=0;
8098         break;
8099       case FCOMP:
8100         rs1[i]=FSREG;
8101         rs2[i]=CSREG;
8102         rt1[i]=FSREG;
8103         rt2[i]=0;
8104         break;
8105       case SYSCALL:
8106       case HLECALL:
8107         rs1[i]=CCREG;
8108         rs2[i]=0;
8109         rt1[i]=0;
8110         rt2[i]=0;
8111         break;
8112       default:
8113         rs1[i]=0;
8114         rs2[i]=0;
8115         rt1[i]=0;
8116         rt2[i]=0;
8117     }
8118     /* Calculate branch target addresses */
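    // J/JAL: the top 4 bits of the delay-slot PC are kept and the 26-bit
    // instr_index field, shifted left 2 ((source<<6)>>4), supplies the rest.
    // Conditional branches: delay-slot PC plus the sign-extended 16-bit
    // offset times 4 (((int)(source<<16))>>14).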
8119     if(type==UJUMP)
8120       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8121     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8122       ba[i]=start+i*4+8; // BNE(L) with rs==rt, or BGTZ(L) on $zero, is never taken
8123     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8124       ba[i]=start+i*4+8; // BLTZ(AL)(L) on $zero is never taken
8125     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8126       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8127     else ba[i]=-1;
8128     /* Is this the end of the block? */
8129     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8130       if(rt1[i-1]!=31) { // Not a subroutine call (JAL/JALR write r31), so the block normally ends here
8131         done=1;
8132         // Does the block continue due to a branch?
8133         for(j=i-1;j>=0;j--)
8134         {
8135           if(ba[j]==start+i*4+4) done=j=0;
8136           if(ba[j]==start+i*4+8) done=j=0;
8137         }
8138       }
8139       else {
8140         if(stop_after_jal) done=1;
8141         // Stop on BREAK
8142         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8143       }
8144       // Don't recompile stuff that's already compiled
8145       if(check_addr(start+i*4+4)) done=1;
8146       // Don't get too close to the limit
8147       if(i>MAXBLOCK/2) done=1;
8148     }
8149     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8150     if(i>0&&itype[i-1]==HLECALL) done=1;
8151     assert(i<MAXBLOCK-1);
8152     if(start+i*4==pagelimit-4) done=1;
8153     assert(start+i*4<pagelimit);
8154     if (i==MAXBLOCK-1) done=1;
8155     // Stop if we're compiling junk
8156     if(itype[i]==NI&&opcode[i]==0x11) {
8157       done=stop_after_jal=1;
8158       printf("Disabled speculative precompilation\n");
8159     }
8160   }
8161   slen=i;
8162   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8163     if(start+i*4==pagelimit) {
8164       itype[i-1]=SPAN;
8165     }
8166   }
8167   assert(slen>0);
8168
8169   /* Pass 2 - Register dependencies and branch targets */
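  // unneeded_registers() scans the block backwards and fills unneeded_reg[] /
  // unneeded_reg_upper[]: per instruction, which guest registers (and which
  // upper halves) are dead, i.e. overwritten before their next use.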
8170
8171   unneeded_registers(0,slen-1,0);
8172   
8173   /* Pass 3 - Register allocation */
8174
8175   struct regstat current; // Current register allocations/status
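  // Rough meaning of the regstat fields used below: regmap[hr] is the guest
  // register currently held by host register hr (-1 = free), dirty marks
  // host regs that must be written back to the register file, is32 marks
  // guest regs whose upper half is just a sign extension of the lower half,
  // and u/uu are the dead-register masks (lower/upper halves) from pass 2.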
8176   current.is32=1;
8177   current.dirty=0;
8178   current.u=unneeded_reg[0];
8179   current.uu=unneeded_reg_upper[0];
8180   clear_all_regs(current.regmap);
8181   alloc_reg(&current,0,CCREG);
8182   dirty_reg(&current,CCREG);
8183   current.isconst=0;
8184   current.wasconst=0;
8185   int ds=0;
8186   int cc=0;
8187   int hr;
8188   
8189   provisional_32bit();
8190   
8191   if((u_int)addr&1) {
8192     // First instruction is delay slot
8193     cc=-1;
8194     bt[1]=1;
8195     ds=1;
8196     unneeded_reg[0]=1;
8197     unneeded_reg_upper[0]=1;
8198     current.regmap[HOST_BTREG]=BTREG;
8199   }
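  // Note: a start address with bit 0 set marks a block that is entered
  // directly at a branch delay slot; BTREG is expected to hold the branch
  // target computed by the block that jumped here.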
8200   
8201   for(i=0;i<slen;i++)
8202   {
8203     if(bt[i])
8204     {
8205       int hr;
8206       for(hr=0;hr<HOST_REGS;hr++)
8207       {
8208         // Is this really necessary?
8209         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8210       }
8211       current.isconst=0;
8212     }
8213     if(i>1)
8214     {
8215       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8216       {
8217         if(rs1[i-2]==0||rs2[i-2]==0)
8218         {
8219           if(rs1[i-2]) {
8220             current.is32|=1LL<<rs1[i-2];
8221             int hr=get_reg(current.regmap,rs1[i-2]|64);
8222             if(hr>=0) current.regmap[hr]=-1;
8223           }
8224           if(rs2[i-2]) {
8225             current.is32|=1LL<<rs2[i-2];
8226             int hr=get_reg(current.regmap,rs2[i-2]|64);
8227             if(hr>=0) current.regmap[hr]=-1;
8228           }
8229         }
8230       }
8231     }
8232     // If something jumps here with 64-bit values
8233     // then promote those registers to 64 bits
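    // Intersect is32 over every path into this target: branches already
    // allocated (branch_regs, scanning backwards) and branches not yet seen
    // (the provisional p32 estimate, scanning forwards).  A register stays
    // 32-bit only if it is 32-bit on all of them.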
8234     if(bt[i])
8235     {
8236       uint64_t temp_is32=current.is32;
8237       for(j=i-1;j>=0;j--)
8238       {
8239         if(ba[j]==start+i*4) 
8240           temp_is32&=branch_regs[j].is32;
8241       }
8242       for(j=i;j<slen;j++)
8243       {
8244         if(ba[j]==start+i*4) 
8245           //temp_is32=1;
8246           temp_is32&=p32[j];
8247       }
8248       if(temp_is32!=current.is32) {
8249         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8250         #ifdef DESTRUCTIVE_WRITEBACK
8251         for(hr=0;hr<HOST_REGS;hr++)
8252         {
8253           int r=current.regmap[hr];
8254           if(r>0&&r<64)
8255           {
8256             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8257               temp_is32|=1LL<<r;
8258               //printf("restore %d\n",r);
8259             }
8260           }
8261         }
8262         #endif
8263         current.is32=temp_is32;
8264       }
8265     }
8266 #ifdef FORCE32
8267     memset(p32, 0xff, sizeof(p32));
8268     current.is32=-1LL;
8269 #endif
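    // With FORCE32 (PCSX: the R3000A has no 64-bit GPRs) every register is
    // permanently treated as 32-bit, disabling the 32/64-bit tracking above.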
8270
8271     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8272     regs[i].wasconst=current.isconst;
8273     regs[i].was32=current.is32;
8274     regs[i].wasdirty=current.dirty;
8275     #ifdef DESTRUCTIVE_WRITEBACK
8276     // To change a dirty register from 32 to 64 bits, we must write
8277     // it out during the previous cycle (for branches, 2 cycles)
8278     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8279     {
8280       uint64_t temp_is32=current.is32;
8281       for(j=i-1;j>=0;j--)
8282       {
8283         if(ba[j]==start+i*4+4) 
8284           temp_is32&=branch_regs[j].is32;
8285       }
8286       for(j=i;j<slen;j++)
8287       {
8288         if(ba[j]==start+i*4+4) 
8289           //temp_is32=1;
8290           temp_is32&=p32[j];
8291       }
8292       if(temp_is32!=current.is32) {
8293         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8294         for(hr=0;hr<HOST_REGS;hr++)
8295         {
8296           int r=current.regmap[hr];
8297           if(r>0)
8298           {
8299             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8300               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8301               {
8302                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8303                 {
8304                   //printf("dump %d/r%d\n",hr,r);
8305                   current.regmap[hr]=-1;
8306                   if(get_reg(current.regmap,r|64)>=0) 
8307                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8308                 }
8309               }
8310             }
8311           }
8312         }
8313       }
8314     }
8315     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8316     {
8317       uint64_t temp_is32=current.is32;
8318       for(j=i-1;j>=0;j--)
8319       {
8320         if(ba[j]==start+i*4+8) 
8321           temp_is32&=branch_regs[j].is32;
8322       }
8323       for(j=i;j<slen;j++)
8324       {
8325         if(ba[j]==start+i*4+8) 
8326           //temp_is32=1;
8327           temp_is32&=p32[j];
8328       }
8329       if(temp_is32!=current.is32) {
8330         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8331         for(hr=0;hr<HOST_REGS;hr++)
8332         {
8333           int r=current.regmap[hr];
8334           if(r>0)
8335           {
8336             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8337               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8338               {
8339                 //printf("dump %d/r%d\n",hr,r);
8340                 current.regmap[hr]=-1;
8341                 if(get_reg(current.regmap,r|64)>=0) 
8342                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8343               }
8344             }
8345           }
8346         }
8347       }
8348     }
8349     #endif
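    // Seed current.u/.uu with the dead-register sets of the next
    // instruction, then clear the bits for this instruction's own sources
    // (and, for branches, the delay slot's sources) so they stay allocated.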
8350     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8351       if(i+1<slen) {
8352         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8353         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8354         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8355         current.u|=1;
8356         current.uu|=1;
8357       } else {
8358         current.u=1;
8359         current.uu=1;
8360       }
8361     } else {
8362       if(i+1<slen) {
8363         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8364         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8365         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8366         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8367         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8368         current.u|=1;
8369         current.uu|=1;
8370       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8371     }
8372     is_ds[i]=ds;
8373     if(ds) {
8374       ds=0; // Skip delay slot, already allocated as part of branch
8375       // ...but we need to alloc it in case something jumps here
8376       if(i+1<slen) {
8377         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8378         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8379       }else{
8380         current.u=branch_unneeded_reg[i-1];
8381         current.uu=branch_unneeded_reg_upper[i-1];
8382       }
8383       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8384       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8385       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8386       current.u|=1;
8387       current.uu|=1;
8388       struct regstat temp;
8389       memcpy(&temp,&current,sizeof(current));
8390       temp.wasdirty=temp.dirty;
8391       temp.was32=temp.is32;
8392       // TODO: Take into account unconditional branches, as below
8393       delayslot_alloc(&temp,i);
8394       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8395       regs[i].wasdirty=temp.wasdirty;
8396       regs[i].was32=temp.was32;
8397       regs[i].dirty=temp.dirty;
8398       regs[i].is32=temp.is32;
8399       regs[i].isconst=0;
8400       regs[i].wasconst=0;
8401       current.isconst=0;
8402       // Create entry (branch target) regmap
8403       for(hr=0;hr<HOST_REGS;hr++)
8404       {
8405         int r=temp.regmap[hr];
8406         if(r>=0) {
8407           if(r!=regmap_pre[i][hr]) {
8408             regs[i].regmap_entry[hr]=-1;
8409           }
8410           else
8411           {
8412             if(r<64){
8413               if((current.u>>r)&1) {
8414                 regs[i].regmap_entry[hr]=-1;
8415                 regs[i].regmap[hr]=-1;
8416                 //Don't clear regs in the delay slot as the branch might need them
8417                 //current.regmap[hr]=-1;
8418               }else
8419                 regs[i].regmap_entry[hr]=r;
8420             }
8421             else {
8422               if((current.uu>>(r&63))&1) {
8423                 regs[i].regmap_entry[hr]=-1;
8424                 regs[i].regmap[hr]=-1;
8425                 //Don't clear regs in the delay slot as the branch might need them
8426                 //current.regmap[hr]=-1;
8427               }else
8428                 regs[i].regmap_entry[hr]=r;
8429             }
8430           }
8431         } else {
8432           // First instruction expects CCREG to be allocated
8433           if(i==0&&hr==HOST_CCREG) 
8434             regs[i].regmap_entry[hr]=CCREG;
8435           else
8436             regs[i].regmap_entry[hr]=-1;
8437         }
8438       }
8439     }
8440     else { // Not delay slot
8441       switch(itype[i]) {
8442         case UJUMP:
8443           //current.isconst=0; // DEBUG
8444           //current.wasconst=0; // DEBUG
8445           //regs[i].wasconst=0; // DEBUG
8446           clear_const(&current,rt1[i]);
8447           alloc_cc(&current,i);
8448           dirty_reg(&current,CCREG);
8449           if (rt1[i]==31) {
8450             alloc_reg(&current,i,31);
8451             dirty_reg(&current,31);
8452             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8453             #ifdef REG_PREFETCH
8454             alloc_reg(&current,i,PTEMP);
8455             #endif
8456             //current.is32|=1LL<<rt1[i];
8457           }
8458           delayslot_alloc(&current,i+1);
8459           //current.isconst=0; // DEBUG
8460           ds=1;
8461           //printf("i=%d, isconst=%x\n",i,current.isconst);
8462           break;
8463         case RJUMP:
8464           //current.isconst=0;
8465           //current.wasconst=0;
8466           //regs[i].wasconst=0;
8467           clear_const(&current,rs1[i]);
8468           clear_const(&current,rt1[i]);
8469           alloc_cc(&current,i);
8470           dirty_reg(&current,CCREG);
8471           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8472             alloc_reg(&current,i,rs1[i]);
8473             if (rt1[i]==31) {
8474               alloc_reg(&current,i,31);
8475               dirty_reg(&current,31);
8476               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8477               #ifdef REG_PREFETCH
8478               alloc_reg(&current,i,PTEMP);
8479               #endif
8480             }
8481             #ifdef USE_MINI_HT
8482             if(rs1[i]==31) { // JALR
8483               alloc_reg(&current,i,RHASH);
8484               #ifndef HOST_IMM_ADDR32
8485               alloc_reg(&current,i,RHTBL);
8486               #endif
8487             }
8488             #endif
8489             delayslot_alloc(&current,i+1);
8490           } else {
8491             // The delay slot overwrites our source register,
8492             // allocate a temporary register to hold the old value.
8493             current.isconst=0;
8494             current.wasconst=0;
8495             regs[i].wasconst=0;
8496             delayslot_alloc(&current,i+1);
8497             current.isconst=0;
8498             alloc_reg(&current,i,RTEMP);
8499           }
8500           //current.isconst=0; // DEBUG
8501           ds=1;
8502           break;
8503         case CJUMP:
8504           //current.isconst=0;
8505           //current.wasconst=0;
8506           //regs[i].wasconst=0;
8507           clear_const(&current,rs1[i]);
8508           clear_const(&current,rs2[i]);
8509           if((opcode[i]&0x3E)==4) // BEQ/BNE
8510           {
8511             alloc_cc(&current,i);
8512             dirty_reg(&current,CCREG);
8513             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8514             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8515             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8516             {
8517               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8518               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8519             }
8520             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8521                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8522               // The delay slot overwrites one of our conditions.
8523               // Allocate the branch condition registers instead.
8524               // Note that such a sequence of instructions could
8525               // be considered a bug since the branch cannot be
8526               // re-executed if an exception occurs.
8527               current.isconst=0;
8528               current.wasconst=0;
8529               regs[i].wasconst=0;
8530               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8531               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8532               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8533               {
8534                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8535                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8536               }
8537             }
8538             else delayslot_alloc(&current,i+1);
8539           }
8540           else
8541           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8542           {
8543             alloc_cc(&current,i);
8544             dirty_reg(&current,CCREG);
8545             alloc_reg(&current,i,rs1[i]);
8546             if(!(current.is32>>rs1[i]&1))
8547             {
8548               alloc_reg64(&current,i,rs1[i]);
8549             }
8550             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8551               // The delay slot overwrites one of our conditions.
8552               // Allocate the branch condition registers instead.
8553               // Note that such a sequence of instructions could
8554               // be considered a bug since the branch cannot be
8555               // re-executed if an exception occurs.
8556               current.isconst=0;
8557               current.wasconst=0;
8558               regs[i].wasconst=0;
8559               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8560               if(!((current.is32>>rs1[i])&1))
8561               {
8562                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8563               }
8564             }
8565             else delayslot_alloc(&current,i+1);
8566           }
8567           else
8568           // Don't alloc the delay slot yet because we might not execute it
8569           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8570           {
8571             current.isconst=0;
8572             current.wasconst=0;
8573             regs[i].wasconst=0;
8574             alloc_cc(&current,i);
8575             dirty_reg(&current,CCREG);
8576             alloc_reg(&current,i,rs1[i]);
8577             alloc_reg(&current,i,rs2[i]);
8578             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8579             {
8580               alloc_reg64(&current,i,rs1[i]);
8581               alloc_reg64(&current,i,rs2[i]);
8582             }
8583           }
8584           else
8585           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8586           {
8587             current.isconst=0;
8588             current.wasconst=0;
8589             regs[i].wasconst=0;
8590             alloc_cc(&current,i);
8591             dirty_reg(&current,CCREG);
8592             alloc_reg(&current,i,rs1[i]);
8593             if(!(current.is32>>rs1[i]&1))
8594             {
8595               alloc_reg64(&current,i,rs1[i]);
8596             }
8597           }
8598           ds=1;
8599           //current.isconst=0;
8600           break;
8601         case SJUMP:
8602           //current.isconst=0;
8603           //current.wasconst=0;
8604           //regs[i].wasconst=0;
8605           clear_const(&current,rs1[i]);
8606           clear_const(&current,rt1[i]);
8607           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8608           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8609           {
8610             alloc_cc(&current,i);
8611             dirty_reg(&current,CCREG);
8612             alloc_reg(&current,i,rs1[i]);
8613             if(!(current.is32>>rs1[i]&1))
8614             {
8615               alloc_reg64(&current,i,rs1[i]);
8616             }
8617             if (rt1[i]==31) { // BLTZAL/BGEZAL
8618               alloc_reg(&current,i,31);
8619               dirty_reg(&current,31);
8620               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8621               //#ifdef REG_PREFETCH
8622               //alloc_reg(&current,i,PTEMP);
8623               //#endif
8624               //current.is32|=1LL<<rt1[i];
8625             }
8626             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8627               // The delay slot overwrites the branch condition.
8628               // Allocate the branch condition registers instead.
8629               // Note that such a sequence of instructions could
8630               // be considered a bug since the branch cannot be
8631               // re-executed if an exception occurs.
8632               current.isconst=0;
8633               current.wasconst=0;
8634               regs[i].wasconst=0;
8635               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8636               if(!((current.is32>>rs1[i])&1))
8637               {
8638                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8639               }
8640             }
8641             else delayslot_alloc(&current,i+1);
8642           }
8643           else
8644           // Don't alloc the delay slot yet because we might not execute it
8645           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8646           {
8647             current.isconst=0;
8648             current.wasconst=0;
8649             regs[i].wasconst=0;
8650             alloc_cc(&current,i);
8651             dirty_reg(&current,CCREG);
8652             alloc_reg(&current,i,rs1[i]);
8653             if(!(current.is32>>rs1[i]&1))
8654             {
8655               alloc_reg64(&current,i,rs1[i]);
8656             }
8657           }
8658           ds=1;
8659           //current.isconst=0;
8660           break;
8661         case FJUMP:
8662           current.isconst=0;
8663           current.wasconst=0;
8664           regs[i].wasconst=0;
8665           if(likely[i]==0) // BC1F/BC1T
8666           {
8667             // TODO: Theoretically we can run out of registers here on x86.
8668             // The delay slot can allocate up to six, and we need to check
8669             // CSREG before executing the delay slot.  Possibly we can drop
8670             // the cycle count and then reload it after checking that the
8671             // FPU is in a usable state, or don't do out-of-order execution.
8672             alloc_cc(&current,i);
8673             dirty_reg(&current,CCREG);
8674             alloc_reg(&current,i,FSREG);
8675             alloc_reg(&current,i,CSREG);
8676             if(itype[i+1]==FCOMP) {
8677               // The delay slot overwrites the branch condition.
8678               // Allocate the branch condition registers instead.
8679               // Note that such a sequence of instructions could
8680               // be considered a bug since the branch cannot be
8681               // re-executed if an exception occurs.
8682               alloc_cc(&current,i);
8683               dirty_reg(&current,CCREG);
8684               alloc_reg(&current,i,CSREG);
8685               alloc_reg(&current,i,FSREG);
8686             }
8687             else {
8688               delayslot_alloc(&current,i+1);
8689               alloc_reg(&current,i+1,CSREG);
8690             }
8691           }
8692           else
8693           // Don't alloc the delay slot yet because we might not execute it
8694           if(likely[i]) // BC1FL/BC1TL
8695           {
8696             alloc_cc(&current,i);
8697             dirty_reg(&current,CCREG);
8698             alloc_reg(&current,i,CSREG);
8699             alloc_reg(&current,i,FSREG);
8700           }
8701           ds=1;
8702           current.isconst=0;
8703           break;
8704         case IMM16:
8705           imm16_alloc(&current,i);
8706           break;
8707         case LOAD:
8708         case LOADLR:
8709           load_alloc(&current,i);
8710           break;
8711         case STORE:
8712         case STORELR:
8713           store_alloc(&current,i);
8714           break;
8715         case ALU:
8716           alu_alloc(&current,i);
8717           break;
8718         case SHIFT:
8719           shift_alloc(&current,i);
8720           break;
8721         case MULTDIV:
8722           multdiv_alloc(&current,i);
8723           break;
8724         case SHIFTIMM:
8725           shiftimm_alloc(&current,i);
8726           break;
8727         case MOV:
8728           mov_alloc(&current,i);
8729           break;
8730         case COP0:
8731           cop0_alloc(&current,i);
8732           break;
8733         case COP1:
8734           cop1_alloc(&current,i);
8735           break;
8736         case C1LS:
8737           c1ls_alloc(&current,i);
8738           break;
8739         case FCONV:
8740           fconv_alloc(&current,i);
8741           break;
8742         case FLOAT:
8743           float_alloc(&current,i);
8744           break;
8745         case FCOMP:
8746           fcomp_alloc(&current,i);
8747           break;
8748         case SYSCALL:
8749         case HLECALL:
8750           syscall_alloc(&current,i);
8751           break;
8752         case SPAN:
8753           pagespan_alloc(&current,i);
8754           break;
8755       }
8756       
8757       // Drop the upper half of registers that have become 32-bit
8758       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8759       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8760         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8761         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8762         current.uu|=1;
8763       } else {
8764         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8765         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8766         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8767         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8768         current.uu|=1;
8769       }
8770
8771       // Create entry (branch target) regmap
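      // regmap_entry[] records which guest register each host register must
      // already contain when this instruction is entered from a branch;
      // -1 means the slot carries no expected value.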
8772       for(hr=0;hr<HOST_REGS;hr++)
8773       {
8774         int r,or,er;
8775         r=current.regmap[hr];
8776         if(r>=0) {
8777           if(r!=regmap_pre[i][hr]) {
8778             // TODO: delay slot (?)
8779             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8780             if(or<0||(r&63)>=TEMPREG){
8781               regs[i].regmap_entry[hr]=-1;
8782             }
8783             else
8784             {
8785               // Just move it to a different register
8786               regs[i].regmap_entry[hr]=r;
8787               // If it was dirty before, it's still dirty
8788               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8789             }
8790           }
8791           else
8792           {
8793             // Unneeded
8794             if(r==0){
8795               regs[i].regmap_entry[hr]=0;
8796             }
8797             else
8798             if(r<64){
8799               if((current.u>>r)&1) {
8800                 regs[i].regmap_entry[hr]=-1;
8801                 //regs[i].regmap[hr]=-1;
8802                 current.regmap[hr]=-1;
8803               }else
8804                 regs[i].regmap_entry[hr]=r;
8805             }
8806             else {
8807               if((current.uu>>(r&63))&1) {
8808                 regs[i].regmap_entry[hr]=-1;
8809                 //regs[i].regmap[hr]=-1;
8810                 current.regmap[hr]=-1;
8811               }else
8812                 regs[i].regmap_entry[hr]=r;
8813             }
8814           }
8815         } else {
8816           // Branches expect CCREG to be allocated at the target
8817           if(regmap_pre[i][hr]==CCREG) 
8818             regs[i].regmap_entry[hr]=CCREG;
8819           else
8820             regs[i].regmap_entry[hr]=-1;
8821         }
8822       }
8823       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8824     }
8825     /* Branch post-alloc */
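    // branch_regs[i-1] captures the allocation state on the taken path of
    // the branch at i-1 (i.e. after its delay slot); internal branch
    // targets are matched against it in the later passes.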
8826     if(i>0)
8827     {
8828       current.was32=current.is32;
8829       current.wasdirty=current.dirty;
8830       switch(itype[i-1]) {
8831         case UJUMP:
8832           memcpy(&branch_regs[i-1],&current,sizeof(current));
8833           branch_regs[i-1].isconst=0;
8834           branch_regs[i-1].wasconst=0;
8835           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8836           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8837           alloc_cc(&branch_regs[i-1],i-1);
8838           dirty_reg(&branch_regs[i-1],CCREG);
8839           if(rt1[i-1]==31) { // JAL
8840             alloc_reg(&branch_regs[i-1],i-1,31);
8841             dirty_reg(&branch_regs[i-1],31);
8842             branch_regs[i-1].is32|=1LL<<31;
8843           }
8844           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8845           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8846           break;
8847         case RJUMP:
8848           memcpy(&branch_regs[i-1],&current,sizeof(current));
8849           branch_regs[i-1].isconst=0;
8850           branch_regs[i-1].wasconst=0;
8851           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8852           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8853           alloc_cc(&branch_regs[i-1],i-1);
8854           dirty_reg(&branch_regs[i-1],CCREG);
8855           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8856           if(rt1[i-1]==31) { // JALR
8857             alloc_reg(&branch_regs[i-1],i-1,31);
8858             dirty_reg(&branch_regs[i-1],31);
8859             branch_regs[i-1].is32|=1LL<<31;
8860           }
8861           #ifdef USE_MINI_HT
8862           if(rs1[i-1]==31) { // JALR
8863             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8864             #ifndef HOST_IMM_ADDR32
8865             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8866             #endif
8867           }
8868           #endif
8869           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8870           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8871           break;
8872         case CJUMP:
8873           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8874           {
8875             alloc_cc(&current,i-1);
8876             dirty_reg(&current,CCREG);
8877             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8878                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8879               // The delay slot overwrote one of our conditions
8880               // Delay slot goes after the test (in order)
8881               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8882               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8883               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8884               current.u|=1;
8885               current.uu|=1;
8886               delayslot_alloc(&current,i);
8887               current.isconst=0;
8888             }
8889             else
8890             {
8891               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8892               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8893               // Alloc the branch condition registers
8894               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8895               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8896               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8897               {
8898                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8899                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8900               }
8901             }
8902             memcpy(&branch_regs[i-1],&current,sizeof(current));
8903             branch_regs[i-1].isconst=0;
8904             branch_regs[i-1].wasconst=0;
8905             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8906             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8907           }
8908           else
8909           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8910           {
8911             alloc_cc(&current,i-1);
8912             dirty_reg(&current,CCREG);
8913             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8914               // The delay slot overwrote the branch condition
8915               // Delay slot goes after the test (in order)
8916               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8917               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8918               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8919               current.u|=1;
8920               current.uu|=1;
8921               delayslot_alloc(&current,i);
8922               current.isconst=0;
8923             }
8924             else
8925             {
8926               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8927               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8928               // Alloc the branch condition register
8929               alloc_reg(&current,i-1,rs1[i-1]);
8930               if(!(current.is32>>rs1[i-1]&1))
8931               {
8932                 alloc_reg64(&current,i-1,rs1[i-1]);
8933               }
8934             }
8935             memcpy(&branch_regs[i-1],&current,sizeof(current));
8936             branch_regs[i-1].isconst=0;
8937             branch_regs[i-1].wasconst=0;
8938             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8939             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8940           }
8941           else
8942           // Alloc the delay slot in case the branch is taken
8943           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8944           {
8945             memcpy(&branch_regs[i-1],&current,sizeof(current));
8946             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8947             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8948             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8949             alloc_cc(&branch_regs[i-1],i);
8950             dirty_reg(&branch_regs[i-1],CCREG);
8951             delayslot_alloc(&branch_regs[i-1],i);
8952             branch_regs[i-1].isconst=0;
8953             alloc_reg(&current,i,CCREG); // Not taken path
8954             dirty_reg(&current,CCREG);
8955             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8956           }
8957           else
8958           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8959           {
8960             memcpy(&branch_regs[i-1],&current,sizeof(current));
8961             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8962             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8963             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8964             alloc_cc(&branch_regs[i-1],i);
8965             dirty_reg(&branch_regs[i-1],CCREG);
8966             delayslot_alloc(&branch_regs[i-1],i);
8967             branch_regs[i-1].isconst=0;
8968             alloc_reg(&current,i,CCREG); // Not taken path
8969             dirty_reg(&current,CCREG);
8970             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8971           }
8972           break;
8973         case SJUMP:
8974           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8975           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8976           {
8977             alloc_cc(&current,i-1);
8978             dirty_reg(&current,CCREG);
8979             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8980               // The delay slot overwrote the branch condition
8981               // Delay slot goes after the test (in order)
8982               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8983               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8984               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8985               current.u|=1;
8986               current.uu|=1;
8987               delayslot_alloc(&current,i);
8988               current.isconst=0;
8989             }
8990             else
8991             {
8992               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8993               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8994               // Alloc the branch condition register
8995               alloc_reg(&current,i-1,rs1[i-1]);
8996               if(!(current.is32>>rs1[i-1]&1))
8997               {
8998                 alloc_reg64(&current,i-1,rs1[i-1]);
8999               }
9000             }
9001             memcpy(&branch_regs[i-1],&current,sizeof(current));
9002             branch_regs[i-1].isconst=0;
9003             branch_regs[i-1].wasconst=0;
9004             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9005             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9006           }
9007           else
9008           // Alloc the delay slot in case the branch is taken
9009           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9010           {
9011             memcpy(&branch_regs[i-1],&current,sizeof(current));
9012             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9013             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9014             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9015             alloc_cc(&branch_regs[i-1],i);
9016             dirty_reg(&branch_regs[i-1],CCREG);
9017             delayslot_alloc(&branch_regs[i-1],i);
9018             branch_regs[i-1].isconst=0;
9019             alloc_reg(&current,i,CCREG); // Not taken path
9020             dirty_reg(&current,CCREG);
9021             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9022           }
9023           // FIXME: BLTZAL/BGEZAL
9024           if(opcode2[i-1]&0x10) { // BxxZAL
9025             alloc_reg(&branch_regs[i-1],i-1,31);
9026             dirty_reg(&branch_regs[i-1],31);
9027             branch_regs[i-1].is32|=1LL<<31;
9028           }
9029           break;
9030         case FJUMP:
9031           if(likely[i-1]==0) // BC1F/BC1T
9032           {
9033             alloc_cc(&current,i-1);
9034             dirty_reg(&current,CCREG);
9035             if(itype[i]==FCOMP) {
9036               // The delay slot overwrote the branch condition
9037               // Delay slot goes after the test (in order)
9038               delayslot_alloc(&current,i);
9039               current.isconst=0;
9040             }
9041             else
9042             {
9043               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9044               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9045               // Alloc the branch condition register
9046               alloc_reg(&current,i-1,FSREG);
9047             }
9048             memcpy(&branch_regs[i-1],&current,sizeof(current));
9049             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9050           }
9051           else // BC1FL/BC1TL
9052           {
9053             // Alloc the delay slot in case the branch is taken
9054             memcpy(&branch_regs[i-1],&current,sizeof(current));
9055             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9056             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9057             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9058             alloc_cc(&branch_regs[i-1],i);
9059             dirty_reg(&branch_regs[i-1],CCREG);
9060             delayslot_alloc(&branch_regs[i-1],i);
9061             branch_regs[i-1].isconst=0;
9062             alloc_reg(&current,i,CCREG); // Not taken path
9063             dirty_reg(&current,CCREG);
9064             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9065           }
9066           break;
9067       }
9068
9069       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9070       {
9071         if(rt1[i-1]==31) // JAL/JALR
9072         {
9073           // Subroutine call will return here, don't alloc any registers
9074           current.is32=1;
9075           current.dirty=0;
9076           clear_all_regs(current.regmap);
9077           alloc_reg(&current,i,CCREG);
9078           dirty_reg(&current,CCREG);
9079         }
9080         else if(i+1<slen)
9081         {
9082           // Internal branch will jump here, match registers to caller
9083           current.is32=0x3FFFFFFFFLL;
9084           current.dirty=0;
9085           clear_all_regs(current.regmap);
9086           alloc_reg(&current,i,CCREG);
9087           dirty_reg(&current,CCREG);
9088           for(j=i-1;j>=0;j--)
9089           {
9090             if(ba[j]==start+i*4+4) {
9091               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9092               current.is32=branch_regs[j].is32;
9093               current.dirty=branch_regs[j].dirty;
9094               break;
9095             }
9096           }
9097           while(j>=0) {
9098             if(ba[j]==start+i*4+4) {
9099               for(hr=0;hr<HOST_REGS;hr++) {
9100                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9101                   current.regmap[hr]=-1;
9102                 }
9103                 current.is32&=branch_regs[j].is32;
9104                 current.dirty&=branch_regs[j].dirty;
9105               }
9106             }
9107             j--;
9108           }
9109         }
9110       }
9111     }
9112
9113     // Count cycles in between branches
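    // cc counts instructions since the last branch/SYSCALL, so the cycle
    // counter only has to be updated at those points; ccadj[i] stores the
    // running count for use when the block is assembled.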
9114     ccadj[i]=cc;
9115     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9116     {
9117       cc=0;
9118     }
9119     else
9120     {
9121       cc++;
9122     }
9123
9124     flush_dirty_uppers(&current);
9125     if(!is_ds[i]) {
9126       regs[i].is32=current.is32;
9127       regs[i].dirty=current.dirty;
9128       regs[i].isconst=current.isconst;
9129       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9130     }
9131     for(hr=0;hr<HOST_REGS;hr++) {
9132       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9133         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9134           regs[i].wasconst&=~(1<<hr);
9135         }
9136       }
9137     }
9138     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9139   }
9140   
9141   /* Pass 4 - Cull unused host registers */
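  // Walk the block backwards keeping nr, a bitmask of host registers whose
  // current contents are still needed; mappings not in nr when an
  // instruction is reached are dropped below.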
9142   
9143   uint64_t nr=0;
9144   
9145   for (i=slen-1;i>=0;i--)
9146   {
9147     int hr;
9148     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9149     {
9150       if(ba[i]<start || ba[i]>=(start+slen*4))
9151       {
9152         // Branch out of this block, don't need anything
9153         nr=0;
9154       }
9155       else
9156       {
9157         // Internal branch
9158         // Need whatever matches the target
9159         nr=0;
9160         int t=(ba[i]-start)>>2;
9161         for(hr=0;hr<HOST_REGS;hr++)
9162         {
9163           if(regs[i].regmap_entry[hr]>=0) {
9164             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9165           }
9166         }
9167       }
9168       // Conditional branch may need registers for following instructions
9169       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9170       {
9171         if(i<slen-2) {
9172           nr|=needed_reg[i+2];
9173           for(hr=0;hr<HOST_REGS;hr++)
9174           {
9175             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9176             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9177           }
9178         }
9179       }
9180       // Don't need stuff which is overwritten
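      // FIXME: hr is not set by a loop at this point (it is HOST_REGS after
      // the loops above, or uninitialized on the external-target path), so
      // these two checks index past the end of regmap[].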
9181       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9182       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9183       // Merge in delay slot
9184       for(hr=0;hr<HOST_REGS;hr++)
9185       {
9186         if(!likely[i]) {
9187           // These are overwritten unless the branch is "likely"
9188           // and the delay slot is nullified if not taken
9189           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9190           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9191         }
9192         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9193         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9194         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9195         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9196         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9197         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9198         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9199         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9200         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9201           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9202           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9203         }
9204         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9205           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9206           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9207         }
9208         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9209           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9210           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9211         }
9212       }
9213     }
9214     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
9215     {
9216       // SYSCALL instruction (software interrupt)
9217       nr=0;
9218     }
9219     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9220     {
9221       // ERET instruction (return from interrupt)
9222       nr=0;
9223     }
9224     else // Non-branch
9225     {
9226       if(i<slen-1) {
9227         for(hr=0;hr<HOST_REGS;hr++) {
9228           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9229           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9230           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9231           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9232         }
9233       }
9234     }
9235     for(hr=0;hr<HOST_REGS;hr++)
9236     {
9237       // Overwritten registers are not needed
9238       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9239       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9240       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9241       // Source registers are needed
9242       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9243       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9244       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9245       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9246       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9247       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9248       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9249       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9250       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9251         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9252         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9253       }
9254       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9255         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9256         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9257       }
9258       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9259         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9260         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9261       }
9262       // Don't write a register back to memory immediately after it is written,
9263       // as doing so may prevent dual-issue.
9264       // But do so if this is a branch target, otherwise we
9265       // might have to load the register before the branch.
9266       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9267         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9268            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9269           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9270           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9271         }
9272         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9273            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9274           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9275           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9276         }
9277       }
9278     }
9279     // Cycle count is needed at branches.  Assume it is needed at the target too.
9280     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9281       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9282       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9283     }
9284     // Save it
9285     needed_reg[i]=nr;
9286     
9287     // Deallocate unneeded registers
9288     for(hr=0;hr<HOST_REGS;hr++)
9289     {
9290       if(!((nr>>hr)&1)) {
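             // Host register hr is not needed past this point, so try to free it.
             // The guards below keep mappings still used by this instruction or by
             // a branch delay slot, as well as special registers such as CCREG and
             // PTEMP; anything else has its mapping cleared and the change is
             // propagated into regmap_pre of the following instructions.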
9291         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9292         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9293            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9294            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9295         {
9296           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9297           {
9298             if(likely[i]) {
9299               regs[i].regmap[hr]=-1;
9300               regs[i].isconst&=~(1<<hr);
9301               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9302             }
9303           }
9304         }
9305         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9306         {
9307           int d1=0,d2=0,map=0,temp=0;
9308           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9309           {
9310             d1=dep1[i+1];
9311             d2=dep2[i+1];
9312           }
9313           if(using_tlb) {
9314             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9315                itype[i+1]==STORE || itype[i+1]==STORELR ||
9316                itype[i+1]==C1LS )
9317             map=TLREG;
9318           } else
9319           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9320             map=INVCP;
9321           }
9322           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9323              itype[i+1]==C1LS )
9324             temp=FTEMP;
9325           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9326              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9327              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9328              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9329              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9330              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9331              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9332              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9333              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9334              regs[i].regmap[hr]!=map )
9335           {
9336             regs[i].regmap[hr]=-1;
9337             regs[i].isconst&=~(1<<hr);
9338             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9339                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9340                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9341                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9342                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9343                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9344                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9345                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9346                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9347                branch_regs[i].regmap[hr]!=map)
9348             {
9349               branch_regs[i].regmap[hr]=-1;
9350               branch_regs[i].regmap_entry[hr]=-1;
9351               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9352               {
9353                 if(!likely[i]&&i<slen-2) {
9354                   regmap_pre[i+2][hr]=-1;
9355                 }
9356               }
9357             }
9358           }
9359         }
9360         else
9361         {
9362           // Non-branch
9363           if(i>0)
9364           {
9365             int d1=0,d2=0,map=-1,temp=-1;
9366             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9367             {
9368               d1=dep1[i];
9369               d2=dep2[i];
9370             }
9371             if(using_tlb) {
9372               if(itype[i]==LOAD || itype[i]==LOADLR ||
9373                  itype[i]==STORE || itype[i]==STORELR ||
9374                  itype[i]==C1LS )
9375               map=TLREG;
9376             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9377               map=INVCP;
9378             }
9379             if(itype[i]==LOADLR || itype[i]==STORELR ||
9380                itype[i]==C1LS )
9381               temp=FTEMP;
9382             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9383                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9384                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9385                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9386                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9387                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9388             {
9389               if(i<slen-1&&!is_ds[i]) {
9390                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9391                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9392                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9393                 {
9394                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9395                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9396                 }
9397                 regmap_pre[i+1][hr]=-1;
9398                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9399               }
9400               regs[i].regmap[hr]=-1;
9401               regs[i].isconst&=~(1<<hr);
9402             }
9403           }
9404         }
9405       }
9406     }
9407   }
9408   
9409   /* Pass 5 - Pre-allocate registers */
9410   
9411   // If a register is allocated during a loop, try to allocate it for the
9412   // entire loop, if possible.  This avoids loading/storing registers
9413   // inside of the loop.
9414
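       // f_regmap records, per host register, the guest register we would like to
       // keep resident.  For each backward branch (i.e. a likely loop) the code
       // below walks from the branch target t up to the branch i and, where the
       // mapping can be extended over the whole range, fills in regmap/regmap_entry
       // so the value stays in a register across iterations instead of being
       // reloaded.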
9415   signed char f_regmap[HOST_REGS];
9416   clear_all_regs(f_regmap);
9417   for(i=0;i<slen-1;i++)
9418   {
9419     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9420     {
9421       if(ba[i]>=start && ba[i]<(start+i*4)) 
9422       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9423       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9424       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9425       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9426       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9427       {
9428         int t=(ba[i]-start)>>2;
9429         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9430         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9431         for(hr=0;hr<HOST_REGS;hr++)
9432         {
9433           if(regs[i].regmap[hr]>64) {
9434             if(!((regs[i].dirty>>hr)&1))
9435               f_regmap[hr]=regs[i].regmap[hr];
9436             else f_regmap[hr]=-1;
9437           }
9438           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9439           if(branch_regs[i].regmap[hr]>64) {
9440             if(!((branch_regs[i].dirty>>hr)&1))
9441               f_regmap[hr]=branch_regs[i].regmap[hr];
9442             else f_regmap[hr]=-1;
9443           }
9444           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9445           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9446           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9447           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9448           {
9449                     // Test both in case the delay slot is out-of-order;
9450                     // this could be done better...
9451             if(count_free_regs(branch_regs[i].regmap)<2
9452              ||count_free_regs(regs[i].regmap)<2) 
9453               f_regmap[hr]=branch_regs[i].regmap[hr];
9454           }
9455           // Avoid dirty->clean transition
9456           // #ifdef DESTRUCTIVE_WRITEBACK here?
9457           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9458           if(f_regmap[hr]>0) {
9459             if(regs[t].regmap_entry[hr]<0) {
9460               int r=f_regmap[hr];
9461               for(j=t;j<=i;j++)
9462               {
9463                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9464                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9465                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9466                 if(r>63) {
9467                   // NB This can exclude the case where the upper-half
9468                   // register is lower numbered than the lower-half
9469                   // register.  Not sure if it's worth fixing...
9470                   if(get_reg(regs[j].regmap,r&63)<0) break;
9471                   if(regs[j].is32&(1LL<<(r&63))) break;
9472                 }
9473                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9474                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9475                   int k;
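                       // The desired mapping is already live at instruction j.  If the
                       // branch itself has hr free, first try to extend the mapping
                       // backwards from the branch towards k; the gap from the target t
                       // up to j is then filled in below.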
9476                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9477                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9478                     if(r>63) {
9479                       if(get_reg(regs[i].regmap,r&63)<0) break;
9480                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9481                     }
9482                     k=i;
9483                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9484                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9485                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9486                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9487                       ||itype[k-1]==FCOMP) {
9488                         if(count_free_regs(regs[k-1].regmap)<2) {
9489                           //printf("no free regs for store %x\n",start+(k-1)*4);
9490                           break;
9491                         }
9492                       }
9493                       else
9494                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9495                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9496                         //printf("no-match due to different register\n");
9497                         break;
9498                       }
9499                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9500                         //printf("no-match due to branch\n");
9501                         break;
9502                       }
9503                       // call/ret fast path assumes no registers allocated
9504                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9505                         break;
9506                       }
9507                       if(r>63) {
9508                         // NB This can exclude the case where the upper-half
9509                         // register is lower numbered than the lower-half
9510                         // register.  Not sure if it's worth fixing...
9511                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9512                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9513                       }
9514                       k--;
9515                     }
9516                     if(i<slen-1) {
9517                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9518                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9519                         //printf("bad match after branch\n");
9520                         break;
9521                       }
9522                     }
9523                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9524                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9525                       while(k<i) {
9526                         regs[k].regmap_entry[hr]=f_regmap[hr];
9527                         regs[k].regmap[hr]=f_regmap[hr];
9528                         regmap_pre[k+1][hr]=f_regmap[hr];
9529                         regs[k].wasdirty&=~(1<<hr);
9530                         regs[k].dirty&=~(1<<hr);
9531                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9532                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9533                         regs[k].wasconst&=~(1<<hr);
9534                         regs[k].isconst&=~(1<<hr);
9535                         k++;
9536                       }
9537                     }
9538                     else {
9539                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9540                       break;
9541                     }
9542                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9543                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9544                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9545                       regs[i].regmap_entry[hr]=f_regmap[hr];
9546                       regs[i].regmap[hr]=f_regmap[hr];
9547                       regs[i].wasdirty&=~(1<<hr);
9548                       regs[i].dirty&=~(1<<hr);
9549                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9550                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9551                       regs[i].wasconst&=~(1<<hr);
9552                       regs[i].isconst&=~(1<<hr);
9553                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9554                       branch_regs[i].wasdirty&=~(1<<hr);
9555                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9556                       branch_regs[i].regmap[hr]=f_regmap[hr];
9557                       branch_regs[i].dirty&=~(1<<hr);
9558                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9559                       branch_regs[i].wasconst&=~(1<<hr);
9560                       branch_regs[i].isconst&=~(1<<hr);
9561                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9562                         regmap_pre[i+2][hr]=f_regmap[hr];
9563                         regs[i+2].wasdirty&=~(1<<hr);
9564                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9565                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9566                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9567                       }
9568                     }
9569                   }
9570                   for(k=t;k<j;k++) {
9571                     regs[k].regmap_entry[hr]=f_regmap[hr];
9572                     regs[k].regmap[hr]=f_regmap[hr];
9573                     regmap_pre[k+1][hr]=f_regmap[hr];
9574                     regs[k+1].wasdirty&=~(1<<hr);
9575                     regs[k].dirty&=~(1<<hr);
9576                     regs[k].wasconst&=~(1<<hr);
9577                     regs[k].isconst&=~(1<<hr);
9578                   }
9579                   if(regs[j].regmap[hr]==f_regmap[hr])
9580                     regs[j].regmap_entry[hr]=f_regmap[hr];
9581                   break;
9582                 }
9583                 if(j==i) break;
9584                 if(regs[j].regmap[hr]>=0)
9585                   break;
9586                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9587                   //printf("no-match due to different register\n");
9588                   break;
9589                 }
9590                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9591                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9592                   break;
9593                 }
9594                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9595                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9596                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9597                   if(count_free_regs(regs[j].regmap)<2) {
9598                     //printf("No free regs for store %x\n",start+j*4);
9599                     break;
9600                   }
9601                 }
9602                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9603                 if(f_regmap[hr]>=64) {
9604                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9605                     break;
9606                   }
9607                   else
9608                   {
9609                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9610                       break;
9611                     }
9612                   }
9613                 }
9614               }
9615             }
9616           }
9617         }
9618       }
9619     }else{
9620       int count=0;
9621       for(hr=0;hr<HOST_REGS;hr++)
9622       {
9623         if(hr!=EXCLUDE_REG) {
9624           if(regs[i].regmap[hr]>64) {
9625             if(!((regs[i].dirty>>hr)&1))
9626               f_regmap[hr]=regs[i].regmap[hr];
9627           }
9628           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9629           else if(regs[i].regmap[hr]<0) count++;
9630         }
9631       }
9632       // Try to restore cycle count at branch targets
9633       if(bt[i]) {
9634         for(j=i;j<slen-1;j++) {
9635           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9636           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9637           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9638           ||itype[j]==FCOMP||itype[j]==FCONV) {
9639             if(count_free_regs(regs[j].regmap)<2) {
9640               //printf("no free regs for store %x\n",start+j*4);
9641               break;
9642             }
9643           }
9644           else
9645           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9646         }
9647         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9648           int k=i;
9649           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9650           while(k<j) {
9651             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9652             regs[k].regmap[HOST_CCREG]=CCREG;
9653             regmap_pre[k+1][HOST_CCREG]=CCREG;
9654             regs[k+1].wasdirty|=1<<HOST_CCREG;
9655             regs[k].dirty|=1<<HOST_CCREG;
9656             regs[k].wasconst&=~(1<<HOST_CCREG);
9657             regs[k].isconst&=~(1<<HOST_CCREG);
9658             k++;
9659           }
9660           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9661         }
9662         // Work backwards from the branch target
9663         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9664         {
9665           //printf("Extend backwards\n");
9666           int k;
9667           k=i;
9668           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9669             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9670             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9671             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9672               if(count_free_regs(regs[k-1].regmap)<2) {
9673                 //printf("no free regs for store %x\n",start+(k-1)*4);
9674                 break;
9675               }
9676             }
9677             else
9678             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9679             k--;
9680           }
9681           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9682             //printf("Extend CC, %x ->\n",start+k*4);
9683             while(k<=i) {
9684               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9685               regs[k].regmap[HOST_CCREG]=CCREG;
9686               regmap_pre[k+1][HOST_CCREG]=CCREG;
9687               regs[k+1].wasdirty|=1<<HOST_CCREG;
9688               regs[k].dirty|=1<<HOST_CCREG;
9689               regs[k].wasconst&=~(1<<HOST_CCREG);
9690               regs[k].isconst&=~(1<<HOST_CCREG);
9691               k++;
9692             }
9693           }
9694           else {
9695             //printf("Fail Extend CC, %x ->\n",start+k*4);
9696           }
9697         }
9698       }
9699       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9700          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9701          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9702          itype[i]!=FCONV&&itype[i]!=FCOMP)
9703       {
9704         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9705       }
9706     }
9707   }
9708   
9709   // This allocates registers (if possible) one instruction prior
9710   // to use, which can avoid a load-use penalty on certain CPUs.
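       // This works by copying the host register chosen for instruction i+1 back
       // into regs[i].regmap (and into regmap_pre/regmap_entry of i+1), while
       // keeping it clean so the early allocation never causes an extra writeback.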
9711   for(i=0;i<slen-1;i++)
9712   {
9713     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9714     {
9715       if(!bt[i+1])
9716       {
9717         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9718         {
9719           if(rs1[i+1]) {
9720             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9721             {
9722               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9723               {
9724                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9725                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9726                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9727                 regs[i].isconst&=~(1<<hr);
9728                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9729                 constmap[i][hr]=constmap[i+1][hr];
9730                 regs[i+1].wasdirty&=~(1<<hr);
9731                 regs[i].dirty&=~(1<<hr);
9732               }
9733             }
9734           }
9735           if(rs2[i+1]) {
9736             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9737             {
9738               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9739               {
9740                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9741                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9742                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9743                 regs[i].isconst&=~(1<<hr);
9744                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9745                 constmap[i][hr]=constmap[i+1][hr];
9746                 regs[i+1].wasdirty&=~(1<<hr);
9747                 regs[i].dirty&=~(1<<hr);
9748               }
9749             }
9750           }
9751           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9752             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9753             {
9754               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9755               {
9756                 regs[i].regmap[hr]=rs1[i+1];
9757                 regmap_pre[i+1][hr]=rs1[i+1];
9758                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9759                 regs[i].isconst&=~(1<<hr);
9760                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9761                 constmap[i][hr]=constmap[i+1][hr];
9762                 regs[i+1].wasdirty&=~(1<<hr);
9763                 regs[i].dirty&=~(1<<hr);
9764               }
9765             }
9766           }
9767           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9768             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9769             {
9770               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9771               {
9772                 regs[i].regmap[hr]=rs1[i+1];
9773                 regmap_pre[i+1][hr]=rs1[i+1];
9774                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9775                 regs[i].isconst&=~(1<<hr);
9776                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9777                 constmap[i][hr]=constmap[i+1][hr];
9778                 regs[i+1].wasdirty&=~(1<<hr);
9779                 regs[i].dirty&=~(1<<hr);
9780               }
9781             }
9782           }
9783           #ifndef HOST_IMM_ADDR32
9784           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9785             hr=get_reg(regs[i+1].regmap,TLREG);
9786             if(hr>=0) {
9787               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9788               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9789                 int nr;
9790                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9791                 {
9792                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9793                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9794                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9795                   regs[i].isconst&=~(1<<hr);
9796                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9797                   constmap[i][hr]=constmap[i+1][hr];
9798                   regs[i+1].wasdirty&=~(1<<hr);
9799                   regs[i].dirty&=~(1<<hr);
9800                 }
9801                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9802                 {
9803                   // move it to another register
9804                   regs[i+1].regmap[hr]=-1;
9805                   regmap_pre[i+2][hr]=-1;
9806                   regs[i+1].regmap[nr]=TLREG;
9807                   regmap_pre[i+2][nr]=TLREG;
9808                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9809                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9810                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9811                   regs[i].isconst&=~(1<<nr);
9812                   regs[i+1].isconst&=~(1<<nr);
9813                   regs[i].dirty&=~(1<<nr);
9814                   regs[i+1].wasdirty&=~(1<<nr);
9815                   regs[i+1].dirty&=~(1<<nr);
9816                   regs[i+2].wasdirty&=~(1<<nr);
9817                 }
9818               }
9819             }
9820           }
9821           #endif
9822           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9823             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9824               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9825               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9826               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9827               assert(hr>=0);
9828               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9829               {
9830                 regs[i].regmap[hr]=rs1[i+1];
9831                 regmap_pre[i+1][hr]=rs1[i+1];
9832                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9833                 regs[i].isconst&=~(1<<hr);
9834                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9835                 constmap[i][hr]=constmap[i+1][hr];
9836                 regs[i+1].wasdirty&=~(1<<hr);
9837                 regs[i].dirty&=~(1<<hr);
9838               }
9839             }
9840           }
9841           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9842             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9843               int nr;
9844               hr=get_reg(regs[i+1].regmap,FTEMP);
9845               assert(hr>=0);
9846               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9847               {
9848                 regs[i].regmap[hr]=rs1[i+1];
9849                 regmap_pre[i+1][hr]=rs1[i+1];
9850                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9851                 regs[i].isconst&=~(1<<hr);
9852                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9853                 constmap[i][hr]=constmap[i+1][hr];
9854                 regs[i+1].wasdirty&=~(1<<hr);
9855                 regs[i].dirty&=~(1<<hr);
9856               }
9857               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9858               {
9859                 // move it to another register
9860                 regs[i+1].regmap[hr]=-1;
9861                 regmap_pre[i+2][hr]=-1;
9862                 regs[i+1].regmap[nr]=FTEMP;
9863                 regmap_pre[i+2][nr]=FTEMP;
9864                 regs[i].regmap[nr]=rs1[i+1];
9865                 regmap_pre[i+1][nr]=rs1[i+1];
9866                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9867                 regs[i].isconst&=~(1<<nr);
9868                 regs[i+1].isconst&=~(1<<nr);
9869                 regs[i].dirty&=~(1<<nr);
9870                 regs[i+1].wasdirty&=~(1<<nr);
9871                 regs[i+1].dirty&=~(1<<nr);
9872                 regs[i+2].wasdirty&=~(1<<nr);
9873               }
9874             }
9875           }
9876           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9877             if(itype[i+1]==LOAD) 
9878               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9879             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9880               hr=get_reg(regs[i+1].regmap,FTEMP);
9881             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9882               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9883               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9884             }
9885             if(hr>=0&&regs[i].regmap[hr]<0) {
9886               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9887               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9888                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9889                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9890                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9891                 regs[i].isconst&=~(1<<hr);
9892                 regs[i+1].wasdirty&=~(1<<hr);
9893                 regs[i].dirty&=~(1<<hr);
9894               }
9895             }
9896           }
9897         }
9898       }
9899     }
9900   }
9901   
9902   /* Pass 6 - Optimize clean/dirty state */
9903   clean_registers(0,slen-1,1);
9904   
9905   /* Pass 7 - Identify 32-bit registers */
9906   
9907   provisional_r32();
9908
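       // r32 is a bitmask over guest registers, built up while scanning the block
       // backwards, much like a liveness analysis: a bit is set when a later
       // instruction reads the register while it is known to be 32-bit, and cleared
       // when the register is overwritten.  The per-instruction result is saved in
       // requires_32bit[] and later used when registering jump_in entry points
       // (the restricted ll_add_32 case).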
9909   u_int r32=0;
9910   
9911   for (i=slen-1;i>=0;i--)
9912   {
9913     int hr;
9914     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9915     {
9916       if(ba[i]<start || ba[i]>=(start+slen*4))
9917       {
9918         // Branch out of this block, don't need anything
9919         r32=0;
9920       }
9921       else
9922       {
9923         // Internal branch
9924         // Need whatever matches the target
9925         // (and doesn't get overwritten by the delay slot instruction)
9926         r32=0;
9927         int t=(ba[i]-start)>>2;
9928         if(ba[i]>start+i*4) {
9929           // Forward branch
9930           if(!(requires_32bit[t]&~regs[i].was32))
9931             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9932         }else{
9933           // Backward branch
9934           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9935           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9936           if(!(pr32[t]&~regs[i].was32))
9937             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9938         }
9939       }
9940       // Conditional branch may need registers for following instructions
9941       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9942       {
9943         if(i<slen-2) {
9944           r32|=requires_32bit[i+2];
9945           r32&=regs[i].was32;
9946           // Mark this address as a branch target since it may be called
9947           // upon return from interrupt
9948           bt[i+2]=1;
9949         }
9950       }
9951       // Merge in delay slot
9952       if(!likely[i]) {
9953         // These are overwritten unless the branch is "likely"
9954         // and the delay slot is nullified if not taken
9955         r32&=~(1LL<<rt1[i+1]);
9956         r32&=~(1LL<<rt2[i+1]);
9957       }
9958       // Assume these are needed (delay slot)
9959       if(us1[i+1]>0)
9960       {
9961         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9962       }
9963       if(us2[i+1]>0)
9964       {
9965         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9966       }
9967       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9968       {
9969         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9970       }
9971       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9972       {
9973         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9974       }
9975     }
9976     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
9977     {
9978       // SYSCALL instruction (software interrupt)
9979       r32=0;
9980     }
9981     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9982     {
9983       // ERET instruction (return from interrupt)
9984       r32=0;
9985     }
9986     // Merge in this instruction's own 32-bit requirements
9987     r32&=~(1LL<<rt1[i]);
9988     r32&=~(1LL<<rt2[i]);
9989     if(us1[i]>0)
9990     {
9991       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9992     }
9993     if(us2[i]>0)
9994     {
9995       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9996     }
9997     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9998     {
9999       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10000     }
10001     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10002     {
10003       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10004     }
10005     requires_32bit[i]=r32;
10006     
10007     // Dirty registers which are 32-bit require 32-bit input,
10008     // as they will be written back as 32-bit values
10009     for(hr=0;hr<HOST_REGS;hr++)
10010     {
10011       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10012         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10013           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10014           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10015         }
10016       }
10017     }
10018     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10019   }
10020
10021   if(itype[slen-1]==SPAN) {
10022     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10023   }
10024   
10025   /* Debug/disassembly */
10026   if((void*)assem_debug==(void*)printf) 
10027   for(i=0;i<slen;i++)
10028   {
10029     printf("U:");
10030     int r;
10031     for(r=1;r<=CCREG;r++) {
10032       if((unneeded_reg[i]>>r)&1) {
10033         if(r==HIREG) printf(" HI");
10034         else if(r==LOREG) printf(" LO");
10035         else printf(" r%d",r);
10036       }
10037     }
10038 #ifndef FORCE32
10039     printf(" UU:");
10040     for(r=1;r<=CCREG;r++) {
10041       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10042         if(r==HIREG) printf(" HI");
10043         else if(r==LOREG) printf(" LO");
10044         else printf(" r%d",r);
10045       }
10046     }
10047     printf(" 32:");
10048     for(r=0;r<=CCREG;r++) {
10049       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10050       if((regs[i].was32>>r)&1) {
10051         if(r==CCREG) printf(" CC");
10052         else if(r==HIREG) printf(" HI");
10053         else if(r==LOREG) printf(" LO");
10054         else printf(" r%d",r);
10055       }
10056     }
10057 #endif
10058     printf("\n");
10059     #if defined(__i386__) || defined(__x86_64__)
10060     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10061     #endif
10062     #ifdef __arm__
10063     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10064     #endif
10065     printf("needs: ");
10066     if(needed_reg[i]&1) printf("eax ");
10067     if((needed_reg[i]>>1)&1) printf("ecx ");
10068     if((needed_reg[i]>>2)&1) printf("edx ");
10069     if((needed_reg[i]>>3)&1) printf("ebx ");
10070     if((needed_reg[i]>>5)&1) printf("ebp ");
10071     if((needed_reg[i]>>6)&1) printf("esi ");
10072     if((needed_reg[i]>>7)&1) printf("edi ");
10073     printf("r:");
10074     for(r=0;r<=CCREG;r++) {
10075       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10076       if((requires_32bit[i]>>r)&1) {
10077         if(r==CCREG) printf(" CC");
10078         else if(r==HIREG) printf(" HI");
10079         else if(r==LOREG) printf(" LO");
10080         else printf(" r%d",r);
10081       }
10082     }
10083     printf("\n");
10084     /*printf("pr:");
10085     for(r=0;r<=CCREG;r++) {
10086       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10087       if((pr32[i]>>r)&1) {
10088         if(r==CCREG) printf(" CC");
10089         else if(r==HIREG) printf(" HI");
10090         else if(r==LOREG) printf(" LO");
10091         else printf(" r%d",r);
10092       }
10093     }
10094     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10095     printf("\n");*/
10096     #if defined(__i386__) || defined(__x86_64__)
10097     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10098     printf("dirty: ");
10099     if(regs[i].wasdirty&1) printf("eax ");
10100     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10101     if((regs[i].wasdirty>>2)&1) printf("edx ");
10102     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10103     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10104     if((regs[i].wasdirty>>6)&1) printf("esi ");
10105     if((regs[i].wasdirty>>7)&1) printf("edi ");
10106     #endif
10107     #ifdef __arm__
10108     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10109     printf("dirty: ");
10110     if(regs[i].wasdirty&1) printf("r0 ");
10111     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10112     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10113     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10114     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10115     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10116     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10117     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10118     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10119     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10120     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10121     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10122     #endif
10123     printf("\n");
10124     disassemble_inst(i);
10125     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10126     #if defined(__i386__) || defined(__x86_64__)
10127     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10128     if(regs[i].dirty&1) printf("eax ");
10129     if((regs[i].dirty>>1)&1) printf("ecx ");
10130     if((regs[i].dirty>>2)&1) printf("edx ");
10131     if((regs[i].dirty>>3)&1) printf("ebx ");
10132     if((regs[i].dirty>>5)&1) printf("ebp ");
10133     if((regs[i].dirty>>6)&1) printf("esi ");
10134     if((regs[i].dirty>>7)&1) printf("edi ");
10135     #endif
10136     #ifdef __arm__
10137     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10138     if(regs[i].dirty&1) printf("r0 ");
10139     if((regs[i].dirty>>1)&1) printf("r1 ");
10140     if((regs[i].dirty>>2)&1) printf("r2 ");
10141     if((regs[i].dirty>>3)&1) printf("r3 ");
10142     if((regs[i].dirty>>4)&1) printf("r4 ");
10143     if((regs[i].dirty>>5)&1) printf("r5 ");
10144     if((regs[i].dirty>>6)&1) printf("r6 ");
10145     if((regs[i].dirty>>7)&1) printf("r7 ");
10146     if((regs[i].dirty>>8)&1) printf("r8 ");
10147     if((regs[i].dirty>>9)&1) printf("r9 ");
10148     if((regs[i].dirty>>10)&1) printf("r10 ");
10149     if((regs[i].dirty>>12)&1) printf("r12 ");
10150     #endif
10151     printf("\n");
10152     if(regs[i].isconst) {
10153       printf("constants: ");
10154       #if defined(__i386__) || defined(__x86_64__)
10155       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10156       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10157       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10158       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10159       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10160       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10161       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10162       #endif
10163       #ifdef __arm__
10164       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10165       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10166       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10167       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10168       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10169       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10170       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10171       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10172       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10173       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10174       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10175       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10176       #endif
10177       printf("\n");
10178     }
10179 #ifndef FORCE32
10180     printf(" 32:");
10181     for(r=0;r<=CCREG;r++) {
10182       if((regs[i].is32>>r)&1) {
10183         if(r==CCREG) printf(" CC");
10184         else if(r==HIREG) printf(" HI");
10185         else if(r==LOREG) printf(" LO");
10186         else printf(" r%d",r);
10187       }
10188     }
10189     printf("\n");
10190 #endif
10191     /*printf(" p32:");
10192     for(r=0;r<=CCREG;r++) {
10193       if((p32[i]>>r)&1) {
10194         if(r==CCREG) printf(" CC");
10195         else if(r==HIREG) printf(" HI");
10196         else if(r==LOREG) printf(" LO");
10197         else printf(" r%d",r);
10198       }
10199     }
10200     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10201     else printf("\n");*/
10202     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10203       #if defined(__i386__) || defined(__x86_64__)
10204       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10205       if(branch_regs[i].dirty&1) printf("eax ");
10206       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10207       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10208       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10209       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10210       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10211       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10212       #endif
10213       #ifdef __arm__
10214       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10215       if(branch_regs[i].dirty&1) printf("r0 ");
10216       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10217       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10218       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10219       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10220       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10221       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10222       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10223       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10224       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10225       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10226       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10227       #endif
10228 #ifndef FORCE32
10229       printf(" 32:");
10230       for(r=0;r<=CCREG;r++) {
10231         if((branch_regs[i].is32>>r)&1) {
10232           if(r==CCREG) printf(" CC");
10233           else if(r==HIREG) printf(" HI");
10234           else if(r==LOREG) printf(" LO");
10235           else printf(" r%d",r);
10236         }
10237       }
10238       printf("\n");
10239 #endif
10240     }
10241   }
10242
10243   /* Pass 8 - Assembly */
10244   linkcount=0;stubcount=0;
10245   ds=0;is_delayslot=0;
10246   cop1_usable=0;
10247   uint64_t is32_pre=0;
10248   u_int dirty_pre=0;
10249   u_int beginning=(u_int)out;
10250   if((u_int)addr&1) {
10251     ds=1;
10252     pagespan_ds();
10253   }
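        // Main assembly loop.  For each instruction that is not a delay slot:
        // write back or invalidate stale host registers (the wb_* calls), record
        // the native entry point in instr_addr[], load the mapped guest registers,
        // generate addresses and constants, then dispatch to the per-type
        // assembler below.  Branch assemblers set ds=1 so the following delay slot
        // is skipped by this loop.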
10254   for(i=0;i<slen;i++)
10255   {
10256     //if(ds) printf("ds: ");
10257     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10258     if(ds) {
10259       ds=0; // Skip delay slot
10260       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10261       instr_addr[i]=0;
10262     } else {
10263       #ifndef DESTRUCTIVE_WRITEBACK
10264       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10265       {
10266         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10267               unneeded_reg[i],unneeded_reg_upper[i]);
10268         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10269               unneeded_reg[i],unneeded_reg_upper[i]);
10270       }
10271       is32_pre=regs[i].is32;
10272       dirty_pre=regs[i].dirty;
10273       #endif
10274       // write back
10275       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10276       {
10277         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10278                       unneeded_reg[i],unneeded_reg_upper[i]);
10279         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10280       }
10281       // branch target entry point
10282       instr_addr[i]=(u_int)out;
10283       assem_debug("<->\n");
10284       // load regs
10285       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10286         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10287       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10288       address_generation(i,&regs[i],regs[i].regmap_entry);
10289       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10290       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10291       {
10292         // Load the delay slot registers if necessary
10293         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10294           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10295         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10296           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10297         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10298           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10299       }
10300       else if(i+1<slen)
10301       {
10302         // Preload registers for following instruction
10303         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10304           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10305             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10306         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10307           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10308             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10309       }
10310       // TODO: if(is_ooo(i)) address_generation(i+1);
10311       if(itype[i]==CJUMP||itype[i]==FJUMP)
10312         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10313       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10314         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10315       if(bt[i]) cop1_usable=0;
10316       // assemble
10317       switch(itype[i]) {
10318         case ALU:
10319           alu_assemble(i,&regs[i]);break;
10320         case IMM16:
10321           imm16_assemble(i,&regs[i]);break;
10322         case SHIFT:
10323           shift_assemble(i,&regs[i]);break;
10324         case SHIFTIMM:
10325           shiftimm_assemble(i,&regs[i]);break;
10326         case LOAD:
10327           load_assemble(i,&regs[i]);break;
10328         case LOADLR:
10329           loadlr_assemble(i,&regs[i]);break;
10330         case STORE:
10331           store_assemble(i,&regs[i]);break;
10332         case STORELR:
10333           storelr_assemble(i,&regs[i]);break;
10334         case COP0:
10335           cop0_assemble(i,&regs[i]);break;
10336         case COP1:
10337           cop1_assemble(i,&regs[i]);break;
10338         case C1LS:
10339           c1ls_assemble(i,&regs[i]);break;
10340         case FCONV:
10341           fconv_assemble(i,&regs[i]);break;
10342         case FLOAT:
10343           float_assemble(i,&regs[i]);break;
10344         case FCOMP:
10345           fcomp_assemble(i,&regs[i]);break;
10346         case MULTDIV:
10347           multdiv_assemble(i,&regs[i]);break;
10348         case MOV:
10349           mov_assemble(i,&regs[i]);break;
10350         case SYSCALL:
10351           syscall_assemble(i,&regs[i]);break;
10352         case HLECALL:
10353           hlecall_assemble(i,&regs[i]);break;
10354         case UJUMP:
10355           ujump_assemble(i,&regs[i]);ds=1;break;
10356         case RJUMP:
10357           rjump_assemble(i,&regs[i]);ds=1;break;
10358         case CJUMP:
10359           cjump_assemble(i,&regs[i]);ds=1;break;
10360         case SJUMP:
10361           sjump_assemble(i,&regs[i]);ds=1;break;
10362         case FJUMP:
10363           fjump_assemble(i,&regs[i]);ds=1;break;
10364         case SPAN:
10365           pagespan_assemble(i,&regs[i]);break;
10366       }
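             // An unconditional control transfer lets the literal pool be emitted
             // in place; otherwise literal_pool_jumpover() is used so execution can
             // branch around any pool that has to be flushed here.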
10367       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10368         literal_pool(1024);
10369       else
10370         literal_pool_jumpover(256);
10371     }
10372   }
10373   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10374   // If the block did not end with an unconditional branch,
10375   // add a jump to the next instruction.
10376   if(i>1) {
10377     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10378       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10379       assert(i==slen);
10380       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10381         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10382         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10383           emit_loadreg(CCREG,HOST_CCREG);
10384         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10385       }
10386       else if(!likely[i-2])
10387       {
10388         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10389         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10390       }
10391       else
10392       {
10393         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10394         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10395       }
10396       add_to_linker((int)out,start+i*4,0);
10397       emit_jmp(0);
10398     }
10399   }
10400   else
10401   {
10402     assert(i>0);
10403     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10404     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10405     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10406       emit_loadreg(CCREG,HOST_CCREG);
10407     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10408     add_to_linker((int)out,start+i*4,0);
10409     emit_jmp(0);
10410   }
10411
10412   // TODO: delay slot stubs?
10413   // Stubs
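        // Each stub emitted here is an out-of-line slow path recorded in stubs[][]
        // during pass 8: memory access handlers for loads/stores, cycle-count/
        // interrupt checks (CC_STUB), invalid-code checks (INVCODE_STUB), the
        // COP1-unusable exception path and unaligned store handling.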
10414   for(i=0;i<stubcount;i++)
10415   {
10416     switch(stubs[i][0])
10417     {
10418       case LOADB_STUB:
10419       case LOADH_STUB:
10420       case LOADW_STUB:
10421       case LOADD_STUB:
10422       case LOADBU_STUB:
10423       case LOADHU_STUB:
10424         do_readstub(i);break;
10425       case STOREB_STUB:
10426       case STOREH_STUB:
10427       case STOREW_STUB:
10428       case STORED_STUB:
10429         do_writestub(i);break;
10430       case CC_STUB:
10431         do_ccstub(i);break;
10432       case INVCODE_STUB:
10433         do_invstub(i);break;
10434       case FP_STUB:
10435         do_cop1stub(i);break;
10436       case STORELR_STUB:
10437         do_unalignedwritestub(i);break;
10438     }
10439   }
10440
10441   /* Pass 9 - Linker */
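        // Each link_addr entry records the location of an emitted jump ([0]), its
        // guest target address ([1]) and a flag that is non-zero for branches
        // internal to this block ([2]).  External targets get an emit_extjump
        // trampoline; if check_addr() finds the target block already compiled, the
        // jump is pointed straight at it and the trampoline is registered with
        // add_link().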
  for(i=0;i<linkcount;i++)
  {
    assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
    literal_pool(64);
    if(!link_addr[i][2])
    {
      void *stub=out;
      void *addr=check_addr(link_addr[i][1]);
      emit_extjump(link_addr[i][0],link_addr[i][1]);
      if(addr) {
        set_jump_target(link_addr[i][0],(int)addr);
        add_link(link_addr[i][1],stub);
      }
      else set_jump_target(link_addr[i][0],(int)stub);
    }
    else
    {
      // Internal branch
      int target=(link_addr[i][1]-start)>>2;
      assert(target>=0&&target<slen);
      assert(instr_addr[target]);
      //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
      //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
      //#else
      set_jump_target(link_addr[i][0],instr_addr[target]);
      //#endif
    }
  }
  // External Branch Targets (jump_in)
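  // Wrap the shadow-copy pointer if this block's copy of the source code
  // would run past the end of the shadow buffer.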
  if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
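  // For the block entry and every branch target inside it, emit a dirty-check
  // stub and register the entry point in jump_in (and jump_dirty, so the
  // block can be revalidated after invalidation).  Hash-table slots are only
  // refreshed when the address is already present; new slots are filled
  // lazily by check_addr().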
  for(i=0;i<slen;i++)
  {
    if(bt[i]||i==0)
    {
      if(instr_addr[i]) // TODO - delay slots (=null)
      {
        u_int vaddr=start+i*4;
        u_int page=get_page(vaddr);
        u_int vpage=get_vpage(vaddr);
        literal_pool(256);
        //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
        if(!requires_32bit[i])
        {
          assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
          assem_debug("jump_in: %x\n",start+i*4);
          ll_add(jump_dirty+vpage,vaddr,(void *)out);
          int entry_point=do_dirty_stub(i);
          ll_add(jump_in+page,vaddr,(void *)entry_point);
          // If there was an existing entry in the hash table,
          // replace it with the new address.
          // Don't add new entries.  We'll insert the
          // ones that actually get used in check_addr().
          int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
          if(ht_bin[0]==vaddr) {
            ht_bin[1]=entry_point;
          }
          if(ht_bin[2]==vaddr) {
            ht_bin[3]=entry_point;
          }
        }
        else
        {
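          // Some registers must hold valid 32-bit values at this entry
          // point, so a restricted entry is added.  r folds the 64-bit
          // requirement mask into 32 bits, also setting bit 0 if any of the
          // upper 32 bits were set.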
          u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
          assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
          assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
          //int entry_point=(int)out;
          ////assem_debug("entry_point: %x\n",entry_point);
          //load_regs_entry(i);
          //if(entry_point==(int)out)
          //  entry_point=instr_addr[i];
          //else
          //  emit_jmp(instr_addr[i]);
          //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
          ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
          int entry_point=do_dirty_stub(i);
          ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
        }
      }
    }
  }
  // Write out the literal pool if necessary
  literal_pool(0);
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Align code
  if(((u_int)out)&7) emit_addnop(13);
  #endif
  assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
  //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
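  // Keep a copy of the original MIPS code; the dirty-check stubs compare
  // against it later to verify that the source has not been overwritten.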
  memcpy(copy,source,slen*4);
  copy+=slen*4;

  #ifdef __arm__
  __clear_cache((void *)beginning,out);
  #endif

  // If we're within 256K of the end of the buffer,
  // start over from the beginning. (Is 256K enough?)
  if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;

  // Trap writes to any of the pages we compiled
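  // invalid_code is cleared so the block counts as valid; with the TLB
  // enabled, memory_map entries are additionally tagged (|=0x40000000) so
  // that stores through them can be detected and the affected blocks
  // invalidated.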
  for(i=start>>12;i<=(start+slen*4)>>12;i++) {
    invalid_code[i]=0;
#ifndef DISABLE_TLB
    memory_map[i]|=0x40000000;
    if((signed int)start>=(signed int)0xC0000000) {
      assert(using_tlb);
      j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
      invalid_code[j]=0;
      memory_map[j]|=0x40000000;
      //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
    }
#endif
  }

  /* Pass 10 - Free memory by expiring oldest blocks */

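  // expirep is a 16-bit cursor over the translation cache: its top three
  // bits select one of 8 equal regions, bits 11-12 select one of four
  // cleanup phases, and the low 11 bits step through the 2048 per-page lists
  // and hash rows.  The cursor is advanced until it sits roughly a quarter
  // of the buffer ahead of the current output pointer, so the region about
  // to be reused is emptied first.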
  int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
  while(expirep!=end)
  {
    int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
    int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
    inv_debug("EXP: Phase %d\n",expirep);
    switch((expirep>>11)&3)
    {
      case 0:
        // Clear jump_in and jump_dirty
        ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
        break;
      case 1:
        // Clear pointers
        ll_kill_pointers(jump_out[expirep&2047],base,shift);
        ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
        break;
      case 2:
        // Clear hash table
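        // Each bucket holds two (vaddr, code pointer) pairs in ht_bin[0..3];
        // drop a pair whose code lies in the expiring region, promoting the
        // second pair into the first slot when the first pair is removed.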
        for(i=0;i<32;i++) {
          int *ht_bin=hash_table[((expirep&2047)<<5)+i];
          if((ht_bin[3]>>shift)==(base>>shift) ||
             ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
            ht_bin[2]=ht_bin[3]=-1;
          }
          if((ht_bin[1]>>shift)==(base>>shift) ||
             ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
            ht_bin[0]=ht_bin[2];
            ht_bin[1]=ht_bin[3];
            ht_bin[2]=ht_bin[3]=-1;
          }
        }
        break;
      case 3:
        // Clear jump_out
        #ifdef __arm__
        if((expirep&2047)==0)
          __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
        #endif
        ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
        break;
    }
    expirep=(expirep+1)&65535;
  }
  return 0;
}