drc: fix unsaved register
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
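// Conventions inferred from how struct regstat is used throughout this file:
//  regmap[hr]  - MIPS register currently held in host register hr, -1 if free;
//                values of 64+r refer to the upper 32 bits of 64-bit register r
//  is32        - bit r set: register r holds a sign-extended 32-bit value
//  dirty       - bit hr set: host register hr holds a value not yet written back
//  u / uu      - "unneeded" bits for the lower/upper halves of each register
//  isconst/constmap - per-host-register constant propagation state
//  (the was* fields describe the state before the instruction, the others after)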
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
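  // 2-way hash cache in front of the jump_in lists: the bin index is
  // ((vaddr>>16)^vaddr)&0xFFFF and each 16-byte bin holds two pairs
  // {vaddr0, compiled addr0, vaddr1, compiled addr1}, most recent first
  // (see get_addr/get_addr_ht below).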
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
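// Everything above 31 is a pseudo-register internal to the recompiler; these
// are tracked by the allocator (regmap/is32/dirty) just like the MIPS GPRs.
// Note that TEMPREG and FTEMP intentionally share slot 38.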
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178 #define HLECALL 26 // PCSX fake opcodes for HLE
179
180   /* stubs */
181 #define CC_STUB 1
182 #define FP_STUB 2
183 #define LOADB_STUB 3
184 #define LOADH_STUB 4
185 #define LOADW_STUB 5
186 #define LOADD_STUB 6
187 #define LOADBU_STUB 7
188 #define LOADHU_STUB 8
189 #define STOREB_STUB 9
190 #define STOREH_STUB 10
191 #define STOREW_STUB 11
192 #define STORED_STUB 12
193 #define STORELR_STUB 13
194 #define INVCODE_STUB 14
195
196   /* branch codes */
197 #define TAKEN 1
198 #define NOTTAKEN 2
199 #define NULLDS 3
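// TAKEN/NOTTAKEN tag which direction of a conditional branch a piece of exit
// code was generated for; NULLDS marks the case where a branch-likely
// instruction nullifies its delay slot.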
200
201 // asm linkage
202 int new_recompile_block(int addr);
203 void *get_addr_ht(u_int vaddr);
204 void invalidate_block(u_int block);
205 void invalidate_addr(u_int addr);
206 void remove_hash(int vaddr);
207 void jump_vaddr();
208 void dyna_linker();
209 void dyna_linker_ds();
210 void verify_code();
211 void verify_code_vm();
212 void verify_code_ds();
213 void cc_interrupt();
214 void fp_exception();
215 void fp_exception_ds();
216 void jump_syscall();
217 void jump_syscall_hle();
218 void jump_eret();
219 void jump_hlecall();
220 void new_dyna_leave();
221
222 // TLB
223 void TLBWI_new();
224 void TLBWR_new();
225 void read_nomem_new();
226 void read_nomemb_new();
227 void read_nomemh_new();
228 void read_nomemd_new();
229 void write_nomem_new();
230 void write_nomemb_new();
231 void write_nomemh_new();
232 void write_nomemd_new();
233 void write_rdram_new();
234 void write_rdramb_new();
235 void write_rdramh_new();
236 void write_rdramd_new();
237 extern u_int memory_map[1048576];
238
239 // Needed by assembler
240 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
241 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
242 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
243 void load_all_regs(signed char i_regmap[]);
244 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
245 void load_regs_entry(int t);
246 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
247
248 int tracedebug=0;
249
250 //#define DEBUG_CYCLE_COUNT 1
251
252 void nullf() {}
253 //#define assem_debug printf
254 //#define inv_debug printf
255 #define assem_debug nullf
256 #define inv_debug nullf
257
258 static void tlb_hacks()
259 {
260 #ifndef DISABLE_TLB
261   // Goldeneye hack
262   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
263   {
264     u_int addr;
265     int n;
266     switch (ROM_HEADER->Country_code&0xFF) 
267     {
268       case 0x45: // U
269         addr=0x34b30;
270         break;                   
271       case 0x4A: // J 
272         addr=0x34b70;    
273         break;    
274       case 0x50: // E 
275         addr=0x329f0;
276         break;                        
277       default: 
278         // Unknown country code
279         addr=0;
280         break;
281     }
282     u_int rom_addr=(u_int)rom;
283     #ifdef ROM_COPY
284     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
285     // in the lower 4G of memory to use this hack.  Copy it if necessary.
286     if((void *)rom>(void *)0xffffffff) {
287       munmap(ROM_COPY, 67108864);
288       if(mmap(ROM_COPY, 12582912,
289               PROT_READ | PROT_WRITE,
290               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
291               -1, 0) <= 0) {printf("mmap() failed\n");}
292       memcpy(ROM_COPY,rom,12582912);
293       rom_addr=(u_int)ROM_COPY;
294     }
295     #endif
296     if(addr) {
297       for(n=0x7F000;n<0x80000;n++) {
298         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
299       }
300     }
301   }
302 #endif
303 }
304
305 static u_int get_page(u_int vaddr)
306 {
307   u_int page=(vaddr^0x80000000)>>12;
308 #ifndef DISABLE_TLB
309   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
310 #endif
311   if(page>2048) page=2048+(page&2047);
312   return page;
313 }
314
315 static u_int get_vpage(u_int vaddr)
316 {
317   u_int vpage=(vaddr^0x80000000)>>12;
318 #ifndef DISABLE_TLB
319   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
320 #endif
321   if(vpage>2048) vpage=2048+(vpage&2047);
322   return vpage;
323 }
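// Page numbering used by the jump_in/jump_out/jump_dirty lists: pages 0-2047
// correspond directly to 0x80000000-0x807fffff (e.g. get_page(0x80030000) == 0x30);
// everything else, including TLB-mapped addresses translated via tlb_LUT_r,
// is hashed into pages 2048-4095.  get_vpage() differs only for TLB-mapped
// addresses: jump_dirty is then indexed by a hash of the virtual address.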
324
325 // Get address from virtual address
326 // This is called from the recompiled JR/JALR instructions
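// Lookup order: first the clean list (jump_in) for this page; then the dirty
// list (jump_dirty), where a block is verified against memory (verify_dirty)
// and restored if unmodified; if nothing matches, the block is recompiled.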
327 void *get_addr(u_int vaddr)
328 {
329   u_int page=get_page(vaddr);
330   u_int vpage=get_vpage(vaddr);
331   struct ll_entry *head;
332   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
333   head=jump_in[page];
334   while(head!=NULL) {
335     if(head->vaddr==vaddr&&head->reg32==0) {
336   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
337       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
338       ht_bin[3]=ht_bin[1];
339       ht_bin[2]=ht_bin[0];
340       ht_bin[1]=(int)head->addr;
341       ht_bin[0]=vaddr;
342       return head->addr;
343     }
344     head=head->next;
345   }
346   head=jump_dirty[vpage];
347   while(head!=NULL) {
348     if(head->vaddr==vaddr&&head->reg32==0) {
349       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
350       // Don't restore blocks which are about to expire from the cache
351       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
352       if(verify_dirty(head->addr)) {
353         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
354         invalid_code[vaddr>>12]=0;
355         memory_map[vaddr>>12]|=0x40000000;
356         if(vpage<2048) {
357 #ifndef DISABLE_TLB
358           if(tlb_LUT_r[vaddr>>12]) {
359             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
360             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
361           }
362 #endif
363           restore_candidate[vpage>>3]|=1<<(vpage&7);
364         }
365         else restore_candidate[page>>3]|=1<<(page&7);
366         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
367         if(ht_bin[0]==vaddr) {
368           ht_bin[1]=(int)head->addr; // Replace existing entry
369         }
370         else
371         {
372           ht_bin[3]=ht_bin[1];
373           ht_bin[2]=ht_bin[0];
374           ht_bin[1]=(int)head->addr;
375           ht_bin[0]=vaddr;
376         }
377         return head->addr;
378       }
379     }
380     head=head->next;
381   }
382   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
383   int r=new_recompile_block(vaddr);
384   if(r==0) return get_addr(vaddr);
385   // Execute in unmapped page, generate pagefault exception
386   Status|=2;
387   Cause=(vaddr<<31)|0x8;
388   EPC=(vaddr&1)?vaddr-5:vaddr;
389   BadVAddr=(vaddr&~1);
390   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
391   EntryHi=BadVAddr&0xFFFFE000;
392   return get_addr_ht(0x80000000);
393 }
394 // Look up address in hash table first
395 void *get_addr_ht(u_int vaddr)
396 {
397   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
398   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
399   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
400   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
401   return get_addr(vaddr);
402 }
403
404 void *get_addr_32(u_int vaddr,u_int flags)
405 {
406 #ifdef FORCE32
407   return get_addr(vaddr);
408 #endif
409   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
410   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
411   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
412   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
413   u_int page=get_page(vaddr);
414   u_int vpage=get_vpage(vaddr);
415   struct ll_entry *head;
416   head=jump_in[page];
417   while(head!=NULL) {
418     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
419       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
420       if(head->reg32==0) {
421         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
422         if(ht_bin[0]==-1) {
423           ht_bin[1]=(int)head->addr;
424           ht_bin[0]=vaddr;
425         }else if(ht_bin[2]==-1) {
426           ht_bin[3]=(int)head->addr;
427           ht_bin[2]=vaddr;
428         }
429         //ht_bin[3]=ht_bin[1];
430         //ht_bin[2]=ht_bin[0];
431         //ht_bin[1]=(int)head->addr;
432         //ht_bin[0]=vaddr;
433       }
434       return head->addr;
435     }
436     head=head->next;
437   }
438   head=jump_dirty[vpage];
439   while(head!=NULL) {
440     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
441       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
442       // Don't restore blocks which are about to expire from the cache
443       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
444       if(verify_dirty(head->addr)) {
445         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
446         invalid_code[vaddr>>12]=0;
447         memory_map[vaddr>>12]|=0x40000000;
448         if(vpage<2048) {
449 #ifndef DISABLE_TLB
450           if(tlb_LUT_r[vaddr>>12]) {
451             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
452             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
453           }
454 #endif
455           restore_candidate[vpage>>3]|=1<<(vpage&7);
456         }
457         else restore_candidate[page>>3]|=1<<(page&7);
458         if(head->reg32==0) {
459           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
460           if(ht_bin[0]==-1) {
461             ht_bin[1]=(int)head->addr;
462             ht_bin[0]=vaddr;
463           }else if(ht_bin[2]==-1) {
464             ht_bin[3]=(int)head->addr;
465             ht_bin[2]=vaddr;
466           }
467           //ht_bin[3]=ht_bin[1];
468           //ht_bin[2]=ht_bin[0];
469           //ht_bin[1]=(int)head->addr;
470           //ht_bin[0]=vaddr;
471         }
472         return head->addr;
473       }
474     }
475     head=head->next;
476   }
477   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
478   int r=new_recompile_block(vaddr);
479   if(r==0) return get_addr(vaddr);
480   // Execute in unmapped page, generate pagefault exception
481   Status|=2;
482   Cause=(vaddr<<31)|0x8;
483   EPC=(vaddr&1)?vaddr-5:vaddr;
484   BadVAddr=(vaddr&~1);
485   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
486   EntryHi=BadVAddr&0xFFFFE000;
487   return get_addr_ht(0x80000000);
488 }
489
490 void clear_all_regs(signed char regmap[])
491 {
492   int hr;
493   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
494 }
495
496 signed char get_reg(signed char regmap[],int r)
497 {
498   int hr;
499   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
500   return -1;
501 }
502
503 // Find a register that is available for two consecutive cycles
504 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
505 {
506   int hr;
507   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
508   return -1;
509 }
510
511 int count_free_regs(signed char regmap[])
512 {
513   int count=0;
514   int hr;
515   for(hr=0;hr<HOST_REGS;hr++)
516   {
517     if(hr!=EXCLUDE_REG) {
518       if(regmap[hr]<0) count++;
519     }
520   }
521   return count;
522 }
523
524 void dirty_reg(struct regstat *cur,signed char reg)
525 {
526   int hr;
527   if(!reg) return;
528   for (hr=0;hr<HOST_REGS;hr++) {
529     if((cur->regmap[hr]&63)==reg) {
530       cur->dirty|=1<<hr;
531     }
532   }
533 }
534
535 // If we dirty the lower half of a 64-bit register which is now being
536 // sign-extended, we need to dump the upper half.
537 // Note: Do this only after completion of the instruction, because
538 // some instructions may need to read the full 64-bit value even if
539 // overwriting it (eg SLTI, DSRA32).
540 static void flush_dirty_uppers(struct regstat *cur)
541 {
542   int hr,reg;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->dirty>>hr)&1) {
545       reg=cur->regmap[hr];
546       if(reg>=64) 
547         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
548     }
549   }
550 }
551
552 void set_const(struct regstat *cur,signed char reg,uint64_t value)
553 {
554   int hr;
555   if(!reg) return;
556   for (hr=0;hr<HOST_REGS;hr++) {
557     if(cur->regmap[hr]==reg) {
558       cur->isconst|=1<<hr;
559       cur->constmap[hr]=value;
560     }
561     else if((cur->regmap[hr]^64)==reg) {
562       cur->isconst|=1<<hr;
563       cur->constmap[hr]=value>>32;
564     }
565   }
566 }
567
568 void clear_const(struct regstat *cur,signed char reg)
569 {
570   int hr;
571   if(!reg) return;
572   for (hr=0;hr<HOST_REGS;hr++) {
573     if((cur->regmap[hr]&63)==reg) {
574       cur->isconst&=~(1<<hr);
575     }
576   }
577 }
578
579 int is_const(struct regstat *cur,signed char reg)
580 {
581   int hr;
582   if(!reg) return 1;
583   for (hr=0;hr<HOST_REGS;hr++) {
584     if((cur->regmap[hr]&63)==reg) {
585       return (cur->isconst>>hr)&1;
586     }
587   }
588   return 0;
589 }
590 uint64_t get_const(struct regstat *cur,signed char reg)
591 {
592   int hr;
593   if(!reg) return 0;
594   for (hr=0;hr<HOST_REGS;hr++) {
595     if(cur->regmap[hr]==reg) {
596       return cur->constmap[hr];
597     }
598   }
599   printf("Unknown constant in r%d\n",reg);
600   exit(1);
601 }
602
603 // Least soon needed registers
604 // Look at the next ten instructions and see which registers
605 // will be used.  Try not to reallocate these.
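// On return, hsn[r] holds the distance (in instructions) to the next use or
// definition of register r within the scanned window; callers initialize
// hsn[] to a large value first (needed_again() uses memset(hsn,10,...)), so
// untouched entries mean "not needed soon".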
606 void lsn(u_char hsn[], int i, int *preferred_reg)
607 {
608   int j;
609   int b=-1;
610   for(j=0;j<9;j++)
611   {
612     if(i+j>=slen) {
613       j=slen-i-1;
614       break;
615     }
616     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
617     {
618       // Don't go past an unconditional jump
619       j++;
620       break;
621     }
622   }
623   for(;j>=0;j--)
624   {
625     if(rs1[i+j]) hsn[rs1[i+j]]=j;
626     if(rs2[i+j]) hsn[rs2[i+j]]=j;
627     if(rt1[i+j]) hsn[rt1[i+j]]=j;
628     if(rt2[i+j]) hsn[rt2[i+j]]=j;
629     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
630       // Stores can allocate zero
631       hsn[rs1[i+j]]=j;
632       hsn[rs2[i+j]]=j;
633     }
634     // On some architectures stores need invc_ptr
635     #if defined(HOST_IMM8)
636     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
637       hsn[INVCP]=j;
638     }
639     #endif
640     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
641     {
642       hsn[CCREG]=j;
643       b=j;
644     }
645   }
646   if(b>=0)
647   {
648     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
649     {
650       // Follow first branch
651       int t=(ba[i+b]-start)>>2;
652       j=7-b;if(t+j>=slen) j=slen-t-1;
653       for(;j>=0;j--)
654       {
655         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
656         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
657         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
658         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
659       }
660     }
661     // TODO: preferred register based on backward branch
662   }
663   // Delay slot should preferably not overwrite branch conditions or cycle count
664   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
665     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
666     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
667     hsn[CCREG]=1;
668     // ...or hash tables
669     hsn[RHASH]=1;
670     hsn[RHTBL]=1;
671   }
672   // Coprocessor load/store needs FTEMP, even if not declared
673   if(itype[i]==C1LS) {
674     hsn[FTEMP]=0;
675   }
676   // Load L/R also uses FTEMP as a temporary register
677   if(itype[i]==LOADLR) {
678     hsn[FTEMP]=0;
679   }
680   // Also 64-bit SDL/SDR
681   if(opcode[i]==0x2c||opcode[i]==0x2d) {
682     hsn[FTEMP]=0;
683   }
684   // Don't remove the TLB registers either
685   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
686     hsn[TLREG]=0;
687   }
688   // Don't remove the miniht registers
689   if(itype[i]==UJUMP||itype[i]==RJUMP)
690   {
691     hsn[RHASH]=0;
692     hsn[RHTBL]=0;
693   }
694 }
695
696 // We only want to allocate registers if we're going to use them again soon
697 int needed_again(int r, int i)
698 {
699   int j;
700   int b=-1;
701   int rn=10;
702   int hr;
703   u_char hsn[MAXREG+1];
704   int preferred_reg;
705   
706   memset(hsn,10,sizeof(hsn));
707   lsn(hsn,i,&preferred_reg);
708   
709   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
710   {
711     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
712       return 0; // Don't need any registers if exiting the block
713   }
714   for(j=0;j<9;j++)
715   {
716     if(i+j>=slen) {
717       j=slen-i-1;
718       break;
719     }
720     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
721     {
722       // Don't go past an unconditional jump
723       j++;
724       break;
725     }
726     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||((source[i+j]&0xfc00003f)==0x0d))
727     {
728       break;
729     }
730   }
731   for(;j>=1;j--)
732   {
733     if(rs1[i+j]==r) rn=j;
734     if(rs2[i+j]==r) rn=j;
735     if((unneeded_reg[i+j]>>r)&1) rn=10;
736     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
737     {
738       b=j;
739     }
740   }
741   /*
742   if(b>=0)
743   {
744     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
745     {
746       // Follow first branch
747       int o=rn;
748       int t=(ba[i+b]-start)>>2;
749       j=7-b;if(t+j>=slen) j=slen-t-1;
750       for(;j>=0;j--)
751       {
752         if(!((unneeded_reg[t+j]>>r)&1)) {
753           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
754           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
755         }
756         else rn=o;
757       }
758     }
759   }*/
760   for(hr=0;hr<HOST_REGS;hr++) {
761     if(hr!=EXCLUDE_REG) {
762       if(rn<hsn[hr]) return 1;
763     }
764   }
765   return 0;
766 }
767
768 // Try to match register allocations at the end of a loop with those
769 // at the beginning
770 int loop_reg(int i, int r, int hr)
771 {
772   int j,k;
773   for(j=0;j<9;j++)
774   {
775     if(i+j>=slen) {
776       j=slen-i-1;
777       break;
778     }
779     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
780     {
781       // Don't go past an unconditional jump
782       j++;
783       break;
784     }
785   }
786   k=0;
787   if(i>0){
788     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
789       k--;
790   }
791   for(;k<j;k++)
792   {
793     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
794     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
795     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
796     {
797       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
798       {
799         int t=(ba[i+k]-start)>>2;
800         int reg=get_reg(regs[t].regmap_entry,r);
801         if(reg>=0) return reg;
802         //reg=get_reg(regs[t+1].regmap_entry,r);
803         //if(reg>=0) return reg;
804       }
805     }
806   }
807   return hr;
808 }
809
810
811 // Allocate every register, preserving source/target regs
812 void alloc_all(struct regstat *cur,int i)
813 {
814   int hr;
815   
816   for(hr=0;hr<HOST_REGS;hr++) {
817     if(hr!=EXCLUDE_REG) {
818       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
819          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
820       {
821         cur->regmap[hr]=-1;
822         cur->dirty&=~(1<<hr);
823       }
824       // Don't need zeros
825       if((cur->regmap[hr]&63)==0)
826       {
827         cur->regmap[hr]=-1;
828         cur->dirty&=~(1<<hr);
829       }
830     }
831   }
832 }
833
834
835 void div64(int64_t dividend,int64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842 void divu64(uint64_t dividend,uint64_t divisor)
843 {
844   lo=dividend/divisor;
845   hi=dividend%divisor;
846   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
847   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
848 }
849
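// 64x64->128 bit multiply built from four 32x32->64 partial products:
//   (2^32*a1 + a0) * (2^32*b1 + b0) = 2^64*(a1*b1) + 2^32*(a1*b0 + a0*b1) + a0*b0
// with the carries propagated by hand below; mult64 additionally tracks the
// sign and negates the 128-bit result if exactly one operand was negative.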
850 void mult64(int64_t m1,int64_t m2)
851 {
852    unsigned long long int op1, op2, op3, op4;
853    unsigned long long int result1, result2, result3, result4;
854    unsigned long long int temp1, temp2, temp3, temp4;
855    int sign = 0;
856    
857    if (m1 < 0)
858      {
859     op2 = -m1;
860     sign = 1 - sign;
861      }
862    else op2 = m1;
863    if (m2 < 0)
864      {
865     op4 = -m2;
866     sign = 1 - sign;
867      }
868    else op4 = m2;
869    
870    op1 = op2 & 0xFFFFFFFF;
871    op2 = (op2 >> 32) & 0xFFFFFFFF;
872    op3 = op4 & 0xFFFFFFFF;
873    op4 = (op4 >> 32) & 0xFFFFFFFF;
874    
875    temp1 = op1 * op3;
876    temp2 = (temp1 >> 32) + op1 * op4;
877    temp3 = op2 * op3;
878    temp4 = (temp3 >> 32) + op2 * op4;
879    
880    result1 = temp1 & 0xFFFFFFFF;
881    result2 = temp2 + (temp3 & 0xFFFFFFFF);
882    result3 = (result2 >> 32) + temp4;
883    result4 = (result3 >> 32);
884    
885    lo = result1 | (result2 << 32);
886    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
887    if (sign)
888      {
889     hi = ~hi;
890     if (!lo) hi++;
891     else lo = ~lo + 1;
892      }
893 }
894
895 void multu64(uint64_t m1,uint64_t m2)
896 {
897    unsigned long long int op1, op2, op3, op4;
898    unsigned long long int result1, result2, result3, result4;
899    unsigned long long int temp1, temp2, temp3, temp4;
900    
901    op1 = m1 & 0xFFFFFFFF;
902    op2 = (m1 >> 32) & 0xFFFFFFFF;
903    op3 = m2 & 0xFFFFFFFF;
904    op4 = (m2 >> 32) & 0xFFFFFFFF;
905    
906    temp1 = op1 * op3;
907    temp2 = (temp1 >> 32) + op1 * op4;
908    temp3 = op2 * op3;
909    temp4 = (temp3 >> 32) + op2 * op4;
910    
911    result1 = temp1 & 0xFFFFFFFF;
912    result2 = temp2 + (temp3 & 0xFFFFFFFF);
913    result3 = (result2 >> 32) + temp4;
914    result4 = (result3 >> 32);
915    
916    lo = result1 | (result2 << 32);
917    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
918    
919   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
920   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
921 }
922
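// Merge helpers for unaligned 64-bit loads: ldl_merge() keeps the low `bits`
// bits of the original register and ORs in the newly loaded data shifted up,
// e.g. with bits==16 the result is (original & 0xFFFF) | (loaded << 16);
// ldr_merge() is the mirror image for the low-order end of the register.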
923 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
924 {
925   if(bits) {
926     original<<=64-bits;
927     original>>=64-bits;
928     loaded<<=bits;
929     original|=loaded;
930   }
931   else original=loaded;
932   return original;
933 }
934 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
935 {
936   if(bits^56) {
937     original>>=64-(bits^56);
938     original<<=64-(bits^56);
939     loaded>>=bits^56;
940     original|=loaded;
941   }
942   else original=loaded;
943   return original;
944 }
945
946 #ifdef __i386__
947 #include "assem_x86.c"
948 #endif
949 #ifdef __x86_64__
950 #include "assem_x64.c"
951 #endif
952 #ifdef __arm__
953 #include "assem_arm.c"
954 #endif
955
956 // Add virtual address mapping to linked list
957 void ll_add(struct ll_entry **head,int vaddr,void *addr)
958 {
959   struct ll_entry *new_entry;
960   new_entry=malloc(sizeof(struct ll_entry));
961   assert(new_entry!=NULL);
962   new_entry->vaddr=vaddr;
963   new_entry->reg32=0;
964   new_entry->addr=addr;
965   new_entry->next=*head;
966   *head=new_entry;
967 }
968
969 // Add virtual address mapping for 32-bit compiled block
970 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
971 {
972   ll_add(head,vaddr,addr);
973 #ifndef FORCE32
974   (*head)->reg32=reg32;
975 #endif
976 }
977
978 // Check if an address is already compiled
979 // but don't return addresses which are about to expire from the cache
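// The shift-by-(32-TARGET_SIZE_2) comparisons below (and in get_addr) measure,
// modulo the size of the translation cache, how far a compiled block lies ahead
// of the current output pointer `out`; blocks too close ahead of `out` are
// treated as about to be overwritten by the expiry logic and are not returned.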
980 void *check_addr(u_int vaddr)
981 {
982   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
983   if(ht_bin[0]==vaddr) {
984     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
985       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
986   }
987   if(ht_bin[2]==vaddr) {
988     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
989       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
990   }
991   u_int page=get_page(vaddr);
992   struct ll_entry *head;
993   head=jump_in[page];
994   while(head!=NULL) {
995     if(head->vaddr==vaddr&&head->reg32==0) {
996       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
997         // Update existing entry with current address
998         if(ht_bin[0]==vaddr) {
999           ht_bin[1]=(int)head->addr;
1000           return head->addr;
1001         }
1002         if(ht_bin[2]==vaddr) {
1003           ht_bin[3]=(int)head->addr;
1004           return head->addr;
1005         }
1006         // Insert into hash table with low priority.
1007         // Don't evict existing entries, as they are probably
1008         // addresses that are being accessed frequently.
1009         if(ht_bin[0]==-1) {
1010           ht_bin[1]=(int)head->addr;
1011           ht_bin[0]=vaddr;
1012         }else if(ht_bin[2]==-1) {
1013           ht_bin[3]=(int)head->addr;
1014           ht_bin[2]=vaddr;
1015         }
1016         return head->addr;
1017       }
1018     }
1019     head=head->next;
1020   }
1021   return 0;
1022 }
1023
1024 void remove_hash(int vaddr)
1025 {
1026   //printf("remove hash: %x\n",vaddr);
1027   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1028   if(ht_bin[2]==vaddr) {
1029     ht_bin[2]=ht_bin[3]=-1;
1030   }
1031   if(ht_bin[0]==vaddr) {
1032     ht_bin[0]=ht_bin[2];
1033     ht_bin[1]=ht_bin[3];
1034     ht_bin[2]=ht_bin[3]=-1;
1035   }
1036 }
1037
1038 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1039 {
1040   struct ll_entry *next;
1041   while(*head) {
1042     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1043        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1044     {
1045       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1046       remove_hash((*head)->vaddr);
1047       next=(*head)->next;
1048       free(*head);
1049       *head=next;
1050     }
1051     else
1052     {
1053       head=&((*head)->next);
1054     }
1055   }
1056 }
1057
1058 // Remove all entries from linked list
1059 void ll_clear(struct ll_entry **head)
1060 {
1061   struct ll_entry *cur;
1062   struct ll_entry *next;
1063   if(cur=*head) {
1064     *head=0;
1065     while(cur) {
1066       next=cur->next;
1067       free(cur);
1068       cur=next;
1069     }
1070   }
1071 }
1072
1073 // Walk the list, dereference each pointer and kill it if its target falls in the given range
1074 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1075 {
1076   while(head) {
1077     int ptr=get_pointer(head->addr);
1078     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1079     if(((ptr>>shift)==(addr>>shift)) ||
1080        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1081     {
1082       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1083       kill_pointer(head->addr);
1084     }
1085     head=head->next;
1086   }
1087 }
1088
1089 // This is called when we write to a compiled block (see do_invstub)
1090 int invalidate_page(u_int page)
1091 {
1092   int modified=0;
1093   struct ll_entry *head;
1094   struct ll_entry *next;
1095   head=jump_in[page];
1096   jump_in[page]=0;
1097   while(head!=NULL) {
1098     inv_debug("INVALIDATE: %x\n",head->vaddr);
1099     remove_hash(head->vaddr);
1100     next=head->next;
1101     free(head);
1102     head=next;
1103   }
1104   head=jump_out[page];
1105   jump_out[page]=0;
1106   while(head!=NULL) {
1107     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1108     kill_pointer(head->addr);
1109     modified=1;
1110     next=head->next;
1111     free(head);
1112     head=next;
1113   }
1114   return modified;
1115 }
1116 void invalidate_block(u_int block)
1117 {
1118   int modified;
1119   u_int page=get_page(block<<12);
1120   u_int vpage=get_vpage(block<<12);
1121   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1122   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1123   u_int first,last;
1124   first=last=page;
1125   struct ll_entry *head;
1126   head=jump_dirty[vpage];
1127   //printf("page=%d vpage=%d\n",page,vpage);
1128   while(head!=NULL) {
1129     u_int start,end;
1130     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1131       get_bounds((int)head->addr,&start,&end);
1132       //printf("start: %x end: %x\n",start,end);
1133       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1134         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1135           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1136           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1137         }
1138       }
1139 #ifndef DISABLE_TLB
1140       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1141         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1142           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1143           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1144         }
1145       }
1146 #endif
1147     }
1148     head=head->next;
1149   }
1150   //printf("first=%d last=%d\n",first,last);
1151   modified=invalidate_page(page);
1152   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1153   assert(last<page+5);
1154   // Invalidate the adjacent pages if a block crosses a 4K boundary
1155   while(first<page) {
1156     invalidate_page(first);
1157     first++;
1158   }
1159   for(first=page+1;first<last;first++) {
1160     invalidate_page(first);
1161   }
1162   
1163   // Don't trap writes
1164   invalid_code[block]=1;
1165 #ifndef DISABLE_TLB
1166   // If there is a valid TLB entry for this page, remove write protect
1167   if(tlb_LUT_w[block]) {
1168     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1169     // CHECK: Is this right?
1170     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1171     u_int real_block=tlb_LUT_w[block]>>12;
1172     invalid_code[real_block]=1;
1173     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1174   }
1175   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1176 #endif
1177   #ifdef __arm__
1178   if(modified)
1179     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1180   #endif
1181   #ifdef USE_MINI_HT
1182   memset(mini_ht,-1,sizeof(mini_ht));
1183   #endif
1184 }
1185 void invalidate_addr(u_int addr)
1186 {
1187   invalidate_block(addr>>12);
1188 }
1189 void invalidate_all_pages()
1190 {
1191   u_int page,n;
1192   for(page=0;page<4096;page++)
1193     invalidate_page(page);
1194   for(page=0;page<1048576;page++)
1195     if(!invalid_code[page]) {
1196       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1197       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1198     }
1199   #ifdef __arm__
1200   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1201   #endif
1202   #ifdef USE_MINI_HT
1203   memset(mini_ht,-1,sizeof(mini_ht));
1204   #endif
1205   #ifndef DISABLE_TLB
1206   // TLB
1207   for(page=0;page<0x100000;page++) {
1208     if(tlb_LUT_r[page]) {
1209       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1210       if(!tlb_LUT_w[page]||!invalid_code[page])
1211         memory_map[page]|=0x40000000; // Write protect
1212     }
1213     else memory_map[page]=-1;
1214     if(page==0x80000) page=0xC0000;
1215   }
1216   tlb_hacks();
1217   #endif
1218 }
1219
1220 // Add an entry to jump_out after making a link
1221 void add_link(u_int vaddr,void *src)
1222 {
1223   u_int page=get_page(vaddr);
1224   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1225   ll_add(jump_out+page,vaddr,src);
1226   //int ptr=get_pointer(src);
1227   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1228 }
1229
1230 // If a code block was found to be unmodified (bit was set in
1231 // restore_candidate) and it remains unmodified (bit is clear
1232 // in invalid_code) then move the entries for that 4K page from
1233 // the dirty list to the clean list.
1234 void clean_blocks(u_int page)
1235 {
1236   struct ll_entry *head;
1237   inv_debug("INV: clean_blocks page=%d\n",page);
1238   head=jump_dirty[page];
1239   while(head!=NULL) {
1240     if(!invalid_code[head->vaddr>>12]) {
1241       // Don't restore blocks which are about to expire from the cache
1242       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1243         u_int start,end;
1244         if(verify_dirty((int)head->addr)) {
1245           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1246           u_int i;
1247           u_int inv=0;
1248           get_bounds((int)head->addr,&start,&end);
1249           if(start-(u_int)rdram<0x800000) {
1250             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1251               inv|=invalid_code[i];
1252             }
1253           }
1254           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1255             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1256             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1257             if(addr<start||addr>=end) inv=1;
1258           }
1259           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1260             inv=1;
1261           }
1262           if(!inv) {
1263             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1264             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1265               u_int ppage=page;
1266 #ifndef DISABLE_TLB
1267               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1268 #endif
1269               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1270               //printf("page=%x, addr=%x\n",page,head->vaddr);
1271               //assert(head->vaddr>>12==(page|0x80000));
1272               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1273               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1274               if(!head->reg32) {
1275                 if(ht_bin[0]==head->vaddr) {
1276                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1277                 }
1278                 if(ht_bin[2]==head->vaddr) {
1279                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1280                 }
1281               }
1282             }
1283           }
1284         }
1285       }
1286     }
1287     head=head->next;
1288   }
1289 }
1290
1291
1292 void mov_alloc(struct regstat *current,int i)
1293 {
1294   // Note: Don't need to actually alloc the source registers
1295   if((~current->is32>>rs1[i])&1) {
1296     //alloc_reg64(current,i,rs1[i]);
1297     alloc_reg64(current,i,rt1[i]);
1298     current->is32&=~(1LL<<rt1[i]);
1299   } else {
1300     //alloc_reg(current,i,rs1[i]);
1301     alloc_reg(current,i,rt1[i]);
1302     current->is32|=(1LL<<rt1[i]);
1303   }
1304   clear_const(current,rs1[i]);
1305   clear_const(current,rt1[i]);
1306   dirty_reg(current,rt1[i]);
1307 }
1308
1309 void shiftimm_alloc(struct regstat *current,int i)
1310 {
1311   clear_const(current,rs1[i]);
1312   clear_const(current,rt1[i]);
1313   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1314   {
1315     if(rt1[i]) {
1316       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1317       else lt1[i]=rs1[i];
1318       alloc_reg(current,i,rt1[i]);
1319       current->is32|=1LL<<rt1[i];
1320       dirty_reg(current,rt1[i]);
1321     }
1322   }
1323   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1324   {
1325     if(rt1[i]) {
1326       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1327       alloc_reg64(current,i,rt1[i]);
1328       current->is32&=~(1LL<<rt1[i]);
1329       dirty_reg(current,rt1[i]);
1330     }
1331   }
1332   if(opcode2[i]==0x3c) // DSLL32
1333   {
1334     if(rt1[i]) {
1335       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1336       alloc_reg64(current,i,rt1[i]);
1337       current->is32&=~(1LL<<rt1[i]);
1338       dirty_reg(current,rt1[i]);
1339     }
1340   }
1341   if(opcode2[i]==0x3e) // DSRL32
1342   {
1343     if(rt1[i]) {
1344       alloc_reg64(current,i,rs1[i]);
1345       if(imm[i]==32) {
1346         alloc_reg64(current,i,rt1[i]);
1347         current->is32&=~(1LL<<rt1[i]);
1348       } else {
1349         alloc_reg(current,i,rt1[i]);
1350         current->is32|=1LL<<rt1[i];
1351       }
1352       dirty_reg(current,rt1[i]);
1353     }
1354   }
1355   if(opcode2[i]==0x3f) // DSRA32
1356   {
1357     if(rt1[i]) {
1358       alloc_reg64(current,i,rs1[i]);
1359       alloc_reg(current,i,rt1[i]);
1360       current->is32|=1LL<<rt1[i];
1361       dirty_reg(current,rt1[i]);
1362     }
1363   }
1364 }
1365
1366 void shift_alloc(struct regstat *current,int i)
1367 {
1368   if(rt1[i]) {
1369     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1370     {
1371       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1372       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1373       alloc_reg(current,i,rt1[i]);
1374       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1375       current->is32|=1LL<<rt1[i];
1376     } else { // DSLLV/DSRLV/DSRAV
1377       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1378       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1379       alloc_reg64(current,i,rt1[i]);
1380       current->is32&=~(1LL<<rt1[i]);
1381       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1382         alloc_reg_temp(current,i,-1);
1383     }
1384     clear_const(current,rs1[i]);
1385     clear_const(current,rs2[i]);
1386     clear_const(current,rt1[i]);
1387     dirty_reg(current,rt1[i]);
1388   }
1389 }
1390
1391 void alu_alloc(struct regstat *current,int i)
1392 {
1393   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1394     if(rt1[i]) {
1395       if(rs1[i]&&rs2[i]) {
1396         alloc_reg(current,i,rs1[i]);
1397         alloc_reg(current,i,rs2[i]);
1398       }
1399       else {
1400         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1401         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1402       }
1403       alloc_reg(current,i,rt1[i]);
1404     }
1405     current->is32|=1LL<<rt1[i];
1406   }
1407   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1408     if(rt1[i]) {
1409       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1410       {
1411         alloc_reg64(current,i,rs1[i]);
1412         alloc_reg64(current,i,rs2[i]);
1413         alloc_reg(current,i,rt1[i]);
1414       } else {
1415         alloc_reg(current,i,rs1[i]);
1416         alloc_reg(current,i,rs2[i]);
1417         alloc_reg(current,i,rt1[i]);
1418       }
1419     }
1420     current->is32|=1LL<<rt1[i];
1421   }
1422   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1423     if(rt1[i]) {
1424       if(rs1[i]&&rs2[i]) {
1425         alloc_reg(current,i,rs1[i]);
1426         alloc_reg(current,i,rs2[i]);
1427       }
1428       else
1429       {
1430         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1431         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1432       }
1433       alloc_reg(current,i,rt1[i]);
1434       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1435       {
1436         if(!((current->uu>>rt1[i])&1)) {
1437           alloc_reg64(current,i,rt1[i]);
1438         }
1439         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1440           if(rs1[i]&&rs2[i]) {
1441             alloc_reg64(current,i,rs1[i]);
1442             alloc_reg64(current,i,rs2[i]);
1443           }
1444           else
1445           {
1446             // Is it really worth it to keep 64-bit values in registers?
1447             #ifdef NATIVE_64BIT
1448             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1449             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1450             #endif
1451           }
1452         }
1453         current->is32&=~(1LL<<rt1[i]);
1454       } else {
1455         current->is32|=1LL<<rt1[i];
1456       }
1457     }
1458   }
1459   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1460     if(rt1[i]) {
1461       if(rs1[i]&&rs2[i]) {
1462         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1463           alloc_reg64(current,i,rs1[i]);
1464           alloc_reg64(current,i,rs2[i]);
1465           alloc_reg64(current,i,rt1[i]);
1466         } else {
1467           alloc_reg(current,i,rs1[i]);
1468           alloc_reg(current,i,rs2[i]);
1469           alloc_reg(current,i,rt1[i]);
1470         }
1471       }
1472       else {
1473         alloc_reg(current,i,rt1[i]);
1474         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1475           // DADD used as move, or zeroing
1476           // If we have a 64-bit source, then make the target 64 bits too
1477           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1478             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1479             alloc_reg64(current,i,rt1[i]);
1480           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1481             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1482             alloc_reg64(current,i,rt1[i]);
1483           }
1484           if(opcode2[i]>=0x2e&&rs2[i]) {
1485             // DSUB used as negation - 64-bit result
1486             // If we have a 32-bit register, extend it to 64 bits
1487             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1488             alloc_reg64(current,i,rt1[i]);
1489           }
1490         }
1491       }
1492       if(rs1[i]&&rs2[i]) {
1493         current->is32&=~(1LL<<rt1[i]);
1494       } else if(rs1[i]) {
1495         current->is32&=~(1LL<<rt1[i]);
1496         if((current->is32>>rs1[i])&1)
1497           current->is32|=1LL<<rt1[i];
1498       } else if(rs2[i]) {
1499         current->is32&=~(1LL<<rt1[i]);
1500         if((current->is32>>rs2[i])&1)
1501           current->is32|=1LL<<rt1[i];
1502       } else {
1503         current->is32|=1LL<<rt1[i];
1504       }
1505     }
1506   }
1507   clear_const(current,rs1[i]);
1508   clear_const(current,rs2[i]);
1509   clear_const(current,rt1[i]);
1510   dirty_reg(current,rt1[i]);
1511 }
1512
1513 void imm16_alloc(struct regstat *current,int i)
1514 {
1515   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1516   else lt1[i]=rs1[i];
1517   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1518   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1519     current->is32&=~(1LL<<rt1[i]);
1520     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1521       // TODO: Could preserve the 32-bit flag if the immediate is zero
1522       alloc_reg64(current,i,rt1[i]);
1523       alloc_reg64(current,i,rs1[i]);
1524     }
1525     clear_const(current,rs1[i]);
1526     clear_const(current,rt1[i]);
1527   }
1528   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1529     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1530     current->is32|=1LL<<rt1[i];
1531     clear_const(current,rs1[i]);
1532     clear_const(current,rt1[i]);
1533   }
1534   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1535     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1536       if(rs1[i]!=rt1[i]) {
1537         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1538         alloc_reg64(current,i,rt1[i]);
1539         current->is32&=~(1LL<<rt1[i]);
1540       }
1541     }
1542     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1543     if(is_const(current,rs1[i])) {
1544       int v=get_const(current,rs1[i]);
1545       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1546       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1547       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1548     }
1549     else clear_const(current,rt1[i]);
1550   }
1551   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1552     if(is_const(current,rs1[i])) {
1553       int v=get_const(current,rs1[i]);
1554       set_const(current,rt1[i],v+imm[i]);
1555     }
1556     else clear_const(current,rt1[i]);
1557     current->is32|=1LL<<rt1[i];
1558   }
1559   else {
1560     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1561     current->is32|=1LL<<rt1[i];
1562   }
1563   dirty_reg(current,rt1[i]);
1564 }
1565
1566 void load_alloc(struct regstat *current,int i)
1567 {
1568   clear_const(current,rt1[i]);
1569   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1570   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1571   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1572   if(rt1[i]) {
1573     alloc_reg(current,i,rt1[i]);
1574     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1575     {
1576       current->is32&=~(1LL<<rt1[i]);
1577       alloc_reg64(current,i,rt1[i]);
1578     }
1579     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1580     {
1581       current->is32&=~(1LL<<rt1[i]);
1582       alloc_reg64(current,i,rt1[i]);
1583       alloc_all(current,i);
1584       alloc_reg64(current,i,FTEMP);
1585     }
1586     else current->is32|=1LL<<rt1[i];
1587     dirty_reg(current,rt1[i]);
1588     // If using TLB, need a register for pointer to the mapping table
1589     if(using_tlb) alloc_reg(current,i,TLREG);
1590     // LWL/LWR need a temporary register for the old value
1591     if(opcode[i]==0x22||opcode[i]==0x26)
1592     {
1593       alloc_reg(current,i,FTEMP);
1594       alloc_reg_temp(current,i,-1);
1595     }
1596   }
1597   else
1598   {
1599     // Load to r0 (dummy load)
1600     // but we still need a register to calculate the address
1601     alloc_reg_temp(current,i,-1);
1602   }
1603 }
1604
1605 void store_alloc(struct regstat *current,int i)
1606 {
1607   clear_const(current,rs2[i]);
1608   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1609   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1610   alloc_reg(current,i,rs2[i]);
1611   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1612     alloc_reg64(current,i,rs2[i]);
1613     if(rs2[i]) alloc_reg(current,i,FTEMP);
1614   }
1615   // If using TLB, need a register for pointer to the mapping table
1616   if(using_tlb) alloc_reg(current,i,TLREG);
1617   #if defined(HOST_IMM8)
1618   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1619   else alloc_reg(current,i,INVCP);
1620   #endif
1621   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1622     alloc_reg(current,i,FTEMP);
1623   }
1624   // We need a temporary register for address generation
1625   alloc_reg_temp(current,i,-1);
1626 }
1627
1628 void c1ls_alloc(struct regstat *current,int i)
1629 {
1630   //clear_const(current,rs1[i]); // FIXME
1631   clear_const(current,rt1[i]);
1632   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1633   alloc_reg(current,i,CSREG); // Status
1634   alloc_reg(current,i,FTEMP);
1635   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1636     alloc_reg64(current,i,FTEMP);
1637   }
1638   // If using TLB, need a register for pointer to the mapping table
1639   if(using_tlb) alloc_reg(current,i,TLREG);
1640   #if defined(HOST_IMM8)
1641   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1642   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1643     alloc_reg(current,i,INVCP);
1644   #endif
1645   // We need a temporary register for address generation
1646   alloc_reg_temp(current,i,-1);
1647 }
1648
1649 #ifndef multdiv_alloc
1650 void multdiv_alloc(struct regstat *current,int i)
1651 {
1652   //  case 0x18: MULT
1653   //  case 0x19: MULTU
1654   //  case 0x1A: DIV
1655   //  case 0x1B: DIVU
1656   //  case 0x1C: DMULT
1657   //  case 0x1D: DMULTU
1658   //  case 0x1E: DDIV
1659   //  case 0x1F: DDIVU
1660   clear_const(current,rs1[i]);
1661   clear_const(current,rs2[i]);
1662   if(rs1[i]&&rs2[i])
1663   {
1664     if((opcode2[i]&4)==0) // 32-bit
1665     {
1666       current->u&=~(1LL<<HIREG);
1667       current->u&=~(1LL<<LOREG);
1668       alloc_reg(current,i,HIREG);
1669       alloc_reg(current,i,LOREG);
1670       alloc_reg(current,i,rs1[i]);
1671       alloc_reg(current,i,rs2[i]);
1672       current->is32|=1LL<<HIREG;
1673       current->is32|=1LL<<LOREG;
1674       dirty_reg(current,HIREG);
1675       dirty_reg(current,LOREG);
1676     }
1677     else // 64-bit
1678     {
1679       current->u&=~(1LL<<HIREG);
1680       current->u&=~(1LL<<LOREG);
1681       current->uu&=~(1LL<<HIREG);
1682       current->uu&=~(1LL<<LOREG);
1683       alloc_reg64(current,i,HIREG);
1684       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1685       alloc_reg64(current,i,rs1[i]);
1686       alloc_reg64(current,i,rs2[i]);
1687       alloc_all(current,i);
1688       current->is32&=~(1LL<<HIREG);
1689       current->is32&=~(1LL<<LOREG);
1690       dirty_reg(current,HIREG);
1691       dirty_reg(current,LOREG);
1692     }
1693   }
1694   else
1695   {
1696     // Multiply by zero is zero.
1697     // MIPS does not have a divide by zero exception.
1698     // The result is undefined, so we return zero.
1699     alloc_reg(current,i,HIREG);
1700     alloc_reg(current,i,LOREG);
1701     current->is32|=1LL<<HIREG;
1702     current->is32|=1LL<<LOREG;
1703     dirty_reg(current,HIREG);
1704     dirty_reg(current,LOREG);
1705   }
1706 }
1707 #endif
1708
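// Register allocation for COP0: MFC0 writes rt1, MTC0 reads rs1, and
// the TLB/ERET group (opcode2==0x10) takes neither.  All three paths
// call alloc_all(), presumably so the C helpers see a fully
// written-back register state.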
1709 void cop0_alloc(struct regstat *current,int i)
1710 {
1711   if(opcode2[i]==0) // MFC0
1712   {
1713     if(rt1[i]) {
1714       clear_const(current,rt1[i]);
1715       alloc_all(current,i);
1716       alloc_reg(current,i,rt1[i]);
1717       current->is32|=1LL<<rt1[i];
1718       dirty_reg(current,rt1[i]);
1719     }
1720   }
1721   else if(opcode2[i]==4) // MTC0
1722   {
1723     if(rs1[i]){
1724       clear_const(current,rs1[i]);
1725       alloc_reg(current,i,rs1[i]);
1726       alloc_all(current,i);
1727     }
1728     else {
1729       alloc_all(current,i); // FIXME: Keep r0
1730       current->u&=~1LL;
1731       alloc_reg(current,i,0);
1732     }
1733   }
1734   else
1735   {
1736     // TLBR/TLBWI/TLBWR/TLBP/ERET
1737     assert(opcode2[i]==0x10);
1738     alloc_all(current,i);
1739   }
1740 }
1741
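// Register allocation for COP1 register moves (MFC1/DMFC1/CFC1 and
// MTC1/DMTC1/CTC1): CSREG is loaded for the usability check, the GPR
// side gets a 32- or 64-bit allocation depending on the opcode, and a
// temporary register is reserved.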
1742 void cop1_alloc(struct regstat *current,int i)
1743 {
1744   alloc_reg(current,i,CSREG); // Load status
1745   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1746   {
1747     assert(rt1[i]);
1748     clear_const(current,rt1[i]);
1749     if(opcode2[i]==1) {
1750       alloc_reg64(current,i,rt1[i]); // DMFC1
1751       current->is32&=~(1LL<<rt1[i]);
1752     }else{
1753       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1754       current->is32|=1LL<<rt1[i];
1755     }
1756     dirty_reg(current,rt1[i]);
1757     alloc_reg_temp(current,i,-1);
1758   }
1759   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1760   {
1761     if(rs1[i]){
1762       clear_const(current,rs1[i]);
1763       if(opcode2[i]==5)
1764         alloc_reg64(current,i,rs1[i]); // DMTC1
1765       else
1766         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1767       alloc_reg_temp(current,i,-1);
1768     }
1769     else {
1770       current->u&=~1LL;
1771       alloc_reg(current,i,0);
1772       alloc_reg_temp(current,i,-1);
1773     }
1774   }
1775 }
1776 void fconv_alloc(struct regstat *current,int i)
1777 {
1778   alloc_reg(current,i,CSREG); // Load status
1779   alloc_reg_temp(current,i,-1);
1780 }
1781 void float_alloc(struct regstat *current,int i)
1782 {
1783   alloc_reg(current,i,CSREG); // Load status
1784   alloc_reg_temp(current,i,-1);
1785 }
1786 void fcomp_alloc(struct regstat *current,int i)
1787 {
1788   alloc_reg(current,i,CSREG); // Load status
1789   alloc_reg(current,i,FSREG); // Load flags
1790   dirty_reg(current,FSREG); // Flag will be modified
1791   alloc_reg_temp(current,i,-1);
1792 }
1793
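// SYSCALL takes the exception path out of the recompiled code, so the
// cycle count (CCREG) must be current and alloc_all() claims every
// host register so that nothing stays cached across the call.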
1794 void syscall_alloc(struct regstat *current,int i)
1795 {
1796   alloc_cc(current,i);
1797   dirty_reg(current,CCREG);
1798   alloc_all(current,i);
1799   current->isconst=0;
1800 }
1801
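// Dispatch register allocation for the instruction in a branch delay
// slot.  A branch or syscall in the delay slot is not supported;
// hitting one turns off speculative precompilation (stop_after_jal).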
1802 void delayslot_alloc(struct regstat *current,int i)
1803 {
1804   switch(itype[i]) {
1805     case UJUMP:
1806     case CJUMP:
1807     case SJUMP:
1808     case RJUMP:
1809     case FJUMP:
1810     case SYSCALL:
1811     case HLECALL:
1812     case SPAN:
1813       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1814       printf("Disabled speculative precompilation\n");
1815       stop_after_jal=1;
1816       break;
1817     case IMM16:
1818       imm16_alloc(current,i);
1819       break;
1820     case LOAD:
1821     case LOADLR:
1822       load_alloc(current,i);
1823       break;
1824     case STORE:
1825     case STORELR:
1826       store_alloc(current,i);
1827       break;
1828     case ALU:
1829       alu_alloc(current,i);
1830       break;
1831     case SHIFT:
1832       shift_alloc(current,i);
1833       break;
1834     case MULTDIV:
1835       multdiv_alloc(current,i);
1836       break;
1837     case SHIFTIMM:
1838       shiftimm_alloc(current,i);
1839       break;
1840     case MOV:
1841       mov_alloc(current,i);
1842       break;
1843     case COP0:
1844       cop0_alloc(current,i);
1845       break;
1846     case COP1:
1847       cop1_alloc(current,i);
1848       break;
1849     case C1LS:
1850       c1ls_alloc(current,i);
1851       break;
1852     case FCONV:
1853       fconv_alloc(current,i);
1854       break;
1855     case FLOAT:
1856       float_alloc(current,i);
1857       break;
1858     case FCOMP:
1859       fcomp_alloc(current,i);
1860       break;
1861   }
1862 }
1863
1864 // Special case where a branch and delay slot span two pages in virtual memory
1865 static void pagespan_alloc(struct regstat *current,int i)
1866 {
1867   current->isconst=0;
1868   current->wasconst=0;
1869   regs[i].wasconst=0;
1870   alloc_all(current,i);
1871   alloc_cc(current,i);
1872   dirty_reg(current,CCREG);
1873   if(opcode[i]==3) // JAL
1874   {
1875     alloc_reg(current,i,31);
1876     dirty_reg(current,31);
1877   }
1878   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1879   {
1880     alloc_reg(current,i,rs1[i]);
1881     if (rt1[i]==31) {
1882       alloc_reg(current,i,31);
1883       dirty_reg(current,31);
1884     }
1885   }
1886   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1887   {
1888     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1889     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1890     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1891     {
1892       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1893       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1894     }
1895   }
1896   else
1897   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1898   {
1899     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1900     if(!((current->is32>>rs1[i])&1))
1901     {
1902       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1903     }
1904   }
1905   else
1906   if(opcode[i]==0x11) // BC1
1907   {
1908     alloc_reg(current,i,FSREG);
1909     alloc_reg(current,i,CSREG);
1910   }
1911   //else ...
1912 }
1913
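// Record an out-of-line stub (slow path) to be emitted after the main
// block: 'addr' is the branch to patch, 'retaddr' is where the stub
// returns to, and a..e carry type-specific arguments, e.g.
//   add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);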
1914 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1915 {
1916   stubs[stubcount][0]=type;
1917   stubs[stubcount][1]=addr;
1918   stubs[stubcount][2]=retaddr;
1919   stubs[stubcount][3]=a;
1920   stubs[stubcount][4]=b;
1921   stubs[stubcount][5]=c;
1922   stubs[stubcount][6]=d;
1923   stubs[stubcount][7]=e;
1924   stubcount++;
1925 }
1926
1927 // Write out a single register
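// (spills the host register holding MIPS register r, if dirty, back to
//  the register file; a regmap entry of r|64 holds the upper 32 bits)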
1928 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1929 {
1930   int hr;
1931   for(hr=0;hr<HOST_REGS;hr++) {
1932     if(hr!=EXCLUDE_REG) {
1933       if((regmap[hr]&63)==r) {
1934         if((dirty>>hr)&1) {
1935           if(regmap[hr]<64) {
1936             emit_storereg(r,hr);
1937 #ifndef FORCE32
1938             if((is32>>regmap[hr])&1) {
1939               emit_sarimm(hr,31,hr);
1940               emit_storereg(r|64,hr);
1941             }
1942 #endif
1943           }else{
1944             emit_storereg(r|64,hr);
1945           }
1946         }
1947       }
1948     }
1949   }
1950 }
1951
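// Debugging helpers: mchecksum()/rchecksum() hash RDRAM and the GPR
// file, rlist() dumps the registers, and memdebug() can be called from
// generated code (see the commented-out blocks in the load/store
// assemblers below).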
1952 int mchecksum()
1953 {
1954   //if(!tracedebug) return 0;
1955   int i;
1956   int sum=0;
1957   for(i=0;i<2097152;i++) {
1958     unsigned int temp=sum;
1959     sum<<=1;
1960     sum|=(~temp)>>31;
1961     sum^=((u_int *)rdram)[i];
1962   }
1963   return sum;
1964 }
1965 int rchecksum()
1966 {
1967   int i;
1968   int sum=0;
1969   for(i=0;i<64;i++)
1970     sum^=((u_int *)reg)[i];
1971   return sum;
1972 }
1973 void rlist()
1974 {
1975   int i;
1976   printf("TRACE: ");
1977   for(i=0;i<32;i++)
1978     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1979   printf("\n");
1980 #ifndef DISABLE_COP1
1981   printf("TRACE: ");
1982   for(i=0;i<32;i++)
1983     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1984   printf("\n");
1985 #endif
1986 }
1987
1988 void enabletrace()
1989 {
1990   tracedebug=1;
1991 }
1992
1993 void memdebug(int i)
1994 {
1995   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1996   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1997   //rlist();
1998   //if(tracedebug) {
1999   //if(Count>=-2084597794) {
2000   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2001   //if(0) {
2002     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2003     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2004     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2005     rlist();
2006     #ifdef __i386__
2007     printf("TRACE: %x\n",(&i)[-1]);
2008     #endif
2009     #ifdef __arm__
2010     int j;
2011     printf("TRACE: %x \n",(&j)[10]);
2012     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2013     #endif
2014     //fflush(stdout);
2015   }
2016   //printf("TRACE: %x\n",(&i)[-1]);
2017 }
2018
2019 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2020 {
2021   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2022 }
2023
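// Emit host code for register-register ALU ops (ADD/ADDU/SUB/SUBU,
// their 64-bit DADD* forms, SLT/SLTU, AND/OR/XOR/NOR).  Host registers
// come from i_regs->regmap; r0 operands are special-cased (zero, move
// or negate) instead of being loaded.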
2024 void alu_assemble(int i,struct regstat *i_regs)
2025 {
2026   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2027     if(rt1[i]) {
2028       signed char s1,s2,t;
2029       t=get_reg(i_regs->regmap,rt1[i]);
2030       if(t>=0) {
2031         s1=get_reg(i_regs->regmap,rs1[i]);
2032         s2=get_reg(i_regs->regmap,rs2[i]);
2033         if(rs1[i]&&rs2[i]) {
2034           assert(s1>=0);
2035           assert(s2>=0);
2036           if(opcode2[i]&2) emit_sub(s1,s2,t);
2037           else emit_add(s1,s2,t);
2038         }
2039         else if(rs1[i]) {
2040           if(s1>=0) emit_mov(s1,t);
2041           else emit_loadreg(rs1[i],t);
2042         }
2043         else if(rs2[i]) {
2044           if(s2>=0) {
2045             if(opcode2[i]&2) emit_neg(s2,t);
2046             else emit_mov(s2,t);
2047           }
2048           else {
2049             emit_loadreg(rs2[i],t);
2050             if(opcode2[i]&2) emit_neg(t,t);
2051           }
2052         }
2053         else emit_zeroreg(t);
2054       }
2055     }
2056   }
2057   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2058     if(rt1[i]) {
2059       signed char s1l,s2l,s1h,s2h,tl,th;
2060       tl=get_reg(i_regs->regmap,rt1[i]);
2061       th=get_reg(i_regs->regmap,rt1[i]|64);
2062       if(tl>=0) {
2063         s1l=get_reg(i_regs->regmap,rs1[i]);
2064         s2l=get_reg(i_regs->regmap,rs2[i]);
2065         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2067         if(rs1[i]&&rs2[i]) {
2068           assert(s1l>=0);
2069           assert(s2l>=0);
2070           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2071           else emit_adds(s1l,s2l,tl);
2072           if(th>=0) {
2073             #ifdef INVERTED_CARRY
2074             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2075             #else
2076             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2077             #endif
2078             else emit_add(s1h,s2h,th);
2079           }
2080         }
2081         else if(rs1[i]) {
2082           if(s1l>=0) emit_mov(s1l,tl);
2083           else emit_loadreg(rs1[i],tl);
2084           if(th>=0) {
2085             if(s1h>=0) emit_mov(s1h,th);
2086             else emit_loadreg(rs1[i]|64,th);
2087           }
2088         }
2089         else if(rs2[i]) {
2090           if(s2l>=0) {
2091             if(opcode2[i]&2) emit_negs(s2l,tl);
2092             else emit_mov(s2l,tl);
2093           }
2094           else {
2095             emit_loadreg(rs2[i],tl);
2096             if(opcode2[i]&2) emit_negs(tl,tl);
2097           }
2098           if(th>=0) {
2099             #ifdef INVERTED_CARRY
2100             if(s2h>=0) emit_mov(s2h,th);
2101             else emit_loadreg(rs2[i]|64,th);
2102             if(opcode2[i]&2) {
2103               emit_adcimm(-1,th); // x86 has inverted carry flag
2104               emit_not(th,th);
2105             }
2106             #else
2107             if(opcode2[i]&2) {
2108               if(s2h>=0) emit_rscimm(s2h,0,th);
2109               else {
2110                 emit_loadreg(rs2[i]|64,th);
2111                 emit_rscimm(th,0,th);
2112               }
2113             }else{
2114               if(s2h>=0) emit_mov(s2h,th);
2115               else emit_loadreg(rs2[i]|64,th);
2116             }
2117             #endif
2118           }
2119         }
2120         else {
2121           emit_zeroreg(tl);
2122           if(th>=0) emit_zeroreg(th);
2123         }
2124       }
2125     }
2126   }
2127   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2128     if(rt1[i]) {
2129       signed char s1l,s1h,s2l,s2h,t;
2130       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2131       {
2132         t=get_reg(i_regs->regmap,rt1[i]);
2133         //assert(t>=0);
2134         if(t>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs2[i]==0) // rx<r0
2140           {
2141             assert(s1h>=0);
2142             if(opcode2[i]==0x2a) // SLT
2143               emit_shrimm(s1h,31,t);
2144             else // SLTU (unsigned cannot be less than zero)
2145               emit_zeroreg(t);
2146           }
2147           else if(rs1[i]==0) // r0<rx
2148           {
2149             assert(s2h>=0);
2150             if(opcode2[i]==0x2a) // SLT
2151               emit_set_gz64_32(s2h,s2l,t);
2152             else // SLTU (set if not zero)
2153               emit_set_nz64_32(s2h,s2l,t);
2154           }
2155           else {
2156             assert(s1l>=0);assert(s1h>=0);
2157             assert(s2l>=0);assert(s2h>=0);
2158             if(opcode2[i]==0x2a) // SLT
2159               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2160             else // SLTU
2161               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2162           }
2163         }
2164       } else {
2165         t=get_reg(i_regs->regmap,rt1[i]);
2166         //assert(t>=0);
2167         if(t>=0) {
2168           s1l=get_reg(i_regs->regmap,rs1[i]);
2169           s2l=get_reg(i_regs->regmap,rs2[i]);
2170           if(rs2[i]==0) // rx<r0
2171           {
2172             assert(s1l>=0);
2173             if(opcode2[i]==0x2a) // SLT
2174               emit_shrimm(s1l,31,t);
2175             else // SLTU (unsigned cannot be less than zero)
2176               emit_zeroreg(t);
2177           }
2178           else if(rs1[i]==0) // r0<rx
2179           {
2180             assert(s2l>=0);
2181             if(opcode2[i]==0x2a) // SLT
2182               emit_set_gz32(s2l,t);
2183             else // SLTU (set if not zero)
2184               emit_set_nz32(s2l,t);
2185           }
2186           else{
2187             assert(s1l>=0);assert(s2l>=0);
2188             if(opcode2[i]==0x2a) // SLT
2189               emit_set_if_less32(s1l,s2l,t);
2190             else // SLTU
2191               emit_set_if_carry32(s1l,s2l,t);
2192           }
2193         }
2194       }
2195     }
2196   }
2197   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2198     if(rt1[i]) {
2199       signed char s1l,s1h,s2l,s2h,th,tl;
2200       tl=get_reg(i_regs->regmap,rt1[i]);
2201       th=get_reg(i_regs->regmap,rt1[i]|64);
2202       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2203       {
2204         assert(tl>=0);
2205         if(tl>=0) {
2206           s1l=get_reg(i_regs->regmap,rs1[i]);
2207           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2208           s2l=get_reg(i_regs->regmap,rs2[i]);
2209           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2210           if(rs1[i]&&rs2[i]) {
2211             assert(s1l>=0);assert(s1h>=0);
2212             assert(s2l>=0);assert(s2h>=0);
2213             if(opcode2[i]==0x24) { // AND
2214               emit_and(s1l,s2l,tl);
2215               emit_and(s1h,s2h,th);
2216             } else
2217             if(opcode2[i]==0x25) { // OR
2218               emit_or(s1l,s2l,tl);
2219               emit_or(s1h,s2h,th);
2220             } else
2221             if(opcode2[i]==0x26) { // XOR
2222               emit_xor(s1l,s2l,tl);
2223               emit_xor(s1h,s2h,th);
2224             } else
2225             if(opcode2[i]==0x27) { // NOR
2226               emit_or(s1l,s2l,tl);
2227               emit_or(s1h,s2h,th);
2228               emit_not(tl,tl);
2229               emit_not(th,th);
2230             }
2231           }
2232           else
2233           {
2234             if(opcode2[i]==0x24) { // AND
2235               emit_zeroreg(tl);
2236               emit_zeroreg(th);
2237             } else
2238             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2239               if(rs1[i]){
2240                 if(s1l>=0) emit_mov(s1l,tl);
2241                 else emit_loadreg(rs1[i],tl);
2242                 if(s1h>=0) emit_mov(s1h,th);
2243                 else emit_loadreg(rs1[i]|64,th);
2244               }
2245               else
2246               if(rs2[i]){
2247                 if(s2l>=0) emit_mov(s2l,tl);
2248                 else emit_loadreg(rs2[i],tl);
2249                 if(s2h>=0) emit_mov(s2h,th);
2250                 else emit_loadreg(rs2[i]|64,th);
2251               }
2252               else{
2253                 emit_zeroreg(tl);
2254                 emit_zeroreg(th);
2255               }
2256             } else
2257             if(opcode2[i]==0x27) { // NOR
2258               if(rs1[i]){
2259                 if(s1l>=0) emit_not(s1l,tl);
2260                 else{
2261                   emit_loadreg(rs1[i],tl);
2262                   emit_not(tl,tl);
2263                 }
2264                 if(s1h>=0) emit_not(s1h,th);
2265                 else{
2266                   emit_loadreg(rs1[i]|64,th);
2267                   emit_not(th,th);
2268                 }
2269               }
2270               else
2271               if(rs2[i]){
2272                 if(s2l>=0) emit_not(s2l,tl);
2273                 else{
2274                   emit_loadreg(rs2[i],tl);
2275                   emit_not(tl,tl);
2276                 }
2277                 if(s2h>=0) emit_not(s2h,th);
2278                 else{
2279                   emit_loadreg(rs2[i]|64,th);
2280                   emit_not(th,th);
2281                 }
2282               }
2283               else {
2284                 emit_movimm(-1,tl);
2285                 emit_movimm(-1,th);
2286               }
2287             }
2288           }
2289         }
2290       }
2291       else
2292       {
2293         // 32 bit
2294         if(tl>=0) {
2295           s1l=get_reg(i_regs->regmap,rs1[i]);
2296           s2l=get_reg(i_regs->regmap,rs2[i]);
2297           if(rs1[i]&&rs2[i]) {
2298             assert(s1l>=0);
2299             assert(s2l>=0);
2300             if(opcode2[i]==0x24) { // AND
2301               emit_and(s1l,s2l,tl);
2302             } else
2303             if(opcode2[i]==0x25) { // OR
2304               emit_or(s1l,s2l,tl);
2305             } else
2306             if(opcode2[i]==0x26) { // XOR
2307               emit_xor(s1l,s2l,tl);
2308             } else
2309             if(opcode2[i]==0x27) { // NOR
2310               emit_or(s1l,s2l,tl);
2311               emit_not(tl,tl);
2312             }
2313           }
2314           else
2315           {
2316             if(opcode2[i]==0x24) { // AND
2317               emit_zeroreg(tl);
2318             } else
2319             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2320               if(rs1[i]){
2321                 if(s1l>=0) emit_mov(s1l,tl);
2322                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2323               }
2324               else
2325               if(rs2[i]){
2326                 if(s2l>=0) emit_mov(s2l,tl);
2327                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2328               }
2329               else emit_zeroreg(tl);
2330             } else
2331             if(opcode2[i]==0x27) { // NOR
2332               if(rs1[i]){
2333                 if(s1l>=0) emit_not(s1l,tl);
2334                 else {
2335                   emit_loadreg(rs1[i],tl);
2336                   emit_not(tl,tl);
2337                 }
2338               }
2339               else
2340               if(rs2[i]){
2341                 if(s2l>=0) emit_not(s2l,tl);
2342                 else {
2343                   emit_loadreg(rs2[i],tl);
2344                   emit_not(tl,tl);
2345                 }
2346               }
2347               else emit_movimm(-1,tl);
2348             }
2349           }
2350         }
2351       }
2352     }
2353   }
2354 }
2355
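// Emit host code for immediate-type instructions (LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI).  Where the source register
// held a known constant (wasconst), the result is folded at translation
// time with emit_movimm instead of computing it at run time.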
2356 void imm16_assemble(int i,struct regstat *i_regs)
2357 {
2358   if (opcode[i]==0x0f) { // LUI
2359     if(rt1[i]) {
2360       signed char t;
2361       t=get_reg(i_regs->regmap,rt1[i]);
2362       //assert(t>=0);
2363       if(t>=0) {
2364         if(!((i_regs->isconst>>t)&1))
2365           emit_movimm(imm[i]<<16,t);
2366       }
2367     }
2368   }
2369   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2370     if(rt1[i]) {
2371       signed char s,t;
2372       t=get_reg(i_regs->regmap,rt1[i]);
2373       s=get_reg(i_regs->regmap,rs1[i]);
2374       if(rs1[i]) {
2375         //assert(t>=0);
2376         //assert(s>=0);
2377         if(t>=0) {
2378           if(!((i_regs->isconst>>t)&1)) {
2379             if(s<0) {
2380               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2381               emit_addimm(t,imm[i],t);
2382             }else{
2383               if(!((i_regs->wasconst>>s)&1))
2384                 emit_addimm(s,imm[i],t);
2385               else
2386                 emit_movimm(constmap[i][s]+imm[i],t);
2387             }
2388           }
2389         }
2390       } else {
2391         if(t>=0) {
2392           if(!((i_regs->isconst>>t)&1))
2393             emit_movimm(imm[i],t);
2394         }
2395       }
2396     }
2397   }
2398   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2399     if(rt1[i]) {
2400       signed char sh,sl,th,tl;
2401       th=get_reg(i_regs->regmap,rt1[i]|64);
2402       tl=get_reg(i_regs->regmap,rt1[i]);
2403       sh=get_reg(i_regs->regmap,rs1[i]|64);
2404       sl=get_reg(i_regs->regmap,rs1[i]);
2405       if(tl>=0) {
2406         if(rs1[i]) {
2407           assert(sh>=0);
2408           assert(sl>=0);
2409           if(th>=0) {
2410             emit_addimm64_32(sh,sl,imm[i],th,tl);
2411           }
2412           else {
2413             emit_addimm(sl,imm[i],tl);
2414           }
2415         } else {
2416           emit_movimm(imm[i],tl);
2417           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2418         }
2419       }
2420     }
2421   }
2422   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2423     if(rt1[i]) {
2424       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2425       signed char sh,sl,t;
2426       t=get_reg(i_regs->regmap,rt1[i]);
2427       sh=get_reg(i_regs->regmap,rs1[i]|64);
2428       sl=get_reg(i_regs->regmap,rs1[i]);
2429       //assert(t>=0);
2430       if(t>=0) {
2431         if(rs1[i]>0) {
2432           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2433           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2434             if(opcode[i]==0x0a) { // SLTI
2435               if(sl<0) {
2436                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2437                 emit_slti32(t,imm[i],t);
2438               }else{
2439                 emit_slti32(sl,imm[i],t);
2440               }
2441             }
2442             else { // SLTIU
2443               if(sl<0) {
2444                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2445                 emit_sltiu32(t,imm[i],t);
2446               }else{
2447                 emit_sltiu32(sl,imm[i],t);
2448               }
2449             }
2450           }else{ // 64-bit
2451             assert(sl>=0);
2452             if(opcode[i]==0x0a) // SLTI
2453               emit_slti64_32(sh,sl,imm[i],t);
2454             else // SLTIU
2455               emit_sltiu64_32(sh,sl,imm[i],t);
2456           }
2457         }else{
2458           // SLTI(U) with r0 is just stupid,
2459           // nonetheless examples can be found
2460           if(opcode[i]==0x0a) // SLTI
2461             if(0<imm[i]) emit_movimm(1,t);
2462             else emit_zeroreg(t);
2463           else // SLTIU
2464           {
2465             if(imm[i]) emit_movimm(1,t);
2466             else emit_zeroreg(t);
2467           }
2468         }
2469       }
2470     }
2471   }
2472   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2473     if(rt1[i]) {
2474       signed char sh,sl,th,tl;
2475       th=get_reg(i_regs->regmap,rt1[i]|64);
2476       tl=get_reg(i_regs->regmap,rt1[i]);
2477       sh=get_reg(i_regs->regmap,rs1[i]|64);
2478       sl=get_reg(i_regs->regmap,rs1[i]);
2479       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2480         if(opcode[i]==0x0c) //ANDI
2481         {
2482           if(rs1[i]) {
2483             if(sl<0) {
2484               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2485               emit_andimm(tl,imm[i],tl);
2486             }else{
2487               if(!((i_regs->wasconst>>sl)&1))
2488                 emit_andimm(sl,imm[i],tl);
2489               else
2490                 emit_movimm(constmap[i][sl]&imm[i],tl);
2491             }
2492           }
2493           else
2494             emit_zeroreg(tl);
2495           if(th>=0) emit_zeroreg(th);
2496         }
2497         else
2498         {
2499           if(rs1[i]) {
2500             if(sl<0) {
2501               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2502             }
2503             if(th>=0) {
2504               if(sh<0) {
2505                 emit_loadreg(rs1[i]|64,th);
2506               }else{
2507                 emit_mov(sh,th);
2508               }
2509             }
2510             if(opcode[i]==0x0d) //ORI
2511               if(sl<0) {
2512                 emit_orimm(tl,imm[i],tl);
2513               }else{
2514                 if(!((i_regs->wasconst>>sl)&1))
2515                   emit_orimm(sl,imm[i],tl);
2516                 else
2517                   emit_movimm(constmap[i][sl]|imm[i],tl);
2518               }
2519             if(opcode[i]==0x0e) //XORI
2520               if(sl<0) {
2521                 emit_xorimm(tl,imm[i],tl);
2522               }else{
2523                 if(!((i_regs->wasconst>>sl)&1))
2524                   emit_xorimm(sl,imm[i],tl);
2525                 else
2526                   emit_movimm(constmap[i][sl]^imm[i],tl);
2527               }
2528           }
2529           else {
2530             emit_movimm(imm[i],tl);
2531             if(th>=0) emit_zeroreg(th);
2532           }
2533         }
2534       }
2535     }
2536   }
2537 }
2538
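// Emit host code for shift-by-immediate (SLL/SRL/SRA and the
// doubleword DSLL/DSRL/DSRA forms, including the "+32" variants).
// A shift amount of zero degenerates to a plain register move.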
2539 void shiftimm_assemble(int i,struct regstat *i_regs)
2540 {
2541   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2542   {
2543     if(rt1[i]) {
2544       signed char s,t;
2545       t=get_reg(i_regs->regmap,rt1[i]);
2546       s=get_reg(i_regs->regmap,rs1[i]);
2547       //assert(t>=0);
2548       if(t>=0){
2549         if(rs1[i]==0)
2550         {
2551           emit_zeroreg(t);
2552         }
2553         else
2554         {
2555           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2556           if(imm[i]) {
2557             if(opcode2[i]==0) // SLL
2558             {
2559               emit_shlimm(s<0?t:s,imm[i],t);
2560             }
2561             if(opcode2[i]==2) // SRL
2562             {
2563               emit_shrimm(s<0?t:s,imm[i],t);
2564             }
2565             if(opcode2[i]==3) // SRA
2566             {
2567               emit_sarimm(s<0?t:s,imm[i],t);
2568             }
2569           }else{
2570             // Shift by zero
2571             if(s>=0 && s!=t) emit_mov(s,t);
2572           }
2573         }
2574       }
2575       //emit_storereg(rt1[i],t); //DEBUG
2576     }
2577   }
2578   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2579   {
2580     if(rt1[i]) {
2581       signed char sh,sl,th,tl;
2582       th=get_reg(i_regs->regmap,rt1[i]|64);
2583       tl=get_reg(i_regs->regmap,rt1[i]);
2584       sh=get_reg(i_regs->regmap,rs1[i]|64);
2585       sl=get_reg(i_regs->regmap,rs1[i]);
2586       if(tl>=0) {
2587         if(rs1[i]==0)
2588         {
2589           emit_zeroreg(tl);
2590           if(th>=0) emit_zeroreg(th);
2591         }
2592         else
2593         {
2594           assert(sl>=0);
2595           assert(sh>=0);
2596           if(imm[i]) {
2597             if(opcode2[i]==0x38) // DSLL
2598             {
2599               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2600               emit_shlimm(sl,imm[i],tl);
2601             }
2602             if(opcode2[i]==0x3a) // DSRL
2603             {
2604               emit_shrdimm(sl,sh,imm[i],tl);
2605               if(th>=0) emit_shrimm(sh,imm[i],th);
2606             }
2607             if(opcode2[i]==0x3b) // DSRA
2608             {
2609               emit_shrdimm(sl,sh,imm[i],tl);
2610               if(th>=0) emit_sarimm(sh,imm[i],th);
2611             }
2612           }else{
2613             // Shift by zero
2614             if(sl!=tl) emit_mov(sl,tl);
2615             if(th>=0&&sh!=th) emit_mov(sh,th);
2616           }
2617         }
2618       }
2619     }
2620   }
2621   if(opcode2[i]==0x3c) // DSLL32
2622   {
2623     if(rt1[i]) {
2624       signed char sl,tl,th;
2625       tl=get_reg(i_regs->regmap,rt1[i]);
2626       th=get_reg(i_regs->regmap,rt1[i]|64);
2627       sl=get_reg(i_regs->regmap,rs1[i]);
2628       if(th>=0||tl>=0){
2629         assert(tl>=0);
2630         assert(th>=0);
2631         assert(sl>=0);
2632         emit_mov(sl,th);
2633         emit_zeroreg(tl);
2634         if(imm[i]>32)
2635         {
2636           emit_shlimm(th,imm[i]&31,th);
2637         }
2638       }
2639     }
2640   }
2641   if(opcode2[i]==0x3e) // DSRL32
2642   {
2643     if(rt1[i]) {
2644       signed char sh,tl,th;
2645       tl=get_reg(i_regs->regmap,rt1[i]);
2646       th=get_reg(i_regs->regmap,rt1[i]|64);
2647       sh=get_reg(i_regs->regmap,rs1[i]|64);
2648       if(tl>=0){
2649         assert(sh>=0);
2650         emit_mov(sh,tl);
2651         if(th>=0) emit_zeroreg(th);
2652         if(imm[i]>32)
2653         {
2654           emit_shrimm(tl,imm[i]&31,tl);
2655         }
2656       }
2657     }
2658   }
2659   if(opcode2[i]==0x3f) // DSRA32
2660   {
2661     if(rt1[i]) {
2662       signed char sh,tl;
2663       tl=get_reg(i_regs->regmap,rt1[i]);
2664       sh=get_reg(i_regs->regmap,rs1[i]|64);
2665       if(tl>=0){
2666         assert(sh>=0);
2667         emit_mov(sh,tl);
2668         if(imm[i]>32)
2669         {
2670           emit_sarimm(tl,imm[i]&31,tl);
2671         }
2672       }
2673     }
2674   }
2675 }
2676
2677 #ifndef shift_assemble
2678 void shift_assemble(int i,struct regstat *i_regs)
2679 {
2680   printf("Need shift_assemble for this architecture.\n");
2681   exit(1);
2682 }
2683 #endif
2684
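// Emit host code for aligned loads (LB/LBU/LH/LHU/LW/LWU/LD).  The
// fast path reads RDRAM directly after a range check (or a TLB lookup
// when using_tlb); the conditional branch is recorded with add_stub()
// so the slow-path handler can be generated afterwards, and constant
// addresses that are known not to hit RAM go through inline_readstub().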
2685 void load_assemble(int i,struct regstat *i_regs)
2686 {
2687   int s,th,tl,addr,map=-1;
2688   int offset;
2689   int jaddr=0;
2690   int memtarget,c=0;
2691   u_int hr,reglist=0;
2692   th=get_reg(i_regs->regmap,rt1[i]|64);
2693   tl=get_reg(i_regs->regmap,rt1[i]);
2694   s=get_reg(i_regs->regmap,rs1[i]);
2695   offset=imm[i];
2696   for(hr=0;hr<HOST_REGS;hr++) {
2697     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2698   }
2699   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2700   if(s>=0) {
2701     c=(i_regs->wasconst>>s)&1;
2702     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2703     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2704   }
2705   if(offset||s<0||c) addr=tl;
2706   else addr=s;
2707   //printf("load_assemble: c=%d\n",c);
2708   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2709   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2710   if(tl>=0) {
2711     //assert(tl>=0);
2712     //assert(rt1[i]);
2713     reglist&=~(1<<tl);
2714     if(th>=0) reglist&=~(1<<th);
2715     if(!using_tlb) {
2716       if(!c) {
2717 //#define R29_HACK 1
2718         #ifdef R29_HACK
2719         // Strmnnrmn's speed hack
2720         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2721         #endif
2722         {
2723           emit_cmpimm(addr,0x800000);
2724           jaddr=(int)out;
2725           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2726           // Hint to branch predictor that the branch is unlikely to be taken
2727           if(rs1[i]>=28)
2728             emit_jno_unlikely(0);
2729           else
2730           #endif
2731           emit_jno(0);
2732         }
2733       }
2734     }else{ // using tlb
2735       int x=0;
2736       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2737       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2738       map=get_reg(i_regs->regmap,TLREG);
2739       assert(map>=0);
2740       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2741       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2742     }
2743     if (opcode[i]==0x20) { // LB
2744       if(!c||memtarget) {
2745         #ifdef HOST_IMM_ADDR32
2746         if(c)
2747           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2748         else
2749         #endif
2750         {
2751           //emit_xorimm(addr,3,tl);
2752           //gen_tlb_addr_r(tl,map);
2753           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2754           int x=0;
2755 #ifdef BIG_ENDIAN_MIPS
2756           if(!c) emit_xorimm(addr,3,tl);
2757           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2758 #else
2759           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2760           else if (tl!=addr) emit_mov(addr,tl);
2761 #endif
2762           emit_movsbl_indexed_tlb(x,tl,map,tl);
2763         }
2764         if(jaddr)
2765           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2766       }
2767       else
2768         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2769     }
2770     if (opcode[i]==0x21) { // LH
2771       if(!c||memtarget) {
2772         #ifdef HOST_IMM_ADDR32
2773         if(c)
2774           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2775         else
2776         #endif
2777         {
2778           int x=0;
2779 #ifdef BIG_ENDIAN_MIPS
2780           if(!c) emit_xorimm(addr,2,tl);
2781           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2782 #else
2783           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2784           else if (tl!=addr) emit_mov(addr,tl);
2785 #endif
2786           //#ifdef
2787           //emit_movswl_indexed_tlb(x,tl,map,tl);
2788           //else
2789           if(map>=0) {
2790             gen_tlb_addr_r(tl,map);
2791             emit_movswl_indexed(x,tl,tl);
2792           }else
2793             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2794         }
2795         if(jaddr)
2796           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2797       }
2798       else
2799         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2800     }
2801     if (opcode[i]==0x23) { // LW
2802       if(!c||memtarget) {
2803         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2804         #ifdef HOST_IMM_ADDR32
2805         if(c)
2806           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2807         else
2808         #endif
2809         emit_readword_indexed_tlb(0,addr,map,tl);
2810         if(jaddr)
2811           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2812       }
2813       else
2814         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2815     }
2816     if (opcode[i]==0x24) { // LBU
2817       if(!c||memtarget) {
2818         #ifdef HOST_IMM_ADDR32
2819         if(c)
2820           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2821         else
2822         #endif
2823         {
2824           //emit_xorimm(addr,3,tl);
2825           //gen_tlb_addr_r(tl,map);
2826           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2827           int x=0;
2828 #ifdef BIG_ENDIAN_MIPS
2829           if(!c) emit_xorimm(addr,3,tl);
2830           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2831 #else
2832           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2833           else if (tl!=addr) emit_mov(addr,tl);
2834 #endif
2835           emit_movzbl_indexed_tlb(x,tl,map,tl);
2836         }
2837         if(jaddr)
2838           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2839       }
2840       else
2841         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2842     }
2843     if (opcode[i]==0x25) { // LHU
2844       if(!c||memtarget) {
2845         #ifdef HOST_IMM_ADDR32
2846         if(c)
2847           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2848         else
2849         #endif
2850         {
2851           int x=0;
2852 #ifdef BIG_ENDIAN_MIPS
2853           if(!c) emit_xorimm(addr,2,tl);
2854           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2855 #else
2856           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2857           else if (tl!=addr) emit_mov(addr,tl);
2858 #endif
2859           //#ifdef
2860           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2861           //#else
2862           if(map>=0) {
2863             gen_tlb_addr_r(tl,map);
2864             emit_movzwl_indexed(x,tl,tl);
2865           }else
2866             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2867           if(jaddr)
2868             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2869         }
2870       }
2871       else
2872         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2873     }
2874     if (opcode[i]==0x27) { // LWU
2875       assert(th>=0);
2876       if(!c||memtarget) {
2877         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2878         #ifdef HOST_IMM_ADDR32
2879         if(c)
2880           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2881         else
2882         #endif
2883         emit_readword_indexed_tlb(0,addr,map,tl);
2884         if(jaddr)
2885           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2886       }
2887       else {
2888         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2889       }
2890       emit_zeroreg(th);
2891     }
2892     if (opcode[i]==0x37) { // LD
2893       if(!c||memtarget) {
2894         //gen_tlb_addr_r(tl,map);
2895         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2896         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2897         #ifdef HOST_IMM_ADDR32
2898         if(c)
2899           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2900         else
2901         #endif
2902         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2903         if(jaddr)
2904           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2905       }
2906       else
2907         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2908     }
2909     //emit_storereg(rt1[i],tl); // DEBUG
2910   }
2911   //if(opcode[i]==0x23)
2912   //if(opcode[i]==0x24)
2913   //if(opcode[i]==0x23||opcode[i]==0x24)
2914   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2915   {
2916     //emit_pusha();
2917     save_regs(0x100f);
2918         emit_readword((int)&last_count,ECX);
2919         #ifdef __i386__
2920         if(get_reg(i_regs->regmap,CCREG)<0)
2921           emit_loadreg(CCREG,HOST_CCREG);
2922         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2923         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2924         emit_writeword(HOST_CCREG,(int)&Count);
2925         #endif
2926         #ifdef __arm__
2927         if(get_reg(i_regs->regmap,CCREG)<0)
2928           emit_loadreg(CCREG,0);
2929         else
2930           emit_mov(HOST_CCREG,0);
2931         emit_add(0,ECX,0);
2932         emit_addimm(0,2*ccadj[i],0);
2933         emit_writeword(0,(int)&Count);
2934         #endif
2935     emit_call((int)memdebug);
2936     //emit_popa();
2937     restore_regs(0x100f);
2938   }/**/
2939 }
2940
2941 #ifndef loadlr_assemble
2942 void loadlr_assemble(int i,struct regstat *i_regs)
2943 {
2944   printf("Need loadlr_assemble for this architecture.\n");
2945   exit(1);
2946 }
2947 #endif
2948
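// Emit host code for aligned stores (SB/SH/SW/SD).  Mirrors
// load_assemble: range check or TLB lookup, direct RDRAM write, stub
// for the slow path, plus an invalid_code check so that stores into
// already-translated pages can trigger invalidation (INVCODE_STUB).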
2949 void store_assemble(int i,struct regstat *i_regs)
2950 {
2951   int s,th,tl,map=-1;
2952   int addr,temp;
2953   int offset;
2954   int jaddr=0,jaddr2,type;
2955   int memtarget=0,c=0;
2956   int agr=AGEN1+(i&1);
2957   u_int hr,reglist=0;
2958   th=get_reg(i_regs->regmap,rs2[i]|64);
2959   tl=get_reg(i_regs->regmap,rs2[i]);
2960   s=get_reg(i_regs->regmap,rs1[i]);
2961   temp=get_reg(i_regs->regmap,agr);
2962   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2963   offset=imm[i];
2964   if(s>=0) {
2965     c=(i_regs->wasconst>>s)&1;
2966     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2967     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2968   }
2969   assert(tl>=0);
2970   assert(temp>=0);
2971   for(hr=0;hr<HOST_REGS;hr++) {
2972     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2973   }
2974   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2975   if(offset||s<0||c) addr=temp;
2976   else addr=s;
2977   if(!using_tlb) {
2978     if(!c) {
2979       #ifdef R29_HACK
2980       // Strmnnrmn's speed hack
2981       memtarget=1;
2982       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2983       #endif
2984       emit_cmpimm(addr,0x800000);
2985       #ifdef DESTRUCTIVE_SHIFT
2986       if(s==addr) emit_mov(s,temp);
2987       #endif
2988       #ifdef R29_HACK
2989       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2990       #endif
2991       {
2992         jaddr=(int)out;
2993         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2994         // Hint to branch predictor that the branch is unlikely to be taken
2995         if(rs1[i]>=28)
2996           emit_jno_unlikely(0);
2997         else
2998         #endif
2999         emit_jno(0);
3000       }
3001     }
3002   }else{ // using tlb
3003     int x=0;
3004     if (opcode[i]==0x28) x=3; // SB
3005     if (opcode[i]==0x29) x=2; // SH
3006     map=get_reg(i_regs->regmap,TLREG);
3007     assert(map>=0);
3008     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3009     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3010   }
3011
3012   if (opcode[i]==0x28) { // SB
3013     if(!c||memtarget) {
3014       int x=0;
3015 #ifdef BIG_ENDIAN_MIPS
3016       if(!c) emit_xorimm(addr,3,temp);
3017       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3018 #else
3019       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3020       else if (addr!=temp) emit_mov(addr,temp);
3021 #endif
3022       //gen_tlb_addr_w(temp,map);
3023       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3024       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
3025     }
3026     type=STOREB_STUB;
3027   }
3028   if (opcode[i]==0x29) { // SH
3029     if(!c||memtarget) {
3030       int x=0;
3031 #ifdef BIG_ENDIAN_MIPS
3032       if(!c) emit_xorimm(addr,2,temp);
3033       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3034 #else
3035       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3036       else if (addr!=temp) emit_mov(addr,temp);
3037 #endif
3038       //#ifdef
3039       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3040       //#else
3041       if(map>=0) {
3042         gen_tlb_addr_w(temp,map);
3043         emit_writehword_indexed(tl,x,temp);
3044       }else
3045         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3046     }
3047     type=STOREH_STUB;
3048   }
3049   if (opcode[i]==0x2B) { // SW
3050     if(!c||memtarget)
3051       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3052       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3053     type=STOREW_STUB;
3054   }
3055   if (opcode[i]==0x3F) { // SD
3056     if(!c||memtarget) {
3057       if(rs2[i]) {
3058         assert(th>=0);
3059         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3060         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3061         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3062       }else{
3063         // Store zero
3064         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3065         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3066         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3067       }
3068     }
3069     type=STORED_STUB;
3070   }
3071   if(!using_tlb&&(!c||memtarget))
3072     // addr could be a temp, make sure it survives STORE*_STUB
3073     reglist|=1<<addr;
3074   if(jaddr) {
3075     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3076   } else if(!memtarget) {
3077     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3078   }
3079   if(!using_tlb) {
3080     if(!c||memtarget) {
3081       #ifdef DESTRUCTIVE_SHIFT
3082       // The x86 shift operation is 'destructive'; it overwrites the
3083       // source register, so we need to make a copy first and use that.
3084       addr=temp;
3085       #endif
3086       #if defined(HOST_IMM8)
3087       int ir=get_reg(i_regs->regmap,INVCP);
3088       assert(ir>=0);
3089       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3090       #else
3091       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3092       #endif
3093       jaddr2=(int)out;
3094       emit_jne(0);
3095       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3096     }
3097   }
3098   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3099   //if(opcode[i]==0x2B || opcode[i]==0x28)
3100   //if(opcode[i]==0x2B || opcode[i]==0x29)
3101   //if(opcode[i]==0x2B)
3102   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3103   {
3104     //emit_pusha();
3105     save_regs(0x100f);
3106         emit_readword((int)&last_count,ECX);
3107         #ifdef __i386__
3108         if(get_reg(i_regs->regmap,CCREG)<0)
3109           emit_loadreg(CCREG,HOST_CCREG);
3110         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3111         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3112         emit_writeword(HOST_CCREG,(int)&Count);
3113         #endif
3114         #ifdef __arm__
3115         if(get_reg(i_regs->regmap,CCREG)<0)
3116           emit_loadreg(CCREG,0);
3117         else
3118           emit_mov(HOST_CCREG,0);
3119         emit_add(0,ECX,0);
3120         emit_addimm(0,2*ccadj[i],0);
3121         emit_writeword(0,(int)&Count);
3122         #endif
3123     emit_call((int)memdebug);
3124     //emit_popa();
3125     restore_regs(0x100f);
3126   }/**/
3127 }
3128
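// Emit host code for unaligned stores (SWL/SWR/SDL/SDR).  The low
// address bits are tested at run time to select one of four
// byte/halfword write sequences; SDL/SDR additionally write a second
// word through temp2.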
3129 void storelr_assemble(int i,struct regstat *i_regs)
3130 {
3131   int s,th,tl;
3132   int temp;
3133   int temp2;
3134   int offset;
3135   int jaddr=0,jaddr2;
3136   int case1,case2,case3;
3137   int done0,done1,done2;
3138   int memtarget,c=0;
3139   u_int hr,reglist=0;
3140   th=get_reg(i_regs->regmap,rs2[i]|64);
3141   tl=get_reg(i_regs->regmap,rs2[i]);
3142   s=get_reg(i_regs->regmap,rs1[i]);
3143   temp=get_reg(i_regs->regmap,-1);
3144   offset=imm[i];
3145   if(s>=0) {
3146     c=(i_regs->isconst>>s)&1;
3147     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3148     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3149   }
3150   assert(tl>=0);
3151   for(hr=0;hr<HOST_REGS;hr++) {
3152     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3153   }
3154   if(tl>=0) {
3155     assert(temp>=0);
3156     if(!using_tlb) {
3157       if(!c) {
3158         emit_cmpimm(s<0||offset?temp:s,0x800000);
3159         if(!offset&&s!=temp) emit_mov(s,temp);
3160         jaddr=(int)out;
3161         emit_jno(0);
3162       }
3163       else
3164       {
3165         if(!memtarget||!rs1[i]) {
3166           jaddr=(int)out;
3167           emit_jmp(0);
3168         }
3169       }
3170       if((u_int)rdram!=0x80000000) 
3171         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3172     }else{ // using tlb
3173       int map=get_reg(i_regs->regmap,TLREG);
3174       assert(map>=0);
3175       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3176       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3177       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3178       if(!jaddr&&!memtarget) {
3179         jaddr=(int)out;
3180         emit_jmp(0);
3181       }
3182       gen_tlb_addr_w(temp,map);
3183     }
3184
3185     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3186       temp2=get_reg(i_regs->regmap,FTEMP);
3187       if(!rs2[i]) temp2=th=tl;
3188     }
3189
3190 #ifndef BIG_ENDIAN_MIPS
3191     emit_xorimm(temp,3,temp);
3192 #endif
3193     emit_testimm(temp,2);
3194     case2=(int)out;
3195     emit_jne(0);
3196     emit_testimm(temp,1);
3197     case1=(int)out;
3198     emit_jne(0);
3199     // 0
3200     if (opcode[i]==0x2A) { // SWL
3201       emit_writeword_indexed(tl,0,temp);
3202     }
3203     if (opcode[i]==0x2E) { // SWR
3204       emit_writebyte_indexed(tl,3,temp);
3205     }
3206     if (opcode[i]==0x2C) { // SDL
3207       emit_writeword_indexed(th,0,temp);
3208       if(rs2[i]) emit_mov(tl,temp2);
3209     }
3210     if (opcode[i]==0x2D) { // SDR
3211       emit_writebyte_indexed(tl,3,temp);
3212       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3213     }
3214     done0=(int)out;
3215     emit_jmp(0);
3216     // 1
3217     set_jump_target(case1,(int)out);
3218     if (opcode[i]==0x2A) { // SWL
3219       // Write 3 msb into three least significant bytes
3220       if(rs2[i]) emit_rorimm(tl,8,tl);
3221       emit_writehword_indexed(tl,-1,temp);
3222       if(rs2[i]) emit_rorimm(tl,16,tl);
3223       emit_writebyte_indexed(tl,1,temp);
3224       if(rs2[i]) emit_rorimm(tl,8,tl);
3225     }
3226     if (opcode[i]==0x2E) { // SWR
3227       // Write two lsb into two most significant bytes
3228       emit_writehword_indexed(tl,1,temp);
3229     }
3230     if (opcode[i]==0x2C) { // SDL
3231       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3232       // Write 3 msb into three least significant bytes
3233       if(rs2[i]) emit_rorimm(th,8,th);
3234       emit_writehword_indexed(th,-1,temp);
3235       if(rs2[i]) emit_rorimm(th,16,th);
3236       emit_writebyte_indexed(th,1,temp);
3237       if(rs2[i]) emit_rorimm(th,8,th);
3238     }
3239     if (opcode[i]==0x2D) { // SDR
3240       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3241       // Write two lsb into two most significant bytes
3242       emit_writehword_indexed(tl,1,temp);
3243     }
3244     done1=(int)out;
3245     emit_jmp(0);
3246     // 2
3247     set_jump_target(case2,(int)out);
3248     emit_testimm(temp,1);
3249     case3=(int)out;
3250     emit_jne(0);
3251     if (opcode[i]==0x2A) { // SWL
3252       // Write two msb into two least significant bytes
3253       if(rs2[i]) emit_rorimm(tl,16,tl);
3254       emit_writehword_indexed(tl,-2,temp);
3255       if(rs2[i]) emit_rorimm(tl,16,tl);
3256     }
3257     if (opcode[i]==0x2E) { // SWR
3258       // Write 3 lsb into three most significant bytes
3259       emit_writebyte_indexed(tl,-1,temp);
3260       if(rs2[i]) emit_rorimm(tl,8,tl);
3261       emit_writehword_indexed(tl,0,temp);
3262       if(rs2[i]) emit_rorimm(tl,24,tl);
3263     }
3264     if (opcode[i]==0x2C) { // SDL
3265       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3266       // Write two msb into two least significant bytes
3267       if(rs2[i]) emit_rorimm(th,16,th);
3268       emit_writehword_indexed(th,-2,temp);
3269       if(rs2[i]) emit_rorimm(th,16,th);
3270     }
3271     if (opcode[i]==0x2D) { // SDR
3272       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3273       // Write 3 lsb into three most significant bytes
3274       emit_writebyte_indexed(tl,-1,temp);
3275       if(rs2[i]) emit_rorimm(tl,8,tl);
3276       emit_writehword_indexed(tl,0,temp);
3277       if(rs2[i]) emit_rorimm(tl,24,tl);
3278     }
3279     done2=(int)out;
3280     emit_jmp(0);
3281     // 3
3282     set_jump_target(case3,(int)out);
3283     if (opcode[i]==0x2A) { // SWL
3284       // Write msb into least significant byte
3285       if(rs2[i]) emit_rorimm(tl,24,tl);
3286       emit_writebyte_indexed(tl,-3,temp);
3287       if(rs2[i]) emit_rorimm(tl,8,tl);
3288     }
3289     if (opcode[i]==0x2E) { // SWR
3290       // Write entire word
3291       emit_writeword_indexed(tl,-3,temp);
3292     }
3293     if (opcode[i]==0x2C) { // SDL
3294       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3295       // Write msb into least significant byte
3296       if(rs2[i]) emit_rorimm(th,24,th);
3297       emit_writebyte_indexed(th,-3,temp);
3298       if(rs2[i]) emit_rorimm(th,8,th);
3299     }
3300     if (opcode[i]==0x2D) { // SDR
3301       if(rs2[i]) emit_mov(th,temp2);
3302       // Write entire word
3303       emit_writeword_indexed(tl,-3,temp);
3304     }
3305     set_jump_target(done0,(int)out);
3306     set_jump_target(done1,(int)out);
3307     set_jump_target(done2,(int)out);
3308     if (opcode[i]==0x2C) { // SDL
3309       emit_testimm(temp,4);
3310       done0=(int)out;
3311       emit_jne(0);
3312       emit_andimm(temp,~3,temp);
3313       emit_writeword_indexed(temp2,4,temp);
3314       set_jump_target(done0,(int)out);
3315     }
3316     if (opcode[i]==0x2D) { // SDR
3317       emit_testimm(temp,4);
3318       done0=(int)out;
3319       emit_jeq(0);
3320       emit_andimm(temp,~3,temp);
3321       emit_writeword_indexed(temp2,-4,temp);
3322       set_jump_target(done0,(int)out);
3323     }
3324     if(!c||!memtarget)
3325       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3326   }
3327   if(!using_tlb) {
3328     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3329     #if defined(HOST_IMM8)
3330     int ir=get_reg(i_regs->regmap,INVCP);
3331     assert(ir>=0);
3332     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3333     #else
3334     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3335     #endif
3336     jaddr2=(int)out;
3337     emit_jne(0);
3338     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3339   }
3340   /*
3341     emit_pusha();
3342     //save_regs(0x100f);
3343         emit_readword((int)&last_count,ECX);
3344         if(get_reg(i_regs->regmap,CCREG)<0)
3345           emit_loadreg(CCREG,HOST_CCREG);
3346         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3347         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3348         emit_writeword(HOST_CCREG,(int)&Count);
3349     emit_call((int)memdebug);
3350     emit_popa();
3351     //restore_regs(0x100f);
3352   /**/
3353 }
3354
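// Emit host code for COP1 loads/stores (LWC1/LDC1/SWC1/SDC1): check
// the COP1-usable bit (FP_STUB on failure), fetch the FPR pointer from
// reg_cop1_simple/reg_cop1_double, then do the memory access with the
// same fast-path/stub scheme as the integer loads and stores.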
3355 void c1ls_assemble(int i,struct regstat *i_regs)
3356 {
3357 #ifndef DISABLE_COP1
3358   int s,th,tl;
3359   int temp,ar;
3360   int map=-1;
3361   int offset;
3362   int c=0;
3363   int jaddr,jaddr2=0,jaddr3,type;
3364   int agr=AGEN1+(i&1);
3365   u_int hr,reglist=0;
3366   th=get_reg(i_regs->regmap,FTEMP|64);
3367   tl=get_reg(i_regs->regmap,FTEMP);
3368   s=get_reg(i_regs->regmap,rs1[i]);
3369   temp=get_reg(i_regs->regmap,agr);
3370   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3371   offset=imm[i];
3372   assert(tl>=0);
3373   assert(rs1[i]>0);
3374   assert(temp>=0);
3375   for(hr=0;hr<HOST_REGS;hr++) {
3376     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3377   }
3378   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3379   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3380   {
3381     // Loads use a temporary register which we need to save
3382     reglist|=1<<temp;
3383   }
3384   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3385     ar=temp;
3386   else // LWC1/LDC1
3387     ar=tl;
3388   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3389   //else c=(i_regs->wasconst>>s)&1;
3390   if(s>=0) c=(i_regs->wasconst>>s)&1;
3391   // Check cop1 unusable
3392   if(!cop1_usable) {
3393     signed char rs=get_reg(i_regs->regmap,CSREG);
3394     assert(rs>=0);
3395     emit_testimm(rs,0x20000000);
3396     jaddr=(int)out;
3397     emit_jeq(0);
3398     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3399     cop1_usable=1;
3400   }
3401   if (opcode[i]==0x39) { // SWC1 (get float address)
3402     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3403   }
3404   if (opcode[i]==0x3D) { // SDC1 (get double address)
3405     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3406   }
3407   // Generate address + offset
3408   if(!using_tlb) {
3409     if(!c)
3410       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3411   }
3412   else
3413   {
3414     map=get_reg(i_regs->regmap,TLREG);
3415     assert(map>=0);
3416     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3417       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3418     }
3419     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3420       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3421     }
3422   }
3423   if (opcode[i]==0x39) { // SWC1 (read float)
3424     emit_readword_indexed(0,tl,tl);
3425   }
3426   if (opcode[i]==0x3D) { // SDC1 (read double)
3427     emit_readword_indexed(4,tl,th);
3428     emit_readword_indexed(0,tl,tl);
3429   }
3430   if (opcode[i]==0x31) { // LWC1 (get target address)
3431     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3432   }
3433   if (opcode[i]==0x35) { // LDC1 (get target address)
3434     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3435   }
3436   if(!using_tlb) {
3437     if(!c) {
3438       jaddr2=(int)out;
3439       emit_jno(0);
3440     }
3441     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3442       jaddr2=(int)out;
3443       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3444     }
3445     #ifdef DESTRUCTIVE_SHIFT
3446     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3447       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3448     }
3449     #endif
3450   }else{
3451     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3452       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3453     }
3454     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3455       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3456     }
3457   }
3458   if (opcode[i]==0x31) { // LWC1
3459     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3460     //gen_tlb_addr_r(ar,map);
3461     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3462     #ifdef HOST_IMM_ADDR32
3463     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3464     else
3465     #endif
3466     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3467     type=LOADW_STUB;
3468   }
3469   if (opcode[i]==0x35) { // LDC1
3470     assert(th>=0);
3471     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3472     //gen_tlb_addr_r(ar,map);
3473     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3474     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3475     #ifdef HOST_IMM_ADDR32
3476     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3477     else
3478     #endif
3479     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3480     type=LOADD_STUB;
3481   }
3482   if (opcode[i]==0x39) { // SWC1
3483     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3484     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3485     type=STOREW_STUB;
3486   }
3487   if (opcode[i]==0x3D) { // SDC1
3488     assert(th>=0);
3489     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3490     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3491     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3492     type=STORED_STUB;
3493   }
3494   if(!using_tlb) {
3495     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3496       #ifndef DESTRUCTIVE_SHIFT
3497       temp=offset||c||s<0?ar:s;
3498       #endif
3499       #if defined(HOST_IMM8)
3500       int ir=get_reg(i_regs->regmap,INVCP);
3501       assert(ir>=0);
3502       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3503       #else
3504       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3505       #endif
3506       jaddr3=(int)out;
3507       emit_jne(0);
3508       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3509     }
3510   }
3511   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3512   if (opcode[i]==0x31) { // LWC1 (write float)
3513     emit_writeword_indexed(tl,0,temp);
3514   }
3515   if (opcode[i]==0x35) { // LDC1 (write double)
3516     emit_writeword_indexed(th,4,temp);
3517     emit_writeword_indexed(tl,0,temp);
3518   }
3519   //if(opcode[i]==0x39)
3520   /*if(opcode[i]==0x39||opcode[i]==0x31)
3521   {
3522     emit_pusha();
3523         emit_readword((int)&last_count,ECX);
3524         if(get_reg(i_regs->regmap,CCREG)<0)
3525           emit_loadreg(CCREG,HOST_CCREG);
3526         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3527         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3528         emit_writeword(HOST_CCREG,(int)&Count);
3529     emit_call((int)memdebug);
3530     emit_popa();
3531   }/**/
3532 #else
3533   cop1_unusable(i, i_regs);
3534 #endif
3535 }
3536
3537 #ifndef multdiv_assemble
3538 void multdiv_assemble(int i,struct regstat *i_regs)
3539 {
3540   printf("Need multdiv_assemble for this architecture.\n");
3541   exit(1);
3542 }
3543 #endif
3544
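// Assemble HI/LO moves (MFHI/MFLO/MTHI/MTLO): copy the 64-bit source into the
// destination register pair, loading halves from memory when they are not
// already cached in host registers.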
3545 void mov_assemble(int i,struct regstat *i_regs)
3546 {
3547   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3548   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3549   assert(rt1[i]>0);
3550   if(rt1[i]) {
3551     signed char sh,sl,th,tl;
3552     th=get_reg(i_regs->regmap,rt1[i]|64);
3553     tl=get_reg(i_regs->regmap,rt1[i]);
3554     //assert(tl>=0);
3555     if(tl>=0) {
3556       sh=get_reg(i_regs->regmap,rs1[i]|64);
3557       sl=get_reg(i_regs->regmap,rs1[i]);
3558       if(sl>=0) emit_mov(sl,tl);
3559       else emit_loadreg(rs1[i],tl);
3560       if(th>=0) {
3561         if(sh>=0) emit_mov(sh,th);
3562         else emit_loadreg(rs1[i]|64,th);
3563       }
3564     }
3565   }
3566 }
3567
3568 #ifndef fconv_assemble
3569 void fconv_assemble(int i,struct regstat *i_regs)
3570 {
3571   printf("Need fconv_assemble for this architecture.\n");
3572   exit(1);
3573 }
3574 #endif
3575
3576 #if 0
3577 void float_assemble(int i,struct regstat *i_regs)
3578 {
3579   printf("Need float_assemble for this architecture.\n");
3580   exit(1);
3581 }
3582 #endif
3583
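// Assemble SYSCALL: put the exception PC in EAX, add the accumulated cycle
// count, and tail-jump to the HLE syscall handler.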
3584 void syscall_assemble(int i,struct regstat *i_regs)
3585 {
3586   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3587   assert(ccreg==HOST_CCREG);
3588   assert(!is_delayslot);
3589   emit_movimm(start+i*4,EAX); // Get PC
3590   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3591   emit_jmp((int)jump_syscall_hle); // XXX
3592 }
3593
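// Assemble an HLE BIOS call: pass the return PC in host register 0 and the
// opcode word in register 1, charge the accumulated cycles, and tail-jump to
// jump_hlecall.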
3594 void hlecall_assemble(int i,struct regstat *i_regs)
3595 {
3596   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3597   assert(ccreg==HOST_CCREG);
3598   assert(!is_delayslot);
3599   emit_movimm(start+i*4+4,0); // Get PC
3600   emit_movimm(source[i],1); // opcode
3601   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3602   emit_jmp((int)jump_hlecall); // XXX
3603 }
3604
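// Assemble the instruction occupying a branch delay slot.  is_delayslot is
// set around the dispatch so the per-type assemblers know they are generating
// delay-slot code.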
3605 void ds_assemble(int i,struct regstat *i_regs)
3606 {
3607   is_delayslot=1;
3608   switch(itype[i]) {
3609     case ALU:
3610       alu_assemble(i,i_regs);break;
3611     case IMM16:
3612       imm16_assemble(i,i_regs);break;
3613     case SHIFT:
3614       shift_assemble(i,i_regs);break;
3615     case SHIFTIMM:
3616       shiftimm_assemble(i,i_regs);break;
3617     case LOAD:
3618       load_assemble(i,i_regs);break;
3619     case LOADLR:
3620       loadlr_assemble(i,i_regs);break;
3621     case STORE:
3622       store_assemble(i,i_regs);break;
3623     case STORELR:
3624       storelr_assemble(i,i_regs);break;
3625     case COP0:
3626       cop0_assemble(i,i_regs);break;
3627     case COP1:
3628       cop1_assemble(i,i_regs);break;
3629     case C1LS:
3630       c1ls_assemble(i,i_regs);break;
3631     case FCONV:
3632       fconv_assemble(i,i_regs);break;
3633     case FLOAT:
3634       float_assemble(i,i_regs);break;
3635     case FCOMP:
3636       fcomp_assemble(i,i_regs);break;
3637     case MULTDIV:
3638       multdiv_assemble(i,i_regs);break;
3639     case MOV:
3640       mov_assemble(i,i_regs);break;
3641     case SYSCALL:
3642     case HLECALL:
3643     case SPAN:
3644     case UJUMP:
3645     case RJUMP:
3646     case CJUMP:
3647     case SJUMP:
3648     case FJUMP:
3649       printf("Jump in the delay slot.  This is probably a bug.\n");
3650   }
3651   is_delayslot=0;
3652 }
3653
3654 // Is the branch target a valid internal jump?
3655 int internal_branch(uint64_t i_is32,int addr)
3656 {
3657   if(addr&1) return 0; // Indirect (register) jump
3658   if(addr>=start && addr<start+slen*4-4)
3659   {
3660     int t=(addr-start)>>2;
3661     // Delay slots are not valid branch targets
3662     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3663     // 64 -> 32 bit transition requires a recompile
3664     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3665     {
3666       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3667       else printf("optimizable: yes\n");
3668     }*/
3669     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3670     if(requires_32bit[t]&~i_is32) return 0;
3671     else return 1;
3672   }
3673   return 0;
3674 }
3675
3676 #ifndef wb_invalidate
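// Write back dirty guest registers whose host register is dropped between the
// pre and entry register maps (unless the value is unneeded), then copy any
// guest register that merely moved to a different host register.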
3677 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3678   uint64_t u,uint64_t uu)
3679 {
3680   int hr;
3681   for(hr=0;hr<HOST_REGS;hr++) {
3682     if(hr!=EXCLUDE_REG) {
3683       if(pre[hr]!=entry[hr]) {
3684         if(pre[hr]>=0) {
3685           if((dirty>>hr)&1) {
3686             if(get_reg(entry,pre[hr])<0) {
3687               if(pre[hr]<64) {
3688                 if(!((u>>pre[hr])&1)) {
3689                   emit_storereg(pre[hr],hr);
3690                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3691                     emit_sarimm(hr,31,hr);
3692                     emit_storereg(pre[hr]|64,hr);
3693                   }
3694                 }
3695               }else{
3696                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3697                   emit_storereg(pre[hr],hr);
3698                 }
3699               }
3700             }
3701           }
3702         }
3703       }
3704     }
3705   }
3706   // Move from one register to another (no writeback)
3707   for(hr=0;hr<HOST_REGS;hr++) {
3708     if(hr!=EXCLUDE_REG) {
3709       if(pre[hr]!=entry[hr]) {
3710         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3711           int nr;
3712           if((nr=get_reg(entry,pre[hr]))>=0) {
3713             emit_mov(hr,nr);
3714           }
3715         }
3716       }
3717     }
3718   }
3719 }
3720 #endif
3721
3722 // Load the specified registers
3723 // This only loads the registers given as arguments because
3724 // we don't want to load things that will be overwritten
3725 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3726 {
3727   int hr;
3728   // Load 32-bit regs
3729   for(hr=0;hr<HOST_REGS;hr++) {
3730     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3731       if(entry[hr]!=regmap[hr]) {
3732         if(regmap[hr]==rs1||regmap[hr]==rs2)
3733         {
3734           if(regmap[hr]==0) {
3735             emit_zeroreg(hr);
3736           }
3737           else
3738           {
3739             emit_loadreg(regmap[hr],hr);
3740           }
3741         }
3742       }
3743     }
3744   }
3745   // Load 64-bit regs
3746   for(hr=0;hr<HOST_REGS;hr++) {
3747     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3748       if(entry[hr]!=regmap[hr]) {
3749         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3750         {
3751           assert(regmap[hr]!=64);
3752           if((is32>>(regmap[hr]&63))&1) {
3753             int lr=get_reg(regmap,regmap[hr]-64);
3754             if(lr>=0)
3755               emit_sarimm(lr,31,hr);
3756             else
3757               emit_loadreg(regmap[hr],hr);
3758           }
3759           else
3760           {
3761             emit_loadreg(regmap[hr],hr);
3762           }
3763         }
3764       }
3765     }
3766   }
3767 }
3768
3769 // Load registers prior to the start of a loop
3770 // so that they are not loaded within the loop
3771 static void loop_preload(signed char pre[],signed char entry[])
3772 {
3773   int hr;
3774   for(hr=0;hr<HOST_REGS;hr++) {
3775     if(hr!=EXCLUDE_REG) {
3776       if(pre[hr]!=entry[hr]) {
3777         if(entry[hr]>=0) {
3778           if(get_reg(pre,entry[hr])<0) {
3779             assem_debug("loop preload:\n");
3780             //printf("loop preload: %d\n",hr);
3781             if(entry[hr]==0) {
3782               emit_zeroreg(hr);
3783             }
3784             else if(entry[hr]<TEMPREG)
3785             {
3786               emit_loadreg(entry[hr],hr);
3787             }
3788             else if(entry[hr]-64<TEMPREG)
3789             {
3790               emit_loadreg(entry[hr],hr);
3791             }
3792           }
3793         }
3794       }
3795     }
3796   }
3797 }
3798
3799 // Generate address for load/store instruction
3800 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3801 {
3802   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3803     int ra;
3804     int agr=AGEN1+(i&1);
3805     int mgr=MGEN1+(i&1);
3806     if(itype[i]==LOAD) {
3807       ra=get_reg(i_regs->regmap,rt1[i]);
3808       //if(rt1[i]) assert(ra>=0);
3809     }
3810     if(itype[i]==LOADLR) {
3811       ra=get_reg(i_regs->regmap,FTEMP);
3812     }
3813     if(itype[i]==STORE||itype[i]==STORELR) {
3814       ra=get_reg(i_regs->regmap,agr);
3815       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3816     }
3817     if(itype[i]==C1LS) {
3818       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3819         ra=get_reg(i_regs->regmap,FTEMP);
3820       else { // SWC1/SDC1
3821         ra=get_reg(i_regs->regmap,agr);
3822         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3823       }
3824     }
3825     int rs=get_reg(i_regs->regmap,rs1[i]);
3826     int rm=get_reg(i_regs->regmap,TLREG);
3827     if(ra>=0) {
3828       int offset=imm[i];
3829       int c=(i_regs->wasconst>>rs)&1;
3830       if(rs1[i]==0) {
3831         // Using r0 as a base address
3832         /*if(rm>=0) {
3833           if(!entry||entry[rm]!=mgr) {
3834             generate_map_const(offset,rm);
3835           } // else did it in the previous cycle
3836         }*/
3837         if(!entry||entry[ra]!=agr) {
3838           if (opcode[i]==0x22||opcode[i]==0x26) {
3839             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3840           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3841             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3842           }else{
3843             emit_movimm(offset,ra);
3844           }
3845         } // else did it in the previous cycle
3846       }
3847       else if(rs<0) {
3848         if(!entry||entry[ra]!=rs1[i])
3849           emit_loadreg(rs1[i],ra);
3850         //if(!entry||entry[ra]!=rs1[i])
3851         //  printf("poor load scheduling!\n");
3852       }
3853       else if(c) {
3854         if(rm>=0) {
3855           if(!entry||entry[rm]!=mgr) {
3856             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3857               // Stores to memory go through the mapper to detect self-modifying
3858               // code, loads don't.
3859               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3860                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3861                 generate_map_const(constmap[i][rs]+offset,rm);
3862             }else{
3863               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3864                 generate_map_const(constmap[i][rs]+offset,rm);
3865             }
3866           }
3867         }
3868         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3869           if(!entry||entry[ra]!=agr) {
3870             if (opcode[i]==0x22||opcode[i]==0x26) {
3871               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3872             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3873               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3874             }else{
3875               #ifdef HOST_IMM_ADDR32
3876               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3877                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3878               #endif
3879               emit_movimm(constmap[i][rs]+offset,ra);
3880             }
3881           } // else did it in the previous cycle
3882         } // else load_consts already did it
3883       }
3884       if(offset&&!c&&rs1[i]) {
3885         if(rs>=0) {
3886           emit_addimm(rs,offset,ra);
3887         }else{
3888           emit_addimm(ra,offset,ra);
3889         }
3890       }
3891     }
3892   }
3893   // Preload constants for next instruction
3894   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3895     int agr,ra;
3896     #ifndef HOST_IMM_ADDR32
3897     // Mapper entry
3898     agr=MGEN1+((i+1)&1);
3899     ra=get_reg(i_regs->regmap,agr);
3900     if(ra>=0) {
3901       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3902       int offset=imm[i+1];
3903       int c=(regs[i+1].wasconst>>rs)&1;
3904       if(c) {
3905         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3906           // Stores to memory go through the mapper to detect self-modifying
3907           // code, loads don't.
3908           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3909              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3910             generate_map_const(constmap[i+1][rs]+offset,ra);
3911         }else{
3912           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3913             generate_map_const(constmap[i+1][rs]+offset,ra);
3914         }
3915       }
3916       /*else if(rs1[i]==0) {
3917         generate_map_const(offset,ra);
3918       }*/
3919     }
3920     #endif
3921     // Actual address
3922     agr=AGEN1+((i+1)&1);
3923     ra=get_reg(i_regs->regmap,agr);
3924     if(ra>=0) {
3925       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3926       int offset=imm[i+1];
3927       int c=(regs[i+1].wasconst>>rs)&1;
3928       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3929         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3930           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3931         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3932           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3933         }else{
3934           #ifdef HOST_IMM_ADDR32
3935           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3936              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3937           #endif
3938           emit_movimm(constmap[i+1][rs]+offset,ra);
3939         }
3940       }
3941       else if(rs1[i+1]==0) {
3942         // Using r0 as a base address
3943         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3944           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3945         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3946           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3947         }else{
3948           emit_movimm(offset,ra);
3949         }
3950       }
3951     }
3952   }
3953 }
3954
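// Scan forward from instruction i while host register hr keeps the same
// constant, and return in *value the final value it must be loaded with
// (possibly a precomputed load address), so the constant is materialized only
// once.  Returns 0 if no load is needed.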
3955 int get_final_value(int hr, int i, int *value)
3956 {
3957   int reg=regs[i].regmap[hr];
3958   while(i<slen-1) {
3959     if(regs[i+1].regmap[hr]!=reg) break;
3960     if(!((regs[i+1].isconst>>hr)&1)) break;
3961     if(bt[i+1]) break;
3962     i++;
3963   }
3964   if(i<slen-1) {
3965     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3966       *value=constmap[i][hr];
3967       return 1;
3968     }
3969     if(!bt[i+1]) {
3970       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3971         // Load in delay slot, out-of-order execution
3972         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3973         {
3974           #ifdef HOST_IMM_ADDR32
3975           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3976           #endif
3977           // Precompute load address
3978           *value=constmap[i][hr]+imm[i+2];
3979           return 1;
3980         }
3981       }
3982       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3983       {
3984         #ifdef HOST_IMM_ADDR32
3985         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3986         #endif
3987         // Precompute load address
3988         *value=constmap[i][hr]+imm[i+1];
3989         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3990         return 1;
3991       }
3992     }
3993   }
3994   *value=constmap[i][hr];
3995   //printf("c=%x\n",(int)constmap[i][hr]);
3996   if(i==slen-1) return 1;
3997   if(reg<64) {
3998     return !((unneeded_reg[i+1]>>reg)&1);
3999   }else{
4000     return !((unneeded_reg_upper[i+1]>>reg)&1);
4001   }
4002 }
4003
4004 // Load registers with known constants
4005 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4006 {
4007   int hr;
4008   // Load 32-bit regs
4009   for(hr=0;hr<HOST_REGS;hr++) {
4010     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4011       //if(entry[hr]!=regmap[hr]) {
4012       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4013         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4014           int value;
4015           if(get_final_value(hr,i,&value)) {
4016             if(value==0) {
4017               emit_zeroreg(hr);
4018             }
4019             else {
4020               emit_movimm(value,hr);
4021             }
4022           }
4023         }
4024       }
4025     }
4026   }
4027   // Load 64-bit regs
4028   for(hr=0;hr<HOST_REGS;hr++) {
4029     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4030       //if(entry[hr]!=regmap[hr]) {
4031       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4032         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4033           if((is32>>(regmap[hr]&63))&1) {
4034             int lr=get_reg(regmap,regmap[hr]-64);
4035             assert(lr>=0);
4036             emit_sarimm(lr,31,hr);
4037           }
4038           else
4039           {
4040             int value;
4041             if(get_final_value(hr,i,&value)) {
4042               if(value==0) {
4043                 emit_zeroreg(hr);
4044               }
4045               else {
4046                 emit_movimm(value,hr);
4047               }
4048             }
4049           }
4050         }
4051       }
4052     }
4053   }
4054 }
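// Load every register that holds a known constant and is marked dirty.
// Unlike load_consts this uses the constant for this instruction directly
// instead of scanning ahead with get_final_value.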
4055 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4056 {
4057   int hr;
4058   // Load 32-bit regs
4059   for(hr=0;hr<HOST_REGS;hr++) {
4060     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4061       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4062         int value=constmap[i][hr];
4063         if(value==0) {
4064           emit_zeroreg(hr);
4065         }
4066         else {
4067           emit_movimm(value,hr);
4068         }
4069       }
4070     }
4071   }
4072   // Load 64-bit regs
4073   for(hr=0;hr<HOST_REGS;hr++) {
4074     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4075       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4076         if((is32>>(regmap[hr]&63))&1) {
4077           int lr=get_reg(regmap,regmap[hr]-64);
4078           assert(lr>=0);
4079           emit_sarimm(lr,31,hr);
4080         }
4081         else
4082         {
4083           int value=constmap[i][hr];
4084           if(value==0) {
4085             emit_zeroreg(hr);
4086           }
4087           else {
4088             emit_movimm(value,hr);
4089           }
4090         }
4091       }
4092     }
4093   }
4094 }
4095
4096 // Write out all dirty registers (except cycle count)
4097 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4098 {
4099   int hr;
4100   for(hr=0;hr<HOST_REGS;hr++) {
4101     if(hr!=EXCLUDE_REG) {
4102       if(i_regmap[hr]>0) {
4103         if(i_regmap[hr]!=CCREG) {
4104           if((i_dirty>>hr)&1) {
4105             if(i_regmap[hr]<64) {
4106               emit_storereg(i_regmap[hr],hr);
4107 #ifndef FORCE32
4108               if( ((i_is32>>i_regmap[hr])&1) ) {
4109                 #ifdef DESTRUCTIVE_WRITEBACK
4110                 emit_sarimm(hr,31,hr);
4111                 emit_storereg(i_regmap[hr]|64,hr);
4112                 #else
4113                 emit_sarimm(hr,31,HOST_TEMPREG);
4114                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4115                 #endif
4116               }
4117 #endif
4118             }else{
4119               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4120                 emit_storereg(i_regmap[hr],hr);
4121               }
4122             }
4123           }
4124         }
4125       }
4126     }
4127   }
4128 }
4129 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4130 // This writes the registers not written by store_regs_bt
4131 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4132 {
4133   int hr;
4134   int t=(addr-start)>>2;
4135   for(hr=0;hr<HOST_REGS;hr++) {
4136     if(hr!=EXCLUDE_REG) {
4137       if(i_regmap[hr]>0) {
4138         if(i_regmap[hr]!=CCREG) {
4139           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4140             if((i_dirty>>hr)&1) {
4141               if(i_regmap[hr]<64) {
4142                 emit_storereg(i_regmap[hr],hr);
4143 #ifndef FORCE32
4144                 if( ((i_is32>>i_regmap[hr])&1) ) {
4145                   #ifdef DESTRUCTIVE_WRITEBACK
4146                   emit_sarimm(hr,31,hr);
4147                   emit_storereg(i_regmap[hr]|64,hr);
4148                   #else
4149                   emit_sarimm(hr,31,HOST_TEMPREG);
4150                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4151                   #endif
4152                 }
4153 #endif
4154               }else{
4155                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4156                   emit_storereg(i_regmap[hr],hr);
4157                 }
4158               }
4159             }
4160           }
4161         }
4162       }
4163     }
4164   }
4165 }
4166
4167 // Load all registers (except cycle count)
4168 void load_all_regs(signed char i_regmap[])
4169 {
4170   int hr;
4171   for(hr=0;hr<HOST_REGS;hr++) {
4172     if(hr!=EXCLUDE_REG) {
4173       if(i_regmap[hr]==0) {
4174         emit_zeroreg(hr);
4175       }
4176       else
4177       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4178       {
4179         emit_loadreg(i_regmap[hr],hr);
4180       }
4181     }
4182   }
4183 }
4184
4185 // Load all current registers also needed by next instruction
4186 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4187 {
4188   int hr;
4189   for(hr=0;hr<HOST_REGS;hr++) {
4190     if(hr!=EXCLUDE_REG) {
4191       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4192         if(i_regmap[hr]==0) {
4193           emit_zeroreg(hr);
4194         }
4195         else
4196         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4197         {
4198           emit_loadreg(i_regmap[hr],hr);
4199         }
4200       }
4201     }
4202   }
4203 }
4204
4205 // Load all regs, storing cycle count if necessary
4206 void load_regs_entry(int t)
4207 {
4208   int hr;
4209   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4210   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4211   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4212     emit_storereg(CCREG,HOST_CCREG);
4213   }
4214   // Load 32-bit regs
4215   for(hr=0;hr<HOST_REGS;hr++) {
4216     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4217       if(regs[t].regmap_entry[hr]==0) {
4218         emit_zeroreg(hr);
4219       }
4220       else if(regs[t].regmap_entry[hr]!=CCREG)
4221       {
4222         emit_loadreg(regs[t].regmap_entry[hr],hr);
4223       }
4224     }
4225   }
4226   // Load 64-bit regs
4227   for(hr=0;hr<HOST_REGS;hr++) {
4228     if(regs[t].regmap_entry[hr]>=64) {
4229       assert(regs[t].regmap_entry[hr]!=64);
4230       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4231         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4232         if(lr<0) {
4233           emit_loadreg(regs[t].regmap_entry[hr],hr);
4234         }
4235         else
4236         {
4237           emit_sarimm(lr,31,hr);
4238         }
4239       }
4240       else
4241       {
4242         emit_loadreg(regs[t].regmap_entry[hr],hr);
4243       }
4244     }
4245   }
4246 }
4247
4248 // Store dirty registers prior to branch
4249 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4250 {
4251   if(internal_branch(i_is32,addr))
4252   {
4253     int t=(addr-start)>>2;
4254     int hr;
4255     for(hr=0;hr<HOST_REGS;hr++) {
4256       if(hr!=EXCLUDE_REG) {
4257         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4258           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4259             if((i_dirty>>hr)&1) {
4260               if(i_regmap[hr]<64) {
4261                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4262                   emit_storereg(i_regmap[hr],hr);
4263                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4264                     #ifdef DESTRUCTIVE_WRITEBACK
4265                     emit_sarimm(hr,31,hr);
4266                     emit_storereg(i_regmap[hr]|64,hr);
4267                     #else
4268                     emit_sarimm(hr,31,HOST_TEMPREG);
4269                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4270                     #endif
4271                   }
4272                 }
4273               }else{
4274                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4275                   emit_storereg(i_regmap[hr],hr);
4276                 }
4277               }
4278             }
4279           }
4280         }
4281       }
4282     }
4283   }
4284   else
4285   {
4286     // Branch out of this block, write out all dirty regs
4287     wb_dirtys(i_regmap,i_is32,i_dirty);
4288   }
4289 }
4290
4291 // Load all needed registers for branch target
4292 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4293 {
4294   //if(addr>=start && addr<(start+slen*4))
4295   if(internal_branch(i_is32,addr))
4296   {
4297     int t=(addr-start)>>2;
4298     int hr;
4299     // Store the cycle count before loading something else
4300     if(i_regmap[HOST_CCREG]!=CCREG) {
4301       assert(i_regmap[HOST_CCREG]==-1);
4302     }
4303     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4304       emit_storereg(CCREG,HOST_CCREG);
4305     }
4306     // Load 32-bit regs
4307     for(hr=0;hr<HOST_REGS;hr++) {
4308       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4309         #ifdef DESTRUCTIVE_WRITEBACK
4310         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4311         #else
4312         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4313         #endif
4314           if(regs[t].regmap_entry[hr]==0) {
4315             emit_zeroreg(hr);
4316           }
4317           else if(regs[t].regmap_entry[hr]!=CCREG)
4318           {
4319             emit_loadreg(regs[t].regmap_entry[hr],hr);
4320           }
4321         }
4322       }
4323     }
4324     // Load 64-bit regs
4325     for(hr=0;hr<HOST_REGS;hr++) {
4326       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4327         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4328           assert(regs[t].regmap_entry[hr]!=64);
4329           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4330             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4331             if(lr<0) {
4332               emit_loadreg(regs[t].regmap_entry[hr],hr);
4333             }
4334             else
4335             {
4336               emit_sarimm(lr,31,hr);
4337             }
4338           }
4339           else
4340           {
4341             emit_loadreg(regs[t].regmap_entry[hr],hr);
4342           }
4343         }
4344         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4345           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4346           assert(lr>=0);
4347           emit_sarimm(lr,31,hr);
4348         }
4349       }
4350     }
4351   }
4352 }
4353
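// Check whether the register state described by i_regmap/i_is32/i_dirty is
// compatible with what the branch target at addr expects, so the branch can
// jump straight to already-compiled code without extra writeback.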
4354 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4355 {
4356   if(addr>=start && addr<start+slen*4-4)
4357   {
4358     int t=(addr-start)>>2;
4359     int hr;
4360     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4361     for(hr=0;hr<HOST_REGS;hr++)
4362     {
4363       if(hr!=EXCLUDE_REG)
4364       {
4365         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4366         {
4367           if(regs[t].regmap_entry[hr]!=-1)
4368           {
4369             return 0;
4370           }
4371           else 
4372           if((i_dirty>>hr)&1)
4373           {
4374             if(i_regmap[hr]<64)
4375             {
4376               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4377                 return 0;
4378             }
4379             else
4380             {
4381               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4382                 return 0;
4383             }
4384           }
4385         }
4386         else // Same register but is it 32-bit or dirty?
4387         if(i_regmap[hr]>=0)
4388         {
4389           if(!((regs[t].dirty>>hr)&1))
4390           {
4391             if((i_dirty>>hr)&1)
4392             {
4393               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4394               {
4395                 //printf("%x: dirty no match\n",addr);
4396                 return 0;
4397               }
4398             }
4399           }
4400           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4401           {
4402             //printf("%x: is32 no match\n",addr);
4403             return 0;
4404           }
4405         }
4406       }
4407     }
4408     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4409     if(requires_32bit[t]&~i_is32) return 0;
4410     // Delay slots are not valid branch targets
4411     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4412     // Delay slots require additional processing, so do not match
4413     if(is_ds[t]) return 0;
4414   }
4415   else
4416   {
4417     int hr;
4418     for(hr=0;hr<HOST_REGS;hr++)
4419     {
4420       if(hr!=EXCLUDE_REG)
4421       {
4422         if(i_regmap[hr]>=0)
4423         {
4424           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4425           {
4426             if((i_dirty>>hr)&1)
4427             {
4428               return 0;
4429             }
4430           }
4431         }
4432       }
4433     }
4434   }
4435   return 1;
4436 }
4437
4438 // Used when a branch jumps into the delay slot of another branch
4439 void ds_assemble_entry(int i)
4440 {
4441   int t=(ba[i]-start)>>2;
4442   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4443   assem_debug("Assemble delay slot at %x\n",ba[i]);
4444   assem_debug("<->\n");
4445   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4446     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4447   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4448   address_generation(t,&regs[t],regs[t].regmap_entry);
4449   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4450     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4451   cop1_usable=0;
4452   is_delayslot=0;
4453   switch(itype[t]) {
4454     case ALU:
4455       alu_assemble(t,&regs[t]);break;
4456     case IMM16:
4457       imm16_assemble(t,&regs[t]);break;
4458     case SHIFT:
4459       shift_assemble(t,&regs[t]);break;
4460     case SHIFTIMM:
4461       shiftimm_assemble(t,&regs[t]);break;
4462     case LOAD:
4463       load_assemble(t,&regs[t]);break;
4464     case LOADLR:
4465       loadlr_assemble(t,&regs[t]);break;
4466     case STORE:
4467       store_assemble(t,&regs[t]);break;
4468     case STORELR:
4469       storelr_assemble(t,&regs[t]);break;
4470     case COP0:
4471       cop0_assemble(t,&regs[t]);break;
4472     case COP1:
4473       cop1_assemble(t,&regs[t]);break;
4474     case C1LS:
4475       c1ls_assemble(t,&regs[t]);break;
4476     case FCONV:
4477       fconv_assemble(t,&regs[t]);break;
4478     case FLOAT:
4479       float_assemble(t,&regs[t]);break;
4480     case FCOMP:
4481       fcomp_assemble(t,&regs[t]);break;
4482     case MULTDIV:
4483       multdiv_assemble(t,&regs[t]);break;
4484     case MOV:
4485       mov_assemble(t,&regs[t]);break;
4486     case SYSCALL:
4487     case HLECALL:
4488     case SPAN:
4489     case UJUMP:
4490     case RJUMP:
4491     case CJUMP:
4492     case SJUMP:
4493     case FJUMP:
4494       printf("Jump in the delay slot.  This is probably a bug.\n");
4495   }
4496   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4497   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4498   if(internal_branch(regs[t].is32,ba[i]+4))
4499     assem_debug("branch: internal\n");
4500   else
4501     assem_debug("branch: external\n");
4502   assert(internal_branch(regs[t].is32,ba[i]+4));
4503   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4504   emit_jmp(0);
4505 }
4506
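// Emit the cycle-count check for a branch: work out the cycle adjustment for
// the target, special-case tight idle loops, and register a CC_STUB that
// drops into the interrupt handler when the counter runs out.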
4507 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4508 {
4509   int count;
4510   int jaddr;
4511   int idle=0;
4512   if(itype[i]==RJUMP)
4513   {
4514     *adj=0;
4515   }
4516   //if(ba[i]>=start && ba[i]<(start+slen*4))
4517   if(internal_branch(branch_regs[i].is32,ba[i]))
4518   {
4519     int t=(ba[i]-start)>>2;
4520     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4521     else *adj=ccadj[t];
4522   }
4523   else
4524   {
4525     *adj=0;
4526   }
4527   count=ccadj[i];
4528   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4529     // Idle loop
4530     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4531     idle=(int)out;
4532     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4533     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4534     jaddr=(int)out;
4535     emit_jmp(0);
4536   }
4537   else if(*adj==0||invert) {
4538     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4539     jaddr=(int)out;
4540     emit_jns(0);
4541   }
4542   else
4543   {
4544     emit_cmpimm(HOST_CCREG,-2*(count+2));
4545     jaddr=(int)out;
4546     emit_jns(0);
4547   }
4548   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4549 }
4550
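// Out-of-line cycle-count stub: write back dirty registers, determine the
// return PC (which for conditional branches depends on the branch outcome),
// call cc_interrupt, then reload registers and return to the compiled code.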
4551 void do_ccstub(int n)
4552 {
4553   literal_pool(256);
4554   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4555   set_jump_target(stubs[n][1],(int)out);
4556   int i=stubs[n][4];
4557   if(stubs[n][6]==NULLDS) {
4558     // Delay slot instruction is nullified ("likely" branch)
4559     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4560   }
4561   else if(stubs[n][6]!=TAKEN) {
4562     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4563   }
4564   else {
4565     if(internal_branch(branch_regs[i].is32,ba[i]))
4566       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4567   }
4568   if(stubs[n][5]!=-1)
4569   {
4570     // Save PC as return address
4571     emit_movimm(stubs[n][5],EAX);
4572     emit_writeword(EAX,(int)&pcaddr);
4573   }
4574   else
4575   {
4576     // Return address depends on which way the branch goes
4577     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4578     {
4579       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4580       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4581       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4582       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4583       if(rs1[i]==0)
4584       {
4585         s1l=s2l;s1h=s2h;
4586         s2l=s2h=-1;
4587       }
4588       else if(rs2[i]==0)
4589       {
4590         s2l=s2h=-1;
4591       }
4592       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4593         s1h=s2h=-1;
4594       }
4595       assert(s1l>=0);
4596       #ifdef DESTRUCTIVE_WRITEBACK
4597       if(rs1[i]) {
4598         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4599           emit_loadreg(rs1[i],s1l);
4600       } 
4601       else {
4602         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4603           emit_loadreg(rs2[i],s1l);
4604       }
4605       if(s2l>=0)
4606         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4607           emit_loadreg(rs2[i],s2l);
4608       #endif
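      // Pick host registers that are not CCREG and do not hold the branch
      // operands to build the candidate target addresses in.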
4609       int hr=0;
4610       int addr,alt,ntaddr;
4611       while(hr<HOST_REGS)
4612       {
4613         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4614            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4615            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4616         {
4617           addr=hr++;break;
4618         }
4619         hr++;
4620       }
4621       while(hr<HOST_REGS)
4622       {
4623         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4624            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4625            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4626         {
4627           alt=hr++;break;
4628         }
4629         hr++;
4630       }
4631       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4632       {
4633         while(hr<HOST_REGS)
4634         {
4635           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4636              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4637              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4638           {
4639             ntaddr=hr;break;
4640           }
4641           hr++;
4642         }
4643         assert(hr<HOST_REGS);
4644       }
4645       if((opcode[i]&0x2f)==4) // BEQ
4646       {
4647         #ifdef HAVE_CMOV_IMM
4648         if(s1h<0) {
4649           if(s2l>=0) emit_cmp(s1l,s2l);
4650           else emit_test(s1l,s1l);
4651           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4652         }
4653         else
4654         #endif
4655         {
4656           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4657           if(s1h>=0) {
4658             if(s2h>=0) emit_cmp(s1h,s2h);
4659             else emit_test(s1h,s1h);
4660             emit_cmovne_reg(alt,addr);
4661           }
4662           if(s2l>=0) emit_cmp(s1l,s2l);
4663           else emit_test(s1l,s1l);
4664           emit_cmovne_reg(alt,addr);
4665         }
4666       }
4667       if((opcode[i]&0x2f)==5) // BNE
4668       {
4669         #ifdef HAVE_CMOV_IMM
4670         if(s1h<0) {
4671           if(s2l>=0) emit_cmp(s1l,s2l);
4672           else emit_test(s1l,s1l);
4673           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4674         }
4675         else
4676         #endif
4677         {
4678           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4679           if(s1h>=0) {
4680             if(s2h>=0) emit_cmp(s1h,s2h);
4681             else emit_test(s1h,s1h);
4682             emit_cmovne_reg(alt,addr);
4683           }
4684           if(s2l>=0) emit_cmp(s1l,s2l);
4685           else emit_test(s1l,s1l);
4686           emit_cmovne_reg(alt,addr);
4687         }
4688       }
4689       if((opcode[i]&0x2f)==6) // BLEZ
4690       {
4691         //emit_movimm(ba[i],alt);
4692         //emit_movimm(start+i*4+8,addr);
4693         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4694         emit_cmpimm(s1l,1);
4695         if(s1h>=0) emit_mov(addr,ntaddr);
4696         emit_cmovl_reg(alt,addr);
4697         if(s1h>=0) {
4698           emit_test(s1h,s1h);
4699           emit_cmovne_reg(ntaddr,addr);
4700           emit_cmovs_reg(alt,addr);
4701         }
4702       }
4703       if((opcode[i]&0x2f)==7) // BGTZ
4704       {
4705         //emit_movimm(ba[i],addr);
4706         //emit_movimm(start+i*4+8,ntaddr);
4707         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4708         emit_cmpimm(s1l,1);
4709         if(s1h>=0) emit_mov(addr,alt);
4710         emit_cmovl_reg(ntaddr,addr);
4711         if(s1h>=0) {
4712           emit_test(s1h,s1h);
4713           emit_cmovne_reg(alt,addr);
4714           emit_cmovs_reg(ntaddr,addr);
4715         }
4716       }
4717       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4718       {
4719         //emit_movimm(ba[i],alt);
4720         //emit_movimm(start+i*4+8,addr);
4721         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4722         if(s1h>=0) emit_test(s1h,s1h);
4723         else emit_test(s1l,s1l);
4724         emit_cmovs_reg(alt,addr);
4725       }
4726       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4727       {
4728         //emit_movimm(ba[i],addr);
4729         //emit_movimm(start+i*4+8,alt);
4730         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4731         if(s1h>=0) emit_test(s1h,s1h);
4732         else emit_test(s1l,s1l);
4733         emit_cmovs_reg(alt,addr);
4734       }
4735       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4736         if(source[i]&0x10000) // BC1T
4737         {
4738           //emit_movimm(ba[i],alt);
4739           //emit_movimm(start+i*4+8,addr);
4740           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4741           emit_testimm(s1l,0x800000);
4742           emit_cmovne_reg(alt,addr);
4743         }
4744         else // BC1F
4745         {
4746           //emit_movimm(ba[i],addr);
4747           //emit_movimm(start+i*4+8,alt);
4748           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4749           emit_testimm(s1l,0x800000);
4750           emit_cmovne_reg(alt,addr);
4751         }
4752       }
4753       emit_writeword(addr,(int)&pcaddr);
4754     }
4755     else
4756     if(itype[i]==RJUMP)
4757     {
4758       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4759       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4760         r=get_reg(branch_regs[i].regmap,RTEMP);
4761       }
4762       emit_writeword(r,(int)&pcaddr);
4763     }
4764     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4765   }
4766   // Update cycle count
4767   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4768   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4769   emit_call((int)cc_interrupt);
4770   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4771   if(stubs[n][6]==TAKEN) {
4772     if(internal_branch(branch_regs[i].is32,ba[i]))
4773       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4774     else if(itype[i]==RJUMP) {
4775       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4776         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4777       else
4778         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4779     }
4780   }else if(stubs[n][6]==NOTTAKEN) {
4781     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4782     else load_all_regs(branch_regs[i].regmap);
4783   }else if(stubs[n][6]==NULLDS) {
4784     // Delay slot instruction is nullified ("likely" branch)
4785     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4786     else load_all_regs(regs[i].regmap);
4787   }else{
4788     load_all_regs(branch_regs[i].regmap);
4789   }
4790   emit_jmp(stubs[n][2]); // return address
4791   
4792   /* This works but uses a lot of memory...
4793   emit_readword((int)&last_count,ECX);
4794   emit_add(HOST_CCREG,ECX,EAX);
4795   emit_writeword(EAX,(int)&Count);
4796   emit_call((int)gen_interupt);
4797   emit_readword((int)&Count,HOST_CCREG);
4798   emit_readword((int)&next_interupt,EAX);
4799   emit_readword((int)&pending_exception,EBX);
4800   emit_writeword(EAX,(int)&last_count);
4801   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4802   emit_test(EBX,EBX);
4803   int jne_instr=(int)out;
4804   emit_jne(0);
4805   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4806   load_all_regs(branch_regs[i].regmap);
4807   emit_jmp(stubs[n][2]); // return address
4808   set_jump_target(jne_instr,(int)out);
4809   emit_readword((int)&pcaddr,EAX);
4810   // Call get_addr_ht instead of doing the hash table here.
4811   // This code is executed infrequently and takes up a lot of space
4812   // so smaller is better.
4813   emit_storereg(CCREG,HOST_CCREG);
4814   emit_pushreg(EAX);
4815   emit_call((int)get_addr_ht);
4816   emit_loadreg(CCREG,HOST_CCREG);
4817   emit_addimm(ESP,4,ESP);
4818   emit_jmpreg(EAX);*/
4819 }
4820
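// Queue a branch for the linker: addr is the patch site in the output buffer
// and target is the guest address; callers pass the internal_branch() result
// as the third argument.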
4821 void add_to_linker(int addr,int target,int ext)
4822 {
4823   link_addr[linkcount][0]=addr;
4824   link_addr[linkcount][1]=target;
4825   link_addr[linkcount][2]=ext;  
4826   linkcount++;
4827 }
4828
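// Assemble an unconditional jump (J/JAL): assemble the delay slot first,
// write the return address for JAL, flush registers for the target, update
// the cycle count, then either fall into the target's delay-slot entry or
// emit a jump to be patched by the linker.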
4829 void ujump_assemble(int i,struct regstat *i_regs)
4830 {
4831   signed char *i_regmap=i_regs->regmap;
4832   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4833   address_generation(i+1,i_regs,regs[i].regmap_entry);
4834   #ifdef REG_PREFETCH
4835   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4836   if(rt1[i]==31&&temp>=0) 
4837   {
4838     int return_address=start+i*4+8;
4839     if(get_reg(branch_regs[i].regmap,31)>0) 
4840     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4841   }
4842   #endif
4843   ds_assemble(i+1,i_regs);
4844   uint64_t bc_unneeded=branch_regs[i].u;
4845   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4846   bc_unneeded|=1|(1LL<<rt1[i]);
4847   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4848   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4849                 bc_unneeded,bc_unneeded_upper);
4850   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4851   if(rt1[i]==31) {
4852     int rt;
4853     unsigned int return_address;
4854     assert(rt1[i+1]!=31);
4855     assert(rt2[i+1]!=31);
4856     rt=get_reg(branch_regs[i].regmap,31);
4857     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4858     //assert(rt>=0);
4859     return_address=start+i*4+8;
4860     if(rt>=0) {
4861       #ifdef USE_MINI_HT
4862       if(internal_branch(branch_regs[i].is32,return_address)) {
4863         int temp=rt+1;
4864         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4865            branch_regs[i].regmap[temp]>=0)
4866         {
4867           temp=get_reg(branch_regs[i].regmap,-1);
4868         }
4869         #ifdef HOST_TEMPREG
4870         if(temp<0) temp=HOST_TEMPREG;
4871         #endif
4872         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4873         else emit_movimm(return_address,rt);
4874       }
4875       else
4876       #endif
4877       {
4878         #ifdef REG_PREFETCH
4879         if(temp>=0) 
4880         {
4881           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4882         }
4883         #endif
4884         emit_movimm(return_address,rt); // PC into link register
4885         #ifdef IMM_PREFETCH
4886         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4887         #endif
4888       }
4889     }
4890   }
4891   int cc,adj;
4892   cc=get_reg(branch_regs[i].regmap,CCREG);
4893   assert(cc==HOST_CCREG);
4894   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4895   #ifdef REG_PREFETCH
4896   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4897   #endif
4898   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4899   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4900   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4901   if(internal_branch(branch_regs[i].is32,ba[i]))
4902     assem_debug("branch: internal\n");
4903   else
4904     assem_debug("branch: external\n");
4905   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4906     ds_assemble_entry(i);
4907   }
4908   else {
4909     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4910     emit_jmp(0);
4911   }
4912 }
4913
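// Assemble a register jump (JR/JALR): copy the target register if the delay
// slot overwrites it, assemble the delay slot, set the link register for
// JALR, then dispatch through jump_vaddr (or the mini hash table for jr $ra).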
4914 void rjump_assemble(int i,struct regstat *i_regs)
4915 {
4916   signed char *i_regmap=i_regs->regmap;
4917   int temp;
4918   int rs,cc,adj;
4919   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4920   assert(rs>=0);
4921   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4922     // Delay slot abuse, make a copy of the branch address register
4923     temp=get_reg(branch_regs[i].regmap,RTEMP);
4924     assert(temp>=0);
4925     assert(regs[i].regmap[temp]==RTEMP);
4926     emit_mov(rs,temp);
4927     rs=temp;
4928   }
4929   address_generation(i+1,i_regs,regs[i].regmap_entry);
4930   #ifdef REG_PREFETCH
4931   if(rt1[i]==31) 
4932   {
4933     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4934       int return_address=start+i*4+8;
4935       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4936     }
4937   }
4938   #endif
4939   #ifdef USE_MINI_HT
4940   if(rs1[i]==31) {
4941     int rh=get_reg(regs[i].regmap,RHASH);
4942     if(rh>=0) do_preload_rhash(rh);
4943   }
4944   #endif
4945   ds_assemble(i+1,i_regs);
4946   uint64_t bc_unneeded=branch_regs[i].u;
4947   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4948   bc_unneeded|=1|(1LL<<rt1[i]);
4949   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4950   bc_unneeded&=~(1LL<<rs1[i]);
4951   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4952                 bc_unneeded,bc_unneeded_upper);
4953   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4954   if(rt1[i]==31) {
4955     int rt,return_address;
4956     assert(rt1[i+1]!=31);
4957     assert(rt2[i+1]!=31);
4958     rt=get_reg(branch_regs[i].regmap,31);
4959     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4960     assert(rt>=0);
4961     return_address=start+i*4+8;
4962     #ifdef REG_PREFETCH
4963     if(temp>=0) 
4964     {
4965       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4966     }
4967     #endif
4968     emit_movimm(return_address,rt); // PC into link register
4969     #ifdef IMM_PREFETCH
4970     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4971     #endif
4972   }
4973   cc=get_reg(branch_regs[i].regmap,CCREG);
4974   assert(cc==HOST_CCREG);
4975   #ifdef USE_MINI_HT
4976   int rh=get_reg(branch_regs[i].regmap,RHASH);
4977   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4978   if(rs1[i]==31) {
4979     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4980     do_preload_rhtbl(ht);
4981     do_rhash(rs,rh);
4982   }
4983   #endif
4984   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4985   #ifdef DESTRUCTIVE_WRITEBACK
4986   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4987     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4988       emit_loadreg(rs1[i],rs);
4989     }
4990   }
4991   #endif
4992   #ifdef REG_PREFETCH
4993   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4994   #endif
4995   #ifdef USE_MINI_HT
4996   if(rs1[i]==31) {
4997     do_miniht_load(ht,rh);
4998   }
4999   #endif
5000   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5001   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5002   //assert(adj==0);
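       // HOST_CCREG holds the cycle count relative to the next scheduled event
       // (negative until one is due).  Add this block's cycles (the +2 covers
       // the branch and its delay slot); the jns below is later patched to a
       // CC_STUB that services the event once the count goes non-negative and
       // then dispatches to the jump target.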
5003   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5004   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5005   emit_jns(0);
5006   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5007   #ifdef USE_MINI_HT
5008   if(rs1[i]==31) {
5009     do_miniht_jump(rs,rh,ht);
5010   }
5011   else
5012   #endif
5013   {
5014     //if(rs!=EAX) emit_mov(rs,EAX);
5015     //emit_jmp((int)jump_vaddr_eax);
5016     emit_jmp(jump_vaddr_reg[rs]);
5017   }
5018   /* Check hash table
5019   temp=!rs;
5020   emit_mov(rs,temp);
5021   emit_shrimm(rs,16,rs);
5022   emit_xor(temp,rs,rs);
5023   emit_movzwl_reg(rs,rs);
5024   emit_shlimm(rs,4,rs);
5025   emit_cmpmem_indexed((int)hash_table,rs,temp);
5026   emit_jne((int)out+14);
5027   emit_readword_indexed((int)hash_table+4,rs,rs);
5028   emit_jmpreg(rs);
5029   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5030   emit_addimm_no_flags(8,rs);
5031   emit_jeq((int)out-17);
5032   // No hit on hash table, call compiler
5033   emit_pushreg(temp);
5034 //DEBUG >
5035 #ifdef DEBUG_CYCLE_COUNT
5036   emit_readword((int)&last_count,ECX);
5037   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5038   emit_readword((int)&next_interupt,ECX);
5039   emit_writeword(HOST_CCREG,(int)&Count);
5040   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5041   emit_writeword(ECX,(int)&last_count);
5042 #endif
5043 //DEBUG <
5044   emit_storereg(CCREG,HOST_CCREG);
5045   emit_call((int)get_addr);
5046   emit_loadreg(CCREG,HOST_CCREG);
5047   emit_addimm(ESP,4,ESP);
5048   emit_jmpreg(EAX);*/
5049   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5050   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5051   #endif
5052 }
5053
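     // Assemble a conditional branch comparing two GPRs: BEQ/BNE/BLEZ/BGTZ and
     // their branch-likely forms.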
5054 void cjump_assemble(int i,struct regstat *i_regs)
5055 {
5056   signed char *i_regmap=i_regs->regmap;
5057   int cc;
5058   int match;
5059   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5060   assem_debug("match=%d\n",match);
5061   int s1h,s1l,s2h,s2l;
5062   int prev_cop1_usable=cop1_usable;
5063   int unconditional=0,nop=0;
5064   int only32=0;
5065   int ooo=1;
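       // ooo: assemble the delay slot before the condition test (out of order)
       // when it is safe to do so; branch-likely forms and write-after-read
       // hazards force in-order assembly below.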
5066   int invert=0;
5067   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5068   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5069   if(likely[i]) ooo=0;
5070   if(!match) invert=1;
5071   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5072   if(i>(ba[i]-start)>>2) invert=1;
5073   #endif
5074     
5075   if(ooo)
5076     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5077        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5078   {
5079     // Write-after-read dependency prevents out of order execution
5080     // First test branch condition, then execute delay slot, then branch
5081     ooo=0;
5082   }
5083
5084   if(ooo) {
5085     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5086     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5087     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5088     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5089   }
5090   else {
5091     s1l=get_reg(i_regmap,rs1[i]);
5092     s1h=get_reg(i_regmap,rs1[i]|64);
5093     s2l=get_reg(i_regmap,rs2[i]);
5094     s2h=get_reg(i_regmap,rs2[i]|64);
5095   }
5096   if(rs1[i]==0&&rs2[i]==0)
5097   {
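         // Comparing r0 with r0: BEQ/BLEZ (even opcodes) are always taken,
         // BNE/BGTZ (odd opcodes) never are.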
5098     if(opcode[i]&1) nop=1;
5099     else unconditional=1;
5100     //assert(opcode[i]!=5);
5101     //assert(opcode[i]!=7);
5102     //assert(opcode[i]!=0x15);
5103     //assert(opcode[i]!=0x17);
5104   }
5105   else if(rs1[i]==0)
5106   {
5107     s1l=s2l;s1h=s2h;
5108     s2l=s2h=-1;
5109     only32=(regs[i].was32>>rs2[i])&1;
5110   }
5111   else if(rs2[i]==0)
5112   {
5113     s2l=s2h=-1;
5114     only32=(regs[i].was32>>rs1[i])&1;
5115   }
5116   else {
5117     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5118   }
5119
5120   if(ooo) {
5121     // Out of order execution (delay slot first)
5122     //printf("OOOE\n");
5123     address_generation(i+1,i_regs,regs[i].regmap_entry);
5124     ds_assemble(i+1,i_regs);
5125     int adj;
5126     uint64_t bc_unneeded=branch_regs[i].u;
5127     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5128     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5129     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5130     bc_unneeded|=1;
5131     bc_unneeded_upper|=1;
5132     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5133                   bc_unneeded,bc_unneeded_upper);
5134     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5135     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5136     cc=get_reg(branch_regs[i].regmap,CCREG);
5137     assert(cc==HOST_CCREG);
5138     if(unconditional) 
5139       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5140     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5141     //assem_debug("cycle count (adj)\n");
5142     if(unconditional) {
5143       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5144       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5145         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5146         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5147         if(internal)
5148           assem_debug("branch: internal\n");
5149         else
5150           assem_debug("branch: external\n");
5151         if(internal&&is_ds[(ba[i]-start)>>2]) {
5152           ds_assemble_entry(i);
5153         }
5154         else {
5155           add_to_linker((int)out,ba[i],internal);
5156           emit_jmp(0);
5157         }
5158         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5159         if(((u_int)out)&7) emit_addnop(0);
5160         #endif
5161       }
5162     }
5163     else if(nop) {
5164       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5165       int jaddr=(int)out;
5166       emit_jns(0);
5167       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5168     }
5169     else {
5170       int taken=0,nottaken=0,nottaken1=0;
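           // For 64-bit operands the upper halves (s1h/s2h) are tested first;
           // nottaken1 records a forward jump taken when the upper halves
           // already decide that the branch is not taken.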
5171       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5172       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5173       if(!only32)
5174       {
5175         assert(s1h>=0);
5176         if(opcode[i]==4) // BEQ
5177         {
5178           if(s2h>=0) emit_cmp(s1h,s2h);
5179           else emit_test(s1h,s1h);
5180           nottaken1=(int)out;
5181           emit_jne(1);
5182         }
5183         if(opcode[i]==5) // BNE
5184         {
5185           if(s2h>=0) emit_cmp(s1h,s2h);
5186           else emit_test(s1h,s1h);
5187           if(invert) taken=(int)out;
5188           else add_to_linker((int)out,ba[i],internal);
5189           emit_jne(0);
5190         }
5191         if(opcode[i]==6) // BLEZ
5192         {
5193           emit_test(s1h,s1h);
5194           if(invert) taken=(int)out;
5195           else add_to_linker((int)out,ba[i],internal);
5196           emit_js(0);
5197           nottaken1=(int)out;
5198           emit_jne(1);
5199         }
5200         if(opcode[i]==7) // BGTZ
5201         {
5202           emit_test(s1h,s1h);
5203           nottaken1=(int)out;
5204           emit_js(1);
5205           if(invert) taken=(int)out;
5206           else add_to_linker((int)out,ba[i],internal);
5207           emit_jne(0);
5208         }
5209       } // if(!only32)
5210           
5211       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5212       assert(s1l>=0);
5213       if(opcode[i]==4) // BEQ
5214       {
5215         if(s2l>=0) emit_cmp(s1l,s2l);
5216         else emit_test(s1l,s1l);
5217         if(invert){
5218           nottaken=(int)out;
5219           emit_jne(1);
5220         }else{
5221           add_to_linker((int)out,ba[i],internal);
5222           emit_jeq(0);
5223         }
5224       }
5225       if(opcode[i]==5) // BNE
5226       {
5227         if(s2l>=0) emit_cmp(s1l,s2l);
5228         else emit_test(s1l,s1l);
5229         if(invert){
5230           nottaken=(int)out;
5231           emit_jeq(1);
5232         }else{
5233           add_to_linker((int)out,ba[i],internal);
5234           emit_jne(0);
5235         }
5236       }
5237       if(opcode[i]==6) // BLEZ
5238       {
5239         emit_cmpimm(s1l,1);
5240         if(invert){
5241           nottaken=(int)out;
5242           emit_jge(1);
5243         }else{
5244           add_to_linker((int)out,ba[i],internal);
5245           emit_jl(0);
5246         }
5247       }
5248       if(opcode[i]==7) // BGTZ
5249       {
5250         emit_cmpimm(s1l,1);
5251         if(invert){
5252           nottaken=(int)out;
5253           emit_jl(1);
5254         }else{
5255           add_to_linker((int)out,ba[i],internal);
5256           emit_jge(0);
5257         }
5258       }
5259       if(invert) {
5260         if(taken) set_jump_target(taken,(int)out);
5261         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5262         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5263           if(adj) {
5264             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5265             add_to_linker((int)out,ba[i],internal);
5266           }else{
5267             emit_addnop(13);
5268             add_to_linker((int)out,ba[i],internal*2);
5269           }
5270           emit_jmp(0);
5271         }else
5272         #endif
5273         {
5274           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5275           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5276           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5277           if(internal)
5278             assem_debug("branch: internal\n");
5279           else
5280             assem_debug("branch: external\n");
5281           if(internal&&is_ds[(ba[i]-start)>>2]) {
5282             ds_assemble_entry(i);
5283           }
5284           else {
5285             add_to_linker((int)out,ba[i],internal);
5286             emit_jmp(0);
5287           }
5288         }
5289         set_jump_target(nottaken,(int)out);
5290       }
5291
5292       if(nottaken1) set_jump_target(nottaken1,(int)out);
5293       if(adj) {
5294         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5295       }
5296     } // (!unconditional)
5297   } // if(ooo)
5298   else
5299   {
5300     // In-order execution (branch first)
5301     //if(likely[i]) printf("IOL\n");
5302     //else
5303     //printf("IOE\n");
5304     int taken=0,nottaken=0,nottaken1=0;
5305     if(!unconditional&&!nop) {
5306       if(!only32)
5307       {
5308         assert(s1h>=0);
5309         if((opcode[i]&0x2f)==4) // BEQ
5310         {
5311           if(s2h>=0) emit_cmp(s1h,s2h);
5312           else emit_test(s1h,s1h);
5313           nottaken1=(int)out;
5314           emit_jne(2);
5315         }
5316         if((opcode[i]&0x2f)==5) // BNE
5317         {
5318           if(s2h>=0) emit_cmp(s1h,s2h);
5319           else emit_test(s1h,s1h);
5320           taken=(int)out;
5321           emit_jne(1);
5322         }
5323         if((opcode[i]&0x2f)==6) // BLEZ
5324         {
5325           emit_test(s1h,s1h);
5326           taken=(int)out;
5327           emit_js(1);
5328           nottaken1=(int)out;
5329           emit_jne(2);
5330         }
5331         if((opcode[i]&0x2f)==7) // BGTZ
5332         {
5333           emit_test(s1h,s1h);
5334           nottaken1=(int)out;
5335           emit_js(2);
5336           taken=(int)out;
5337           emit_jne(1);
5338         }
5339       } // if(!only32)
5340           
5341       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5342       assert(s1l>=0);
5343       if((opcode[i]&0x2f)==4) // BEQ
5344       {
5345         if(s2l>=0) emit_cmp(s1l,s2l);
5346         else emit_test(s1l,s1l);
5347         nottaken=(int)out;
5348         emit_jne(2);
5349       }
5350       if((opcode[i]&0x2f)==5) // BNE
5351       {
5352         if(s2l>=0) emit_cmp(s1l,s2l);
5353         else emit_test(s1l,s1l);
5354         nottaken=(int)out;
5355         emit_jeq(2);
5356       }
5357       if((opcode[i]&0x2f)==6) // BLEZ
5358       {
5359         emit_cmpimm(s1l,1);
5360         nottaken=(int)out;
5361         emit_jge(2);
5362       }
5363       if((opcode[i]&0x2f)==7) // BGTZ
5364       {
5365         emit_cmpimm(s1l,1);
5366         nottaken=(int)out;
5367         emit_jl(2);
5368       }
5369     } // if(!unconditional)
5370     int adj;
5371     uint64_t ds_unneeded=branch_regs[i].u;
5372     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5373     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5374     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5375     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5376     ds_unneeded|=1;
5377     ds_unneeded_upper|=1;
5378     // branch taken
5379     if(!nop) {
5380       if(taken) set_jump_target(taken,(int)out);
5381       assem_debug("1:\n");
5382       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5383                     ds_unneeded,ds_unneeded_upper);
5384       // load regs
5385       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5386       address_generation(i+1,&branch_regs[i],0);
5387       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5388       ds_assemble(i+1,&branch_regs[i]);
5389       cc=get_reg(branch_regs[i].regmap,CCREG);
5390       if(cc==-1) {
5391         emit_loadreg(CCREG,cc=HOST_CCREG);
5392         // CHECK: Is the following instruction (fall thru) allocated ok?
5393       }
5394       assert(cc==HOST_CCREG);
5395       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5396       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5397       assem_debug("cycle count (adj)\n");
5398       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5399       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5400       if(internal)
5401         assem_debug("branch: internal\n");
5402       else
5403         assem_debug("branch: external\n");
5404       if(internal&&is_ds[(ba[i]-start)>>2]) {
5405         ds_assemble_entry(i);
5406       }
5407       else {
5408         add_to_linker((int)out,ba[i],internal);
5409         emit_jmp(0);
5410       }
5411     }
5412     // branch not taken
5413     cop1_usable=prev_cop1_usable;
5414     if(!unconditional) {
5415       if(nottaken1) set_jump_target(nottaken1,(int)out);
5416       set_jump_target(nottaken,(int)out);
5417       assem_debug("2:\n");
5418       if(!likely[i]) {
5419         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5420                       ds_unneeded,ds_unneeded_upper);
5421         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5422         address_generation(i+1,&branch_regs[i],0);
5423         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5424         ds_assemble(i+1,&branch_regs[i]);
5425       }
5426       cc=get_reg(branch_regs[i].regmap,CCREG);
5427       if(cc==-1&&!likely[i]) {
5428         // Cycle count isn't in a register; temporarily load it, then write it back out
5429         emit_loadreg(CCREG,HOST_CCREG);
5430         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5431         int jaddr=(int)out;
5432         emit_jns(0);
5433         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5434         emit_storereg(CCREG,HOST_CCREG);
5435       }
5436       else{
5437         cc=get_reg(i_regmap,CCREG);
5438         assert(cc==HOST_CCREG);
5439         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5440         int jaddr=(int)out;
5441         emit_jns(0);
5442         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5443       }
5444     }
5445   }
5446 }
5447
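     // Assemble a REGIMM branch on the sign of a GPR (BLTZ/BGEZ, the linking
     // BLTZAL/BGEZAL forms, and their likely variants).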
5448 void sjump_assemble(int i,struct regstat *i_regs)
5449 {
5450   signed char *i_regmap=i_regs->regmap;
5451   int cc;
5452   int match;
5453   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5454   assem_debug("smatch=%d\n",match);
5455   int s1h,s1l;
5456   int prev_cop1_usable=cop1_usable;
5457   int unconditional=0,nevertaken=0;
5458   int only32=0;
5459   int ooo=1;
5460   int invert=0;
5461   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5462   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5463   if(likely[i]) ooo=0;
5464   if(!match) invert=1;
5465   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5466   if(i>(ba[i]-start)>>2) invert=1;
5467   #endif
5468
5469   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5470   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5471
5472   if(ooo)
5473     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5474   {
5475     // Write-after-read dependency prevents out of order execution
5476     // First test branch condition, then execute delay slot, then branch
5477     ooo=0;
5478   }
5479   // TODO: Conditional branches w/link must execute in-order so that
5480   // condition test and write to r31 occur before cycle count test
5481
5482   if(ooo) {
5483     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5484     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5485   }
5486   else {
5487     s1l=get_reg(i_regmap,rs1[i]);
5488     s1h=get_reg(i_regmap,rs1[i]|64);
5489   }
5490   if(rs1[i]==0)
5491   {
5492     if(opcode2[i]&1) unconditional=1;
5493     else nevertaken=1;
5494     // BGEZ-type branches on r0 are always taken; BLTZ-type are never taken (r0 is never less than zero)
5495     //assert(opcode2[i]!=0);
5496     //assert(opcode2[i]!=2);
5497     //assert(opcode2[i]!=0x10);
5498     //assert(opcode2[i]!=0x12);
5499   }
5500   else {
5501     only32=(regs[i].was32>>rs1[i])&1;
5502   }
5503
5504   if(ooo) {
5505     // Out of order execution (delay slot first)
5506     //printf("OOOE\n");
5507     address_generation(i+1,i_regs,regs[i].regmap_entry);
5508     ds_assemble(i+1,i_regs);
5509     int adj;
5510     uint64_t bc_unneeded=branch_regs[i].u;
5511     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5512     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5513     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5514     bc_unneeded|=1;
5515     bc_unneeded_upper|=1;
5516     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5517                   bc_unneeded,bc_unneeded_upper);
5518     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5519     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5520     if(rt1[i]==31) {
5521       int rt,return_address;
5522       assert(rt1[i+1]!=31);
5523       assert(rt2[i+1]!=31);
5524       rt=get_reg(branch_regs[i].regmap,31);
5525       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5526       if(rt>=0) {
5527         // Save the PC even if the branch is not taken
5528         return_address=start+i*4+8;
5529         emit_movimm(return_address,rt); // PC into link register
5530         #ifdef IMM_PREFETCH
5531         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5532         #endif
5533       }
5534     }
5535     cc=get_reg(branch_regs[i].regmap,CCREG);
5536     assert(cc==HOST_CCREG);
5537     if(unconditional) 
5538       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5539     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5540     assem_debug("cycle count (adj)\n");
5541     if(unconditional) {
5542       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5543       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5544         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5545         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5546         if(internal)
5547           assem_debug("branch: internal\n");
5548         else
5549           assem_debug("branch: external\n");
5550         if(internal&&is_ds[(ba[i]-start)>>2]) {
5551           ds_assemble_entry(i);
5552         }
5553         else {
5554           add_to_linker((int)out,ba[i],internal);
5555           emit_jmp(0);
5556         }
5557         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5558         if(((u_int)out)&7) emit_addnop(0);
5559         #endif
5560       }
5561     }
5562     else if(nevertaken) {
5563       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5564       int jaddr=(int)out;
5565       emit_jns(0);
5566       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5567     }
5568     else {
5569       int nottaken=0;
5570       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5571       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5572       if(!only32)
5573       {
5574         assert(s1h>=0);
5575         if(opcode2[i]==0) // BLTZ
5576         {
5577           emit_test(s1h,s1h);
5578           if(invert){
5579             nottaken=(int)out;
5580             emit_jns(1);
5581           }else{
5582             add_to_linker((int)out,ba[i],internal);
5583             emit_js(0);
5584           }
5585         }
5586         if(opcode2[i]==1) // BGEZ
5587         {
5588           emit_test(s1h,s1h);
5589           if(invert){
5590             nottaken=(int)out;
5591             emit_js(1);
5592           }else{
5593             add_to_linker((int)out,ba[i],internal);
5594             emit_jns(0);
5595           }
5596         }
5597       } // if(!only32)
5598       else
5599       {
5600         assert(s1l>=0);
5601         if(opcode2[i]==0) // BLTZ
5602         {
5603           emit_test(s1l,s1l);
5604           if(invert){
5605             nottaken=(int)out;
5606             emit_jns(1);
5607           }else{
5608             add_to_linker((int)out,ba[i],internal);
5609             emit_js(0);
5610           }
5611         }
5612         if(opcode2[i]==1) // BGEZ
5613         {
5614           emit_test(s1l,s1l);
5615           if(invert){
5616             nottaken=(int)out;
5617             emit_js(1);
5618           }else{
5619             add_to_linker((int)out,ba[i],internal);
5620             emit_jns(0);
5621           }
5622         }
5623       } // if(!only32)
5624           
5625       if(invert) {
5626         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5627         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5628           if(adj) {
5629             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5630             add_to_linker((int)out,ba[i],internal);
5631           }else{
5632             emit_addnop(13);
5633             add_to_linker((int)out,ba[i],internal*2);
5634           }
5635           emit_jmp(0);
5636         }else
5637         #endif
5638         {
5639           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5640           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5641           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5642           if(internal)
5643             assem_debug("branch: internal\n");
5644           else
5645             assem_debug("branch: external\n");
5646           if(internal&&is_ds[(ba[i]-start)>>2]) {
5647             ds_assemble_entry(i);
5648           }
5649           else {
5650             add_to_linker((int)out,ba[i],internal);
5651             emit_jmp(0);
5652           }
5653         }
5654         set_jump_target(nottaken,(int)out);
5655       }
5656
5657       if(adj) {
5658         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5659       }
5660     } // (!unconditional)
5661   } // if(ooo)
5662   else
5663   {
5664     // In-order execution (branch first)
5665     //printf("IOE\n");
5666     int nottaken=0;
5667     if(!unconditional) {
5668       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5669       if(!only32)
5670       {
5671         assert(s1h>=0);
5672         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5673         {
5674           emit_test(s1h,s1h);
5675           nottaken=(int)out;
5676           emit_jns(1);
5677         }
5678         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5679         {
5680           emit_test(s1h,s1h);
5681           nottaken=(int)out;
5682           emit_js(1);
5683         }
5684       } // if(!only32)
5685       else
5686       {
5687         assert(s1l>=0);
5688         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5689         {
5690           emit_test(s1l,s1l);
5691           nottaken=(int)out;
5692           emit_jns(1);
5693         }
5694         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5695         {
5696           emit_test(s1l,s1l);
5697           nottaken=(int)out;
5698           emit_js(1);
5699         }
5700       }
5701     } // if(!unconditional)
5702     int adj;
5703     uint64_t ds_unneeded=branch_regs[i].u;
5704     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5705     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5706     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5707     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5708     ds_unneeded|=1;
5709     ds_unneeded_upper|=1;
5710     // branch taken
5711     if(!nevertaken) {
5712       //assem_debug("1:\n");
5713       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5714                     ds_unneeded,ds_unneeded_upper);
5715       // load regs
5716       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5717       address_generation(i+1,&branch_regs[i],0);
5718       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5719       ds_assemble(i+1,&branch_regs[i]);
5720       cc=get_reg(branch_regs[i].regmap,CCREG);
5721       if(cc==-1) {
5722         emit_loadreg(CCREG,cc=HOST_CCREG);
5723         // CHECK: Is the following instruction (fall thru) allocated ok?
5724       }
5725       assert(cc==HOST_CCREG);
5726       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5727       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5728       assem_debug("cycle count (adj)\n");
5729       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5730       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5731       if(internal)
5732         assem_debug("branch: internal\n");
5733       else
5734         assem_debug("branch: external\n");
5735       if(internal&&is_ds[(ba[i]-start)>>2]) {
5736         ds_assemble_entry(i);
5737       }
5738       else {
5739         add_to_linker((int)out,ba[i],internal);
5740         emit_jmp(0);
5741       }
5742     }
5743     // branch not taken
5744     cop1_usable=prev_cop1_usable;
5745     if(!unconditional) {
5746       set_jump_target(nottaken,(int)out);
5747       assem_debug("1:\n");
5748       if(!likely[i]) {
5749         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5750                       ds_unneeded,ds_unneeded_upper);
5751         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5752         address_generation(i+1,&branch_regs[i],0);
5753         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5754         ds_assemble(i+1,&branch_regs[i]);
5755       }
5756       cc=get_reg(branch_regs[i].regmap,CCREG);
5757       if(cc==-1&&!likely[i]) {
5758         // Cycle count isn't in a register; temporarily load it, then write it back out
5759         emit_loadreg(CCREG,HOST_CCREG);
5760         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5761         int jaddr=(int)out;
5762         emit_jns(0);
5763         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5764         emit_storereg(CCREG,HOST_CCREG);
5765       }
5766       else{
5767         cc=get_reg(i_regmap,CCREG);
5768         assert(cc==HOST_CCREG);
5769         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5770         int jaddr=(int)out;
5771         emit_jns(0);
5772         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5773       }
5774     }
5775   }
5776 }
5777
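     // Assemble a COP1 condition branch (BC1F/BC1T and their likely forms);
     // the branch tests the FP condition flag, bit 0x800000 of the value kept
     // in FSREG.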
5778 void fjump_assemble(int i,struct regstat *i_regs)
5779 {
5780   signed char *i_regmap=i_regs->regmap;
5781   int cc;
5782   int match;
5783   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5784   assem_debug("fmatch=%d\n",match);
5785   int fs,cs;
5786   int eaddr;
5787   int ooo=1;
5788   int invert=0;
5789   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5790   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5791   if(likely[i]) ooo=0;
5792   if(!match) invert=1;
5793   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5794   if(i>(ba[i]-start)>>2) invert=1;
5795   #endif
5796
5797   if(ooo)
5798     if(itype[i+1]==FCOMP)
5799   {
5800     // Write-after-read dependency prevents out of order execution
5801     // First test branch condition, then execute delay slot, then branch
5802     ooo=0;
5803   }
5804
5805   if(ooo) {
5806     fs=get_reg(branch_regs[i].regmap,FSREG);
5807     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5808   }
5809   else {
5810     fs=get_reg(i_regmap,FSREG);
5811   }
5812
5813   // Check cop1 unusable
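       // (Status bit 0x20000000, CU1, must be set; if it is clear, jump to an
       //  FP_STUB that handles the coprocessor-unusable case.  Once checked,
       //  cop1_usable suppresses further checks within this block.)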
5814   if(!cop1_usable) {
5815     cs=get_reg(i_regmap,CSREG);
5816     assert(cs>=0);
5817     emit_testimm(cs,0x20000000);
5818     eaddr=(int)out;
5819     emit_jeq(0);
5820     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5821     cop1_usable=1;
5822   }
5823
5824   if(ooo) {
5825     // Out of order execution (delay slot first)
5826     //printf("OOOE\n");
5827     ds_assemble(i+1,i_regs);
5828     int adj;
5829     uint64_t bc_unneeded=branch_regs[i].u;
5830     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5831     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5832     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5833     bc_unneeded|=1;
5834     bc_unneeded_upper|=1;
5835     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5836                   bc_unneeded,bc_unneeded_upper);
5837     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5838     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5839     cc=get_reg(branch_regs[i].regmap,CCREG);
5840     assert(cc==HOST_CCREG);
5841     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5842     assem_debug("cycle count (adj)\n");
5843     if(1) {
5844       int nottaken=0;
5845       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5846       if(1) {
5847         assert(fs>=0);
5848         emit_testimm(fs,0x800000);
5849         if(source[i]&0x10000) // BC1T
5850         {
5851           if(invert){
5852             nottaken=(int)out;
5853             emit_jeq(1);
5854           }else{
5855             add_to_linker((int)out,ba[i],internal);
5856             emit_jne(0);
5857           }
5858         }
5859         else // BC1F
5860           if(invert){
5861             nottaken=(int)out;
5862             emit_jne(1);
5863           }else{
5864             add_to_linker((int)out,ba[i],internal);
5865             emit_jeq(0);
5866           }
5869       } // if(!only32)
5870           
5871       if(invert) {
5872         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5873         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5874         else if(match) emit_addnop(13);
5875         #endif
5876         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5877         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5878         if(internal)
5879           assem_debug("branch: internal\n");
5880         else
5881           assem_debug("branch: external\n");
5882         if(internal&&is_ds[(ba[i]-start)>>2]) {
5883           ds_assemble_entry(i);
5884         }
5885         else {
5886           add_to_linker((int)out,ba[i],internal);
5887           emit_jmp(0);
5888         }
5889         set_jump_target(nottaken,(int)out);
5890       }
5891
5892       if(adj) {
5893         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5894       }
5895     } // (!unconditional)
5896   } // if(ooo)
5897   else
5898   {
5899     // In-order execution (branch first)
5900     //printf("IOE\n");
5901     int nottaken=0;
5902     if(1) {
5903       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5904       if(1) {
5905         assert(fs>=0);
5906         emit_testimm(fs,0x800000);
5907         if(source[i]&0x10000) // BC1T
5908         {
5909           nottaken=(int)out;
5910           emit_jeq(1);
5911         }
5912         else // BC1F
5913         {
5914           nottaken=(int)out;
5915           emit_jne(1);
5916         }
5917       }
5918     } // if(!unconditional)
5919     int adj;
5920     uint64_t ds_unneeded=branch_regs[i].u;
5921     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5922     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5923     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5924     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5925     ds_unneeded|=1;
5926     ds_unneeded_upper|=1;
5927     // branch taken
5928     //assem_debug("1:\n");
5929     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5930                   ds_unneeded,ds_unneeded_upper);
5931     // load regs
5932     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5933     address_generation(i+1,&branch_regs[i],0);
5934     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5935     ds_assemble(i+1,&branch_regs[i]);
5936     cc=get_reg(branch_regs[i].regmap,CCREG);
5937     if(cc==-1) {
5938       emit_loadreg(CCREG,cc=HOST_CCREG);
5939       // CHECK: Is the following instruction (fall thru) allocated ok?
5940     }
5941     assert(cc==HOST_CCREG);
5942     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5943     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5944     assem_debug("cycle count (adj)\n");
5945     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5946     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5947     if(internal)
5948       assem_debug("branch: internal\n");
5949     else
5950       assem_debug("branch: external\n");
5951     if(internal&&is_ds[(ba[i]-start)>>2]) {
5952       ds_assemble_entry(i);
5953     }
5954     else {
5955       add_to_linker((int)out,ba[i],internal);
5956       emit_jmp(0);
5957     }
5958
5959     // branch not taken
5960     if(1) { // <- FIXME (don't need this)
5961       set_jump_target(nottaken,(int)out);
5962       assem_debug("1:\n");
5963       if(!likely[i]) {
5964         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5965                       ds_unneeded,ds_unneeded_upper);
5966         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5967         address_generation(i+1,&branch_regs[i],0);
5968         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5969         ds_assemble(i+1,&branch_regs[i]);
5970       }
5971       cc=get_reg(branch_regs[i].regmap,CCREG);
5972       if(cc==-1&&!likely[i]) {
5973         // Cycle count isn't in a register; temporarily load it, then write it back out
5974         emit_loadreg(CCREG,HOST_CCREG);
5975         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5976         int jaddr=(int)out;
5977         emit_jns(0);
5978         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5979         emit_storereg(CCREG,HOST_CCREG);
5980       }
5981       else{
5982         cc=get_reg(i_regmap,CCREG);
5983         assert(cc==HOST_CCREG);
5984         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5985         int jaddr=(int)out;
5986         emit_jns(0);
5987         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5988       }
5989     }
5990   }
5991 }
5992
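     // Assemble a branch whose delay slot falls in the next page (SPAN).
     // The branch target is computed into HOST_BTREG and control jumps to a
     // delay-slot entry point in the next page (see pagespan_ds below).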
5993 static void pagespan_assemble(int i,struct regstat *i_regs)
5994 {
5995   int s1l=get_reg(i_regs->regmap,rs1[i]);
5996   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5997   int s2l=get_reg(i_regs->regmap,rs2[i]);
5998   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5999   void *nt_branch=NULL;
6000   int taken=0;
6001   int nottaken=0;
6002   int unconditional=0;
6003   if(rs1[i]==0)
6004   {
6005     s1l=s2l;s1h=s2h;
6006     s2l=s2h=-1;
6007   }
6008   else if(rs2[i]==0)
6009   {
6010     s2l=s2h=-1;
6011   }
6012   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6013     s1h=s2h=-1;
6014   }
6015   int hr=0;
6016   int addr,alt,ntaddr;
6017   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6018   else {
6019     while(hr<HOST_REGS)
6020     {
6021       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6022          (i_regs->regmap[hr]&63)!=rs1[i] &&
6023          (i_regs->regmap[hr]&63)!=rs2[i] )
6024       {
6025         addr=hr++;break;
6026       }
6027       hr++;
6028     }
6029   }
6030   while(hr<HOST_REGS)
6031   {
6032     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6033        (i_regs->regmap[hr]&63)!=rs1[i] &&
6034        (i_regs->regmap[hr]&63)!=rs2[i] )
6035     {
6036       alt=hr++;break;
6037     }
6038     hr++;
6039   }
6040   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6041   {
6042     while(hr<HOST_REGS)
6043     {
6044       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6045          (i_regs->regmap[hr]&63)!=rs1[i] &&
6046          (i_regs->regmap[hr]&63)!=rs2[i] )
6047       {
6048         ntaddr=hr;break;
6049       }
6050       hr++;
6051     }
6052   }
6053   assert(hr<HOST_REGS);
6054   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6055     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6056   }
6057   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6058   if(opcode[i]==2) // J
6059   {
6060     unconditional=1;
6061   }
6062   if(opcode[i]==3) // JAL
6063   {
6064     // TODO: mini_ht
6065     int rt=get_reg(i_regs->regmap,31);
6066     emit_movimm(start+i*4+8,rt);
6067     unconditional=1;
6068   }
6069   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6070   {
6071     emit_mov(s1l,addr);
6072     if(opcode2[i]==9) // JALR
6073     {
6074       int rt=get_reg(i_regs->regmap,31);
6075       emit_movimm(start+i*4+8,rt);
6076     }
6077   }
6078   if((opcode[i]&0x3f)==4) // BEQ
6079   {
6080     if(rs1[i]==rs2[i])
6081     {
6082       unconditional=1;
6083     }
6084     else
6085     #ifdef HAVE_CMOV_IMM
6086     if(s1h<0) {
6087       if(s2l>=0) emit_cmp(s1l,s2l);
6088       else emit_test(s1l,s1l);
6089       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6090     }
6091     else
6092     #endif
6093     {
6094       assert(s1l>=0);
6095       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6096       if(s1h>=0) {
6097         if(s2h>=0) emit_cmp(s1h,s2h);
6098         else emit_test(s1h,s1h);
6099         emit_cmovne_reg(alt,addr);
6100       }
6101       if(s2l>=0) emit_cmp(s1l,s2l);
6102       else emit_test(s1l,s1l);
6103       emit_cmovne_reg(alt,addr);
6104     }
6105   }
6106   if((opcode[i]&0x3f)==5) // BNE
6107   {
6108     #ifdef HAVE_CMOV_IMM
6109     if(s1h<0) {
6110       if(s2l>=0) emit_cmp(s1l,s2l);
6111       else emit_test(s1l,s1l);
6112       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6113     }
6114     else
6115     #endif
6116     {
6117       assert(s1l>=0);
6118       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6119       if(s1h>=0) {
6120         if(s2h>=0) emit_cmp(s1h,s2h);
6121         else emit_test(s1h,s1h);
6122         emit_cmovne_reg(alt,addr);
6123       }
6124       if(s2l>=0) emit_cmp(s1l,s2l);
6125       else emit_test(s1l,s1l);
6126       emit_cmovne_reg(alt,addr);
6127     }
6128   }
6129   if((opcode[i]&0x3f)==0x14) // BEQL
6130   {
6131     if(s1h>=0) {
6132       if(s2h>=0) emit_cmp(s1h,s2h);
6133       else emit_test(s1h,s1h);
6134       nottaken=(int)out;
6135       emit_jne(0);
6136     }
6137     if(s2l>=0) emit_cmp(s1l,s2l);
6138     else emit_test(s1l,s1l);
6139     if(nottaken) set_jump_target(nottaken,(int)out);
6140     nottaken=(int)out;
6141     emit_jne(0);
6142   }
6143   if((opcode[i]&0x3f)==0x15) // BNEL
6144   {
6145     if(s1h>=0) {
6146       if(s2h>=0) emit_cmp(s1h,s2h);
6147       else emit_test(s1h,s1h);
6148       taken=(int)out;
6149       emit_jne(0);
6150     }
6151     if(s2l>=0) emit_cmp(s1l,s2l);
6152     else emit_test(s1l,s1l);
6153     nottaken=(int)out;
6154     emit_jeq(0);
6155     if(taken) set_jump_target(taken,(int)out);
6156   }
6157   if((opcode[i]&0x3f)==6) // BLEZ
6158   {
6159     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6160     emit_cmpimm(s1l,1);
6161     if(s1h>=0) emit_mov(addr,ntaddr);
6162     emit_cmovl_reg(alt,addr);
6163     if(s1h>=0) {
6164       emit_test(s1h,s1h);
6165       emit_cmovne_reg(ntaddr,addr);
6166       emit_cmovs_reg(alt,addr);
6167     }
6168   }
6169   if((opcode[i]&0x3f)==7) // BGTZ
6170   {
6171     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6172     emit_cmpimm(s1l,1);
6173     if(s1h>=0) emit_mov(addr,alt);
6174     emit_cmovl_reg(ntaddr,addr);
6175     if(s1h>=0) {
6176       emit_test(s1h,s1h);
6177       emit_cmovne_reg(alt,addr);
6178       emit_cmovs_reg(ntaddr,addr);
6179     }
6180   }
6181   if((opcode[i]&0x3f)==0x16) // BLEZL
6182   {
6183     assert((opcode[i]&0x3f)!=0x16);
6184   }
6185   if((opcode[i]&0x3f)==0x17) // BGTZL
6186   {
6187     assert((opcode[i]&0x3f)!=0x17);
6188   }
6189   assert(opcode[i]!=1); // BLTZ/BGEZ
6190
6191   //FIXME: Check CSREG
6192   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6193     if((source[i]&0x30000)==0) // BC1F
6194     {
6195       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6196       emit_testimm(s1l,0x800000);
6197       emit_cmovne_reg(alt,addr);
6198     }
6199     if((source[i]&0x30000)==0x10000) // BC1T
6200     {
6201       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6202       emit_testimm(s1l,0x800000);
6203       emit_cmovne_reg(alt,addr);
6204     }
6205     if((source[i]&0x30000)==0x20000) // BC1FL
6206     {
6207       emit_testimm(s1l,0x800000);
6208       nottaken=(int)out;
6209       emit_jne(0);
6210     }
6211     if((source[i]&0x30000)==0x30000) // BC1TL
6212     {
6213       emit_testimm(s1l,0x800000);
6214       nottaken=(int)out;
6215       emit_jeq(0);
6216     }
6217   }
6218
6219   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6220   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6221   if(likely[i]||unconditional)
6222   {
6223     emit_movimm(ba[i],HOST_BTREG);
6224   }
6225   else if(addr!=HOST_BTREG)
6226   {
6227     emit_mov(addr,HOST_BTREG);
6228   }
6229   void *branch_addr=out;
6230   emit_jmp(0);
6231   int target_addr=start+i*4+5;
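       // start+i*4+4 is the delay slot in the next page; the set low bit tags
       // it as a delay-slot entry, matching pagespan_ds, which registers its
       // entry at start+1.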
6232   void *stub=out;
6233   void *compiled_target_addr=check_addr(target_addr);
6234   emit_extjump_ds((int)branch_addr,target_addr);
6235   if(compiled_target_addr) {
6236     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6237     add_link(target_addr,stub);
6238   }
6239   else set_jump_target((int)branch_addr,(int)stub);
6240   if(likely[i]) {
6241     // Not-taken path
6242     set_jump_target((int)nottaken,(int)out);
6243     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6244     void *branch_addr=out;
6245     emit_jmp(0);
6246     int target_addr=start+i*4+8;
6247     void *stub=out;
6248     void *compiled_target_addr=check_addr(target_addr);
6249     emit_extjump_ds((int)branch_addr,target_addr);
6250     if(compiled_target_addr) {
6251       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6252       add_link(target_addr,stub);
6253     }
6254     else set_jump_target((int)branch_addr,(int)stub);
6255   }
6256 }
6257
6258 // Assemble the delay slot for the above
6259 static void pagespan_ds()
6260 {
6261   assem_debug("initial delay slot:\n");
6262   u_int vaddr=start+1;
6263   u_int page=get_page(vaddr);
6264   u_int vpage=get_vpage(vaddr);
6265   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6266   do_dirty_stub_ds();
6267   ll_add(jump_in+page,vaddr,(void *)out);
6268   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6269   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6270     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6271   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6272     emit_writeword(HOST_BTREG,(int)&branch_target);
6273   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6274   address_generation(0,&regs[0],regs[0].regmap_entry);
6275   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6276     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6277   cop1_usable=0;
6278   is_delayslot=0;
6279   switch(itype[0]) {
6280     case ALU:
6281       alu_assemble(0,&regs[0]);break;
6282     case IMM16:
6283       imm16_assemble(0,&regs[0]);break;
6284     case SHIFT:
6285       shift_assemble(0,&regs[0]);break;
6286     case SHIFTIMM:
6287       shiftimm_assemble(0,&regs[0]);break;
6288     case LOAD:
6289       load_assemble(0,&regs[0]);break;
6290     case LOADLR:
6291       loadlr_assemble(0,&regs[0]);break;
6292     case STORE:
6293       store_assemble(0,&regs[0]);break;
6294     case STORELR:
6295       storelr_assemble(0,&regs[0]);break;
6296     case COP0:
6297       cop0_assemble(0,&regs[0]);break;
6298     case COP1:
6299       cop1_assemble(0,&regs[0]);break;
6300     case C1LS:
6301       c1ls_assemble(0,&regs[0]);break;
6302     case FCONV:
6303       fconv_assemble(0,&regs[0]);break;
6304     case FLOAT:
6305       float_assemble(0,&regs[0]);break;
6306     case FCOMP:
6307       fcomp_assemble(0,&regs[0]);break;
6308     case MULTDIV:
6309       multdiv_assemble(0,&regs[0]);break;
6310     case MOV:
6311       mov_assemble(0,&regs[0]);break;
6312     case SYSCALL:
6313     case HLECALL:
6314     case SPAN:
6315     case UJUMP:
6316     case RJUMP:
6317     case CJUMP:
6318     case SJUMP:
6319     case FJUMP:
6320       printf("Jump in the delay slot.  This is probably a bug.\n");
6321   }
6322   int btaddr=get_reg(regs[0].regmap,BTREG);
6323   if(btaddr<0) {
6324     btaddr=get_reg(regs[0].regmap,-1);
6325     emit_readword((int)&branch_target,btaddr);
6326   }
6327   assert(btaddr!=HOST_CCREG);
6328   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6329 #ifdef HOST_IMM8
6330   emit_movimm(start+4,HOST_TEMPREG);
6331   emit_cmp(btaddr,HOST_TEMPREG);
6332 #else
6333   emit_cmpimm(btaddr,start+4);
6334 #endif
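       // If the stored branch target (compared above) is just the next
       // in-page instruction (start+4), fall straight through; otherwise write
       // back dirty registers and dispatch to the real target via jump_vaddr.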
6335   int branch=(int)out;
6336   emit_jeq(0);
6337   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6338   emit_jmp(jump_vaddr_reg[btaddr]);
6339   set_jump_target(branch,(int)out);
6340   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6341   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6342 }
6343
6344 // Basic liveness analysis for MIPS registers
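     // Walks the block backwards: a register is "unneeded" at instruction i if
     // it is overwritten before being read on every path from i.  Roughly,
     //   unneeded = (unneeded_after | written_here) & ~read_here
     // with r0 always unneeded.  The _upper arrays track the high 32 bits of
     // each register separately.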
6345 void unneeded_registers(int istart,int iend,int r)
6346 {
6347   int i;
6348   uint64_t u,uu,b,bu;
6349   uint64_t temp_u,temp_uu;
6350   uint64_t tdep;
6351   if(iend==slen-1) {
6352     u=1;uu=1;
6353   }else{
6354     u=unneeded_reg[iend+1];
6355     uu=unneeded_reg_upper[iend+1];
6356     u=1;uu=1;
6357   }
6358   for (i=iend;i>=istart;i--)
6359   {
6360     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6361     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6362     {
6363       // If subroutine call, flag return address as a possible branch target
6364       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6365       
6366       if(ba[i]<start || ba[i]>=(start+slen*4))
6367       {
6368         // Branch out of this block, flush all regs
6369         u=1;
6370         uu=1;
6371         /* Hexagon hack 
6372         if(itype[i]==UJUMP&&rt1[i]==31)
6373         {
6374           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6375         }
6376         if(itype[i]==RJUMP&&rs1[i]==31)
6377         {
6378           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6379         }
6380         if(start>0x80000400&&start<0x80800000) {
6381           if(itype[i]==UJUMP&&rt1[i]==31)
6382           {
6383             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6384             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6385           }
6386           if(itype[i]==RJUMP&&rs1[i]==31)
6387           {
6388             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6389             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6390           }
6391         }*/
6392         branch_unneeded_reg[i]=u;
6393         branch_unneeded_reg_upper[i]=uu;
6394         // Merge in delay slot
6395         tdep=(~uu>>rt1[i+1])&1;
6396         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6397         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6398         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6399         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6400         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6401         u|=1;uu|=1;
6402         // If branch is "likely" (and conditional)
6403         // then we skip the delay slot on the fall-thru path
6404         if(likely[i]) {
6405           if(i<slen-1) {
6406             u&=unneeded_reg[i+2];
6407             uu&=unneeded_reg_upper[i+2];
6408           }
6409           else
6410           {
6411             u=1;
6412             uu=1;
6413           }
6414         }
6415       }
6416       else
6417       {
6418         // Internal branch, flag target
6419         bt[(ba[i]-start)>>2]=1;
6420         if(ba[i]<=start+i*4) {
6421           // Backward branch
6422           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6423           {
6424             // Unconditional branch
6425             temp_u=1;temp_uu=1;
6426           } else {
6427             // Conditional branch (not taken case)
6428             temp_u=unneeded_reg[i+2];
6429             temp_uu=unneeded_reg_upper[i+2];
6430           }
6431           // Merge in delay slot
6432           tdep=(~temp_uu>>rt1[i+1])&1;
6433           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6434           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6435           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6436           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6437           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6438           temp_u|=1;temp_uu|=1;
6439           // If branch is "likely" (and conditional)
6440           // then we skip the delay slot on the fall-thru path
6441           if(likely[i]) {
6442             if(i<slen-1) {
6443               temp_u&=unneeded_reg[i+2];
6444               temp_uu&=unneeded_reg_upper[i+2];
6445             }
6446             else
6447             {
6448               temp_u=1;
6449               temp_uu=1;
6450             }
6451           }
6452           tdep=(~temp_uu>>rt1[i])&1;
6453           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6454           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6455           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6456           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6457           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6458           temp_u|=1;temp_uu|=1;
6459           unneeded_reg[i]=temp_u;
6460           unneeded_reg_upper[i]=temp_uu;
6461           // Only go three levels deep.  This recursion can take an
6462           // excessive amount of time if there are a lot of nested loops.
6463           if(r<2) {
6464             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6465           }else{
6466             unneeded_reg[(ba[i]-start)>>2]=1;
6467             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6468           }
6469         } /*else*/ if(1) {
6470           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6471           {
6472             // Unconditional branch
6473             u=unneeded_reg[(ba[i]-start)>>2];
6474             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6475             branch_unneeded_reg[i]=u;
6476             branch_unneeded_reg_upper[i]=uu;
6477         //u=1;
6478         //uu=1;
6479         //branch_unneeded_reg[i]=u;
6480         //branch_unneeded_reg_upper[i]=uu;
6481             // Merge in delay slot
6482             tdep=(~uu>>rt1[i+1])&1;
6483             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6484             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6485             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6486             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6487             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6488             u|=1;uu|=1;
6489           } else {
6490             // Conditional branch
6491             b=unneeded_reg[(ba[i]-start)>>2];
6492             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6493             branch_unneeded_reg[i]=b;
6494             branch_unneeded_reg_upper[i]=bu;
6495         //b=1;
6496         //bu=1;
6497         //branch_unneeded_reg[i]=b;
6498         //branch_unneeded_reg_upper[i]=bu;
6499             // Branch delay slot
6500             tdep=(~uu>>rt1[i+1])&1;
6501             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6502             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6503             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6504             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6505             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6506             b|=1;bu|=1;
6507             // If branch is "likely" then we skip the
6508             // delay slot on the fall-thru path
6509             if(likely[i]) {
6510               u=b;
6511               uu=bu;
6512               if(i<slen-1) {
6513                 u&=unneeded_reg[i+2];
6514                 uu&=unneeded_reg_upper[i+2];
6515         //u=1;
6516         //uu=1;
6517               }
6518             } else {
6519               u&=b;
6520               uu&=bu;
6521         //u=1;
6522         //uu=1;
6523             }
6524             if(i<slen-1) {
6525               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6526               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6527         //branch_unneeded_reg[i]=1;
6528         //branch_unneeded_reg_upper[i]=1;
6529             } else {
6530               branch_unneeded_reg[i]=1;
6531               branch_unneeded_reg_upper[i]=1;
6532             }
6533           }
6534         }
6535       }
6536     }
6537     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
6538     {
6539       // SYSCALL instruction (software interrupt)
6540       u=1;
6541       uu=1;
6542     }
6543     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6544     {
6545       // ERET instruction (return from interrupt)
6546       u=1;
6547       uu=1;
6548     }
6549     //u=uu=1; // DEBUG
6550     tdep=(~uu>>rt1[i])&1;
6551     // Written registers are unneeded
6552     u|=1LL<<rt1[i];
6553     u|=1LL<<rt2[i];
6554     uu|=1LL<<rt1[i];
6555     uu|=1LL<<rt2[i];
6556     // Accessed registers are needed
6557     u&=~(1LL<<rs1[i]);
6558     u&=~(1LL<<rs2[i]);
6559     uu&=~(1LL<<us1[i]);
6560     uu&=~(1LL<<us2[i]);
6561     // Source-target dependencies
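         // (the upper halves of dep1/dep2 are only needed when the upper half
         //  of this instruction's result, rt1[i], is itself needed: tdep)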
6562     uu&=~(tdep<<dep1[i]);
6563     uu&=~(tdep<<dep2[i]);
6564     // R0 is always unneeded
6565     u|=1;uu|=1;
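    // For example (illustration only): for ADDU r3,r1,r2 the bit for r3 is
    // set above (its incoming value is overwritten, hence unneeded) and the
    // bits for r1 and r2 are cleared (their values are still needed here).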
6566     // Save it
6567     unneeded_reg[i]=u;
6568     unneeded_reg_upper[i]=uu;
6569 #ifdef FORCE32
6570     unneeded_reg_upper[i]=-1LL;
6571 #endif
6572     /*
6573     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6574     printf("U:");
6575     int r;
6576     for(r=1;r<=CCREG;r++) {
6577       if((unneeded_reg[i]>>r)&1) {
6578         if(r==HIREG) printf(" HI");
6579         else if(r==LOREG) printf(" LO");
6580         else printf(" r%d",r);
6581       }
6582     }
6583     printf(" UU:");
6584     for(r=1;r<=CCREG;r++) {
6585       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6586         if(r==HIREG) printf(" HI");
6587         else if(r==LOREG) printf(" LO");
6588         else printf(" r%d",r);
6589       }
6590     }
6591     printf("\n");*/
6592   }
6593 }
6594
6595 // Identify registers which are likely to contain 32-bit values
6596 // This is used to predict whether any branches will jump to a
6597 // location with 64-bit values in registers.
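// This is a forward pass over the block: it tracks which registers appear to
// hold (sign-extended) 32-bit values after each instruction and records the
// provisional result in p32[i].  The final is32/was32 state in regs[] is
// computed separately.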
6598 static void provisional_32bit()
6599 {
6600   int i,j;
6601   uint64_t is32=1;
6602   uint64_t lastbranch=1;
6603   
6604   for(i=0;i<slen;i++)
6605   {
6606     if(i>0) {
6607       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6608         if(i>1) is32=lastbranch;
6609         else is32=1;
6610       }
6611     }
6612     if(i>1)
6613     {
6614       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6615         if(likely[i-2]) {
6616           if(i>2) is32=lastbranch;
6617           else is32=1;
6618         }
6619       }
6620       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6621       {
6622         if(rs1[i-2]==0||rs2[i-2]==0)
6623         {
6624           if(rs1[i-2]) {
6625             is32|=1LL<<rs1[i-2];
6626           }
6627           if(rs2[i-2]) {
6628             is32|=1LL<<rs2[i-2];
6629           }
6630         }
6631       }
6632     }
6633     // If something jumps here with 64-bit values
6634     // then promote those registers to 64 bits
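    // Branches earlier in the block that target this address have already
    // been scanned, so their p32[] constrains is32 here; branches later in
    // the block (backward branches to this point) have not been scanned yet,
    // so if any exist assume the worst case (only r0 known to be 32-bit).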
6635     if(bt[i])
6636     {
6637       uint64_t temp_is32=is32;
6638       for(j=i-1;j>=0;j--)
6639       {
6640         if(ba[j]==start+i*4) 
6641           //temp_is32&=branch_regs[j].is32;
6642           temp_is32&=p32[j];
6643       }
6644       for(j=i;j<slen;j++)
6645       {
6646         if(ba[j]==start+i*4) 
6647           temp_is32=1;
6648       }
6649       is32=temp_is32;
6650     }
6651     int type=itype[i];
6652     int op=opcode[i];
6653     int op2=opcode2[i];
6654     int rt=rt1[i];
6655     int s1=rs1[i];
6656     int s2=rs2[i];
6657     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6658       // Branches don't write registers, consider the delay slot instead.
6659       type=itype[i+1];
6660       op=opcode[i+1];
6661       op2=opcode2[i+1];
6662       rt=rt1[i+1];
6663       s1=rs1[i+1];
6664       s2=rs2[i+1];
6665       lastbranch=is32;
6666     }
6667     switch(type) {
6668       case LOAD:
6669         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6670            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6671           is32&=~(1LL<<rt);
6672         else
6673           is32|=1LL<<rt;
6674         break;
6675       case STORE:
6676       case STORELR:
6677         break;
6678       case LOADLR:
6679         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6680         if(op==0x22) is32|=1LL<<rt; // LWL
6681         break;
6682       case IMM16:
6683         if (op==0x08||op==0x09|| // ADDI/ADDIU
6684             op==0x0a||op==0x0b|| // SLTI/SLTIU
6685             op==0x0c|| // ANDI
6686             op==0x0f)  // LUI
6687         {
6688           is32|=1LL<<rt;
6689         }
6690         if(op==0x18||op==0x19) { // DADDI/DADDIU
6691           is32&=~(1LL<<rt);
6692           //if(imm[i]==0)
6693           //  is32|=((is32>>s1)&1LL)<<rt;
6694         }
6695         if(op==0x0d||op==0x0e) { // ORI/XORI
6696           uint64_t sr=((is32>>s1)&1LL);
6697           is32&=~(1LL<<rt);
6698           is32|=sr<<rt;
6699         }
6700         break;
6701       case UJUMP:
6702         break;
6703       case RJUMP:
6704         break;
6705       case CJUMP:
6706         break;
6707       case SJUMP:
6708         break;
6709       case FJUMP:
6710         break;
6711       case ALU:
6712         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6713           is32|=1LL<<rt;
6714         }
6715         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6716           is32|=1LL<<rt;
6717         }
6718         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6719           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6720           is32&=~(1LL<<rt);
6721           is32|=sr<<rt;
6722         }
6723         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6724           if(s1==0&&s2==0) {
6725             is32|=1LL<<rt;
6726           }
6727           else if(s2==0) {
6728             uint64_t sr=((is32>>s1)&1LL);
6729             is32&=~(1LL<<rt);
6730             is32|=sr<<rt;
6731           }
6732           else if(s1==0) {
6733             uint64_t sr=((is32>>s2)&1LL);
6734             is32&=~(1LL<<rt);
6735             is32|=sr<<rt;
6736           }
6737           else {
6738             is32&=~(1LL<<rt);
6739           }
6740         }
6741         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6742           if(s1==0&&s2==0) {
6743             is32|=1LL<<rt;
6744           }
6745           else if(s2==0) {
6746             uint64_t sr=((is32>>s1)&1LL);
6747             is32&=~(1LL<<rt);
6748             is32|=sr<<rt;
6749           }
6750           else {
6751             is32&=~(1LL<<rt);
6752           }
6753         }
6754         break;
6755       case MULTDIV:
6756         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6757           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6758         }
6759         else {
6760           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6761         }
6762         break;
6763       case MOV:
6764         {
6765           uint64_t sr=((is32>>s1)&1LL);
6766           is32&=~(1LL<<rt);
6767           is32|=sr<<rt;
6768         }
6769         break;
6770       case SHIFT:
6771         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6772         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6773         break;
6774       case SHIFTIMM:
6775         is32|=1LL<<rt;
6776         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
6777         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6778         break;
6779       case COP0:
6780         if(op2==0) is32|=1LL<<rt; // MFC0
6781         break;
6782       case COP1:
6783         if(op2==0) is32|=1LL<<rt; // MFC1
6784         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6785         if(op2==2) is32|=1LL<<rt; // CFC1
6786         break;
6787       case C1LS:
6788         break;
6789       case FLOAT:
6790       case FCONV:
6791         break;
6792       case FCOMP:
6793         break;
6794       case SYSCALL:
6795       case HLECALL:
6796         break;
6797       default:
6798         break;
6799     }
6800     is32|=1;
6801     p32[i]=is32;
6802
6803     if(i>0)
6804     {
6805       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6806       {
6807         if(rt1[i-1]==31) // JAL/JALR
6808         {
6809           // Subroutine call will return here, don't alloc any registers
6810           is32=1;
6811         }
6812         else if(i+1<slen)
6813         {
6814           // Internal branch will jump here, match registers to caller
6815           is32=0x3FFFFFFFFLL;
6816         }
6817       }
6818     }
6819   }
6820 }
6821
6822 // Identify registers which may be assumed to contain 32-bit values
6823 // and where optimizations will rely on this.
6824 // This is used to determine whether backward branches can safely
6825 // jump to a location with 64-bit values in registers.
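// This is a backward pass: pr32[i] accumulates the set of registers that must
// still hold 32-bit values at instruction i, based on how later instructions
// and branch targets use them (the commented-out lines show it mirrors the
// requires_32bit[] computation).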
6826 static void provisional_r32()
6827 {
6828   u_int r32=0;
6829   int i;
6830   
6831   for (i=slen-1;i>=0;i--)
6832   {
6833     int hr;
6834     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6835     {
6836       if(ba[i]<start || ba[i]>=(start+slen*4))
6837       {
6838         // Branch out of this block, don't need anything
6839         r32=0;
6840       }
6841       else
6842       {
6843         // Internal branch
6844         // Need whatever matches the target
6845         // (and doesn't get overwritten by the delay slot instruction)
6846         r32=0;
6847         int t=(ba[i]-start)>>2;
6848         if(ba[i]>start+i*4) {
6849           // Forward branch
6850           //if(!(requires_32bit[t]&~regs[i].was32))
6851           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6852           if(!(pr32[t]&~regs[i].was32))
6853             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6854         }else{
6855           // Backward branch
6856           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6857             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6858         }
6859       }
6860       // Conditional branch may need registers for following instructions
6861       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6862       {
6863         if(i<slen-2) {
6864           //r32|=requires_32bit[i+2];
6865           r32|=pr32[i+2];
6866           r32&=regs[i].was32;
6867           // Mark this address as a branch target since it may be called
6868           // upon return from interrupt
6869           //bt[i+2]=1;
6870         }
6871       }
6872       // Merge in delay slot
6873       if(!likely[i]) {
6874         // These are always overwritten unless the branch is "likely",
6875         // in which case the delay slot is nullified when not taken
6876         r32&=~(1LL<<rt1[i+1]);
6877         r32&=~(1LL<<rt2[i+1]);
6878       }
6879       // Assume these are needed (delay slot)
6880       if(us1[i+1]>0)
6881       {
6882         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6883       }
6884       if(us2[i+1]>0)
6885       {
6886         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6887       }
6888       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6889       {
6890         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6891       }
6892       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6893       {
6894         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6895       }
6896     }
6897     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
6898     {
6899       // SYSCALL instruction (software interrupt)
6900       r32=0;
6901     }
6902     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6903     {
6904       // ERET instruction (return from interrupt)
6905       r32=0;
6906     }
6907     // Update which registers are required to be 32-bit across this instruction
6908     r32&=~(1LL<<rt1[i]);
6909     r32&=~(1LL<<rt2[i]);
6910     if(us1[i]>0)
6911     {
6912       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6913     }
6914     if(us2[i]>0)
6915     {
6916       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6917     }
6918     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6919     {
6920       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6921     }
6922     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6923     {
6924       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6925     }
6926     //requires_32bit[i]=r32;
6927     pr32[i]=r32;
6928     
6929     // Dirty registers which are 32-bit require 32-bit input,
6930     // as they will be written as 32-bit values
6931     for(hr=0;hr<HOST_REGS;hr++)
6932     {
6933       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6934         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6935           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6936           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6937           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6938         }
6939       }
6940     }
6941   }
6942 }
6943
6944 // Write back dirty registers as soon as we will no longer modify them,
6945 // so that we don't end up with lots of writes at the branches.
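// will_dirty[]/wont_dirty[] are bitmaps indexed by host register (bit r
// corresponds to host register r), computed per instruction by scanning
// backwards from iend to istart.  When wr is nonzero the result is merged
// into the dirty bits of regs[]/branch_regs[]; the recursion into
// backward-branch targets passes wr=0 and only fills in the arrays.
// Note: regmap entries appear to use the 64 bit to tag the upper half of a
// 64-bit guest register, hence the (regmap[r]&63) masking below.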
6946 void clean_registers(int istart,int iend,int wr)
6947 {
6948   int i;
6949   int r;
6950   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6951   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6952   if(iend==slen-1) {
6953     will_dirty_i=will_dirty_next=0;
6954     wont_dirty_i=wont_dirty_next=0;
6955   }else{
6956     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6957     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6958   }
6959   for (i=iend;i>=istart;i--)
6960   {
6961     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6962     {
6963       if(ba[i]<start || ba[i]>=(start+slen*4))
6964       {
6965         // Branch out of this block, flush all regs
6966         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6967         {
6968           // Unconditional branch
6969           will_dirty_i=0;
6970           wont_dirty_i=0;
6971           // Merge in delay slot (will dirty)
6972           for(r=0;r<HOST_REGS;r++) {
6973             if(r!=EXCLUDE_REG) {
6974               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6975               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6976               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6977               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6978               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6979               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6980               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6981               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6982               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6983               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6984               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6985               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6986               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6987               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6988             }
6989           }
6990         }
6991         else
6992         {
6993           // Conditional branch
6994           will_dirty_i=0;
6995           wont_dirty_i=wont_dirty_next;
6996           // Merge in delay slot (will dirty)
6997           for(r=0;r<HOST_REGS;r++) {
6998             if(r!=EXCLUDE_REG) {
6999               if(!likely[i]) {
7000                 // Might not dirty if likely branch is not taken
7001                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7002                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7003                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7004                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7005                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7006                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7007                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7008                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7009                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7010                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7011                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7012                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7013                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7014                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7015               }
7016             }
7017           }
7018         }
7019         // Merge in delay slot (won't dirty)
7020         for(r=0;r<HOST_REGS;r++) {
7021           if(r!=EXCLUDE_REG) {
7022             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7023             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7024             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7025             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7026             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7027             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7028             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7029             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7030             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7031             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7032           }
7033         }
7034         if(wr) {
7035           #ifndef DESTRUCTIVE_WRITEBACK
7036           branch_regs[i].dirty&=wont_dirty_i;
7037           #endif
7038           branch_regs[i].dirty|=will_dirty_i;
7039         }
7040       }
7041       else
7042       {
7043         // Internal branch
7044         if(ba[i]<=start+i*4) {
7045           // Backward branch
7046           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7047           {
7048             // Unconditional branch
7049             temp_will_dirty=0;
7050             temp_wont_dirty=0;
7051             // Merge in delay slot (will dirty)
7052             for(r=0;r<HOST_REGS;r++) {
7053               if(r!=EXCLUDE_REG) {
7054                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7055                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7056                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7057                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7058                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7059                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7060                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7061                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7062                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7063                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7064                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7065                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7066                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7067                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7068               }
7069             }
7070           } else {
7071             // Conditional branch (not taken case)
7072             temp_will_dirty=will_dirty_next;
7073             temp_wont_dirty=wont_dirty_next;
7074             // Merge in delay slot (will dirty)
7075             for(r=0;r<HOST_REGS;r++) {
7076               if(r!=EXCLUDE_REG) {
7077                 if(!likely[i]) {
7078                   // Will not dirty if likely branch is not taken
7079                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7080                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7081                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7082                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7083                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7084                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7085                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7086                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7087                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7088                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7089                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7090                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7091                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7092                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7093                 }
7094               }
7095             }
7096           }
7097           // Merge in delay slot (won't dirty)
7098           for(r=0;r<HOST_REGS;r++) {
7099             if(r!=EXCLUDE_REG) {
7100               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7101               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7102               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7103               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7104               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7105               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7106               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7107               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7108               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7109               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7110             }
7111           }
7112           // Deal with changed mappings
7113           if(i<iend) {
7114             for(r=0;r<HOST_REGS;r++) {
7115               if(r!=EXCLUDE_REG) {
7116                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7117                   temp_will_dirty&=~(1<<r);
7118                   temp_wont_dirty&=~(1<<r);
7119                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7120                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7121                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7122                   } else {
7123                     temp_will_dirty|=1<<r;
7124                     temp_wont_dirty|=1<<r;
7125                   }
7126                 }
7127               }
7128             }
7129           }
7130           if(wr) {
7131             will_dirty[i]=temp_will_dirty;
7132             wont_dirty[i]=temp_wont_dirty;
7133             clean_registers((ba[i]-start)>>2,i-1,0);
7134           }else{
7135             // Limit recursion.  It can take an excessive amount
7136             // of time if there are a lot of nested loops.
7137             will_dirty[(ba[i]-start)>>2]=0;
7138             wont_dirty[(ba[i]-start)>>2]=-1;
7139           }
7140         }
7141         /*else*/ if(1)
7142         {
7143           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7144           {
7145             // Unconditional branch
7146             will_dirty_i=0;
7147             wont_dirty_i=0;
7148           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7149             for(r=0;r<HOST_REGS;r++) {
7150               if(r!=EXCLUDE_REG) {
7151                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7152                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7153                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7154                 }
7155               }
7156             }
7157           //}
7158             // Merge in delay slot
7159             for(r=0;r<HOST_REGS;r++) {
7160               if(r!=EXCLUDE_REG) {
7161                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7162                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7163                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7164                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7165                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7166                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7167                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7168                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7169                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7170                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7171                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7172                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7173                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7174                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7175               }
7176             }
7177           } else {
7178             // Conditional branch
7179             will_dirty_i=will_dirty_next;
7180             wont_dirty_i=wont_dirty_next;
7181           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7182             for(r=0;r<HOST_REGS;r++) {
7183               if(r!=EXCLUDE_REG) {
7184                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7185                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7186                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7187                 }
7188                 else
7189                 {
7190                   will_dirty_i&=~(1<<r);
7191                 }
7192                 // Treat delay slot as part of branch too
7193                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7194                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7195                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7196                 }
7197                 else
7198                 {
7199                   will_dirty[i+1]&=~(1<<r);
7200                 }*/
7201               }
7202             }
7203           //}
7204             // Merge in delay slot
7205             for(r=0;r<HOST_REGS;r++) {
7206               if(r!=EXCLUDE_REG) {
7207                 if(!likely[i]) {
7208                   // Might not dirty if likely branch is not taken
7209                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7210                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7211                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7212                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7213                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7214                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7215                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7216                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7217                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7218                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7219                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7220                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7221                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7222                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7223                 }
7224               }
7225             }
7226           }
7227           // Merge in delay slot
7228           for(r=0;r<HOST_REGS;r++) {
7229             if(r!=EXCLUDE_REG) {
7230               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7231               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7232               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7233               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7234               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7235               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7236               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7237               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7238               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7239               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7240             }
7241           }
7242           if(wr) {
7243             #ifndef DESTRUCTIVE_WRITEBACK
7244             branch_regs[i].dirty&=wont_dirty_i;
7245             #endif
7246             branch_regs[i].dirty|=will_dirty_i;
7247           }
7248         }
7249       }
7250     }
7251     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
7252     {
7253       // SYSCALL instruction (software interrupt)
7254       will_dirty_i=0;
7255       wont_dirty_i=0;
7256     }
7257     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7258     {
7259       // ERET instruction (return from interrupt)
7260       will_dirty_i=0;
7261       wont_dirty_i=0;
7262     }
7263     will_dirty_next=will_dirty_i;
7264     wont_dirty_next=wont_dirty_i;
7265     for(r=0;r<HOST_REGS;r++) {
7266       if(r!=EXCLUDE_REG) {
7267         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7268         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7269         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7270         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7271         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7272         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7273         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7274         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7275         if(i>istart) {
7276           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7277           {
7278             // Don't store a register immediately after writing it,
7279             // as that may prevent dual-issue.
7280             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7281             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7282           }
7283         }
7284       }
7285     }
7286     // Save it
7287     will_dirty[i]=will_dirty_i;
7288     wont_dirty[i]=wont_dirty_i;
7289     // Mark registers that won't be dirtied as not dirty
7290     if(wr) {
7291       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7292       for(r=0;r<HOST_REGS;r++) {
7293         if((will_dirty_i>>r)&1) {
7294           printf(" r%d",r);
7295         }
7296       }
7297       printf("\n");*/
7298
7299       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7300         regs[i].dirty|=will_dirty_i;
7301         #ifndef DESTRUCTIVE_WRITEBACK
7302         regs[i].dirty&=wont_dirty_i;
7303         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7304         {
7305           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7306             for(r=0;r<HOST_REGS;r++) {
7307               if(r!=EXCLUDE_REG) {
7308                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7309                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7310                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7311               }
7312             }
7313           }
7314         }
7315         else
7316         {
7317           if(i<iend) {
7318             for(r=0;r<HOST_REGS;r++) {
7319               if(r!=EXCLUDE_REG) {
7320                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7321                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7322                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7323               }
7324             }
7325           }
7326         }
7327         #endif
7328       //}
7329     }
7330     // Deal with changed mappings
7331     temp_will_dirty=will_dirty_i;
7332     temp_wont_dirty=wont_dirty_i;
7333     for(r=0;r<HOST_REGS;r++) {
7334       if(r!=EXCLUDE_REG) {
7335         int nr;
7336         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7337           if(wr) {
7338             #ifndef DESTRUCTIVE_WRITEBACK
7339             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7340             #endif
7341             regs[i].wasdirty|=will_dirty_i&(1<<r);
7342           }
7343         }
7344         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7345           // Register moved to a different register
7346           will_dirty_i&=~(1<<r);
7347           wont_dirty_i&=~(1<<r);
7348           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7349           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7350           if(wr) {
7351             #ifndef DESTRUCTIVE_WRITEBACK
7352             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7353             #endif
7354             regs[i].wasdirty|=will_dirty_i&(1<<r);
7355           }
7356         }
7357         else {
7358           will_dirty_i&=~(1<<r);
7359           wont_dirty_i&=~(1<<r);
7360           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7361             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7362             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7363           } else {
7364             wont_dirty_i|=1<<r;
7365             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r); assert(!((will_dirty>>r)&1));*/
7366           }
7367         }
7368       }
7369     }
7370   }
7371 }
7372
7373   /* disassembly */
7374 void disassemble_inst(int i)
7375 {
7376     if (bt[i]) printf("*"); else printf(" ");
7377     switch(itype[i]) {
7378       case UJUMP:
7379         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7380       case CJUMP:
7381         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7382       case SJUMP:
7383         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7384       case FJUMP:
7385         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7386       case RJUMP:
7387         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7388       case SPAN:
7389         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7390       case IMM16:
7391         if(opcode[i]==0xf) //LUI
7392           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7393         else
7394           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7395         break;
7396       case LOAD:
7397       case LOADLR:
7398         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7399         break;
7400       case STORE:
7401       case STORELR:
7402         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7403         break;
7404       case ALU:
7405       case SHIFT:
7406         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7407         break;
7408       case MULTDIV:
7409         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7410         break;
7411       case SHIFTIMM:
7412         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7413         break;
7414       case MOV:
7415         if((opcode2[i]&0x1d)==0x10)
7416           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7417         else if((opcode2[i]&0x1d)==0x11)
7418           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7419         else
7420           printf (" %x: %s\n",start+i*4,insn[i]);
7421         break;
7422       case COP0:
7423         if(opcode2[i]==0)
7424           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7425         else if(opcode2[i]==4)
7426           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7427         else printf (" %x: %s\n",start+i*4,insn[i]);
7428         break;
7429       case COP1:
7430         if(opcode2[i]<3)
7431           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7432         else if(opcode2[i]>3)
7433           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7434         else printf (" %x: %s\n",start+i*4,insn[i]);
7435         break;
7436       case C1LS:
7437         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7438         break;
7439       default:
7440         //printf (" %s %8x\n",insn[i],source[i]);
7441         printf (" %x: %s\n",start+i*4,insn[i]);
7442     }
7443 }
7444
7445 void new_dynarec_init()
7446 {
7447   printf("Init new dynarec\n");
7448   out=(u_char *)BASE_ADDR;
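  // Map the translation cache: a private, anonymous, read/write/execute
  // region of 1<<TARGET_SIZE_2 bytes fixed at BASE_ADDR; generated code is
  // emitted there through the out pointer.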
7449   if (mmap (out, 1<<TARGET_SIZE_2,
7450             PROT_READ | PROT_WRITE | PROT_EXEC,
7451             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7452             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7453 #ifdef MUPEN64
7454   rdword=&readmem_dword;
7455   fake_pc.f.r.rs=&readmem_dword;
7456   fake_pc.f.r.rt=&readmem_dword;
7457   fake_pc.f.r.rd=&readmem_dword;
7458 #endif
7459   int n;
7460   for(n=0x80000;n<0x80800;n++)
7461     invalid_code[n]=1;
7462   for(n=0;n<65536;n++)
7463     hash_table[n][0]=hash_table[n][2]=-1;
7464   memset(mini_ht,-1,sizeof(mini_ht));
7465   memset(restore_candidate,0,sizeof(restore_candidate));
7466   copy=shadow;
7467   expirep=16384; // Expiry pointer, +2 blocks
7468   pending_exception=0;
7469   literalcount=0;
7470 #ifdef HOST_IMM8
7471   // Copy this into local area so we don't have to put it in every literal pool
7472   invc_ptr=invalid_code;
7473 #endif
7474   stop_after_jal=0;
7475   // TLB
7476   using_tlb=0;
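  // memory_map[] holds one entry per 4KB guest page: a negative value marks
  // an unmapped page, otherwise the entry is (host_base-guest_base)>>2 so
  // that guest_addr+(memory_map[guest_addr>>12]<<2) yields the host address
  // (this is how the TLB path in new_recompile_block resolves source below).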
7477   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7478     memory_map[n]=-1;
7479   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7480     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7481   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7482     memory_map[n]=-1;
7483 #ifdef MUPEN64
7484   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7485     writemem[n] = write_nomem_new;
7486     writememb[n] = write_nomemb_new;
7487     writememh[n] = write_nomemh_new;
7488 #ifndef FORCE32
7489     writememd[n] = write_nomemd_new;
7490 #endif
7491     readmem[n] = read_nomem_new;
7492     readmemb[n] = read_nomemb_new;
7493     readmemh[n] = read_nomemh_new;
7494 #ifndef FORCE32
7495     readmemd[n] = read_nomemd_new;
7496 #endif
7497   }
7498   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7499     writemem[n] = write_rdram_new;
7500     writememb[n] = write_rdramb_new;
7501     writememh[n] = write_rdramh_new;
7502 #ifndef FORCE32
7503     writememd[n] = write_rdramd_new;
7504 #endif
7505   }
7506   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7507     writemem[n] = write_nomem_new;
7508     writememb[n] = write_nomemb_new;
7509     writememh[n] = write_nomemh_new;
7510 #ifndef FORCE32
7511     writememd[n] = write_nomemd_new;
7512 #endif
7513     readmem[n] = read_nomem_new;
7514     readmemb[n] = read_nomemb_new;
7515     readmemh[n] = read_nomemh_new;
7516 #ifndef FORCE32
7517     readmemd[n] = read_nomemd_new;
7518 #endif
7519   }
7520 #endif
7521   tlb_hacks();
7522   arch_init();
7523 }
7524
7525 void new_dynarec_cleanup()
7526 {
7527   int n;
7528   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7529   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7530   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7531   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7532   #ifdef ROM_COPY
7533   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7534   #endif
7535 }
7536
7537 int new_recompile_block(int addr)
7538 {
7539 /*
7540   if(addr==0x800cd050) {
7541     int block;
7542     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7543     int n;
7544     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7545   }
7546 */
7547   //if(Count==365117028) tracedebug=1;
7548   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7549   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7550   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7551   //if(debug) 
7552   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7553   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7554   /*if(Count>=312978186) {
7555     rlist();
7556   }*/
7557   //rlist();
7558   start = (u_int)addr&~3;
7559   //assert(((u_int)addr&1)==0);
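  // Work out where this block's code lives on the host: select the host
  // pointer used for source[] and an address limit (pagelimit) according to
  // which guest region addr falls in.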
7560 #ifdef PCSX
7561   if (Config.HLE && start == 0x80001000) {
7562     // XXX: is this enough? Maybe check hleSoftCall?
7563     u_int page=get_page(start);
7564     ll_add(jump_in+page,start,out);
7565     invalid_code[start>>12]=0;
7566     emit_movimm(start,0);
7567     emit_writeword(0,(int)&pcaddr);
7568     emit_jmp((int)new_dyna_leave); // enough??
7569     return 0;
7570   }
7571   else if ((u_int)addr < 0x00200000) {
7572     // used for BIOS calls mostly?
7573     source = (u_int *)((u_int)rdram+start-0);
7574     pagelimit = 0x00200000;
7575   }
7576   else
7577 #endif
7578 #ifdef MUPEN64
7579   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7580     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7581     pagelimit = 0xa4001000;
7582   }
7583   else
7584 #endif
7585   if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7586     source = (u_int *)((u_int)rdram+start-0x80000000);
7587     pagelimit = 0x80800000;
7588   }
7589 #ifndef DISABLE_TLB
7590   else if ((signed int)addr >= (signed int)0xC0000000) {
7591     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7592     //if(tlb_LUT_r[start>>12])
7593       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7594     if((signed int)memory_map[start>>12]>=0) {
7595       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7596       pagelimit=(start+4096)&0xFFFFF000;
7597       int map=memory_map[start>>12];
7598       int i;
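      // Extend pagelimit over up to five following pages as long as they
      // share the same mapping as the first page (the 0xBFFFFFFF mask
      // ignores bit 30 of the mapping when comparing), so a block may cross
      // page boundaries within a contiguously mapped region.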
7599       for(i=0;i<5;i++) {
7600         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7601         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7602       }
7603       assem_debug("pagelimit=%x\n",pagelimit);
7604       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7605     }
7606     else {
7607       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7608       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7609       return 1; // Caller will invoke exception handler
7610     }
7611     //printf("source= %x\n",(int)source);
7612   }
7613 #endif
7614   else {
7615     printf("Compile at bogus memory address: %x \n", (int)addr);
7616     exit(1);
7617   }
7618
7619   /* Pass 1: disassemble */
7620   /* Pass 2: register dependencies, branch targets */
7621   /* Pass 3: register allocation */
7622   /* Pass 4: branch dependencies */
7623   /* Pass 5: pre-alloc */
7624   /* Pass 6: optimize clean/dirty state */
7625   /* Pass 7: flag 32-bit registers */
7626   /* Pass 8: assembly */
7627   /* Pass 9: linker */
7628   /* Pass 10: garbage collection / free memory */
7629
7630   int i,j;
7631   int done=0;
7632   unsigned int type,op,op2;
7633
7634   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7635   
7636   /* Pass 1 disassembly */
7637
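  // Decode each instruction: record its mnemonic in insn[], its class in
  // itype[], the opcode fields in opcode[]/opcode2[], and (further below)
  // the source/target registers and immediate used by the later passes.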
7638   for(i=0;!done;i++) {
7639     bt[i]=0;likely[i]=0;op2=0;
7640     opcode[i]=op=source[i]>>26;
7641     switch(op)
7642     {
7643       case 0x00: strcpy(insn[i],"special"); type=NI;
7644         op2=source[i]&0x3f;
7645         switch(op2)
7646         {
7647           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7648           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7649           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7650           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7651           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7652           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7653           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7654           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7655           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7656           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7657           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7658           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7659           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7660           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7661           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7662           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7663           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7664           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7665           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7666           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7667           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7668           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7669           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7670           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7671           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7672           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7673           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7674           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7675           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7676           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7677           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7678           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7679           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7680           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7681           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7682           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7683           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7684           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7685           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7686           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7687           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7688           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7689           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7690           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7691           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7692           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7693           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7694           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7695           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7696           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7697           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7698           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7699         }
7700         break;
7701       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7702         op2=(source[i]>>16)&0x1f;
7703         switch(op2)
7704         {
7705           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7706           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7707           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7708           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7709           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7710           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7711           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7712           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7713           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7714           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7715           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7716           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7717           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7718           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7719         }
7720         break;
7721       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7722       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7723       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7724       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7725       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7726       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7727       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7728       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7729       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7730       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7731       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7732       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7733       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7734       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7735       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7736         op2=(source[i]>>21)&0x1f;
7737         switch(op2)
7738         {
7739           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7740           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7741           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7742           switch(source[i]&0x3f)
7743           {
7744             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7745             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7746             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7747             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7748             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7749           }
7750         }
7751         break;
7752       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7753         op2=(source[i]>>21)&0x1f;
7754         switch(op2)
7755         {
7756           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7757           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7758           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7759           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7760           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7761           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7762           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7763           switch((source[i]>>16)&0x3)
7764           {
7765             case 0x00: strcpy(insn[i],"BC1F"); break;
7766             case 0x01: strcpy(insn[i],"BC1T"); break;
7767             case 0x02: strcpy(insn[i],"BC1FL"); break;
7768             case 0x03: strcpy(insn[i],"BC1TL"); break;
7769           }
7770           break;
7771           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7772           switch(source[i]&0x3f)
7773           {
7774             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7775             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7776             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7777             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7778             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7779             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7780             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7781             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7782             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7783             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7784             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7785             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7786             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7787             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7788             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7789             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7790             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7791             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7792             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7793             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7794             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7795             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7796             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7797             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7798             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7799             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7800             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7801             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7802             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7803             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7804             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7805             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7806             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7807             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7808             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7809           }
7810           break;
7811           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7812           switch(source[i]&0x3f)
7813           {
7814             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7815             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7816             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7817             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7818             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7819             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7820             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7821             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7822             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7823             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7824             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7825             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7826             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7827             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7828             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7829             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7830             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7831             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7832             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7833             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7834             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7835             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7836             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7837             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7838             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7839             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7840             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7841             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7842             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7843             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7844             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7845             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7846             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7847             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7848             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7849           }
7850           break;
7851           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7852           switch(source[i]&0x3f)
7853           {
7854             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7855             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7856           }
7857           break;
7858           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7859           switch(source[i]&0x3f)
7860           {
7861             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7862             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7863           }
7864           break;
7865         }
7866         break;
7867       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7868       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7869       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7870       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7871       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7872       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7873       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7874       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7875       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7876       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7877       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7878       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7879       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7880       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7881       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7882       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7883       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7884       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7885       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7886       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7887       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7888       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7889       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7890       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7891       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7892       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7893       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7894       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7895       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7896       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7897       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7898 #ifdef PCSX
7899       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7900 #endif
7901       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7902       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7903       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7904       default: strcpy(insn[i],"???"); type=NI;
7905         printf("NI %08x @%08x\n", source[i], addr + i*4);
7906         break;
7907     }
7908     itype[i]=type;
7909     opcode2[i]=op2;
7910     /* Get registers/immediates */
7911     lt1[i]=0;
7912     us1[i]=0;
7913     us2[i]=0;
7914     dep1[i]=0;
7915     dep2[i]=0;
7916     switch(type) {
7917       case LOAD:
7918         rs1[i]=(source[i]>>21)&0x1f;
7919         rs2[i]=0;
7920         rt1[i]=(source[i]>>16)&0x1f;
7921         rt2[i]=0;
7922         imm[i]=(short)source[i];
7923         break;
7924       case STORE:
7925       case STORELR:
7926         rs1[i]=(source[i]>>21)&0x1f;
7927         rs2[i]=(source[i]>>16)&0x1f;
7928         rt1[i]=0;
7929         rt2[i]=0;
7930         imm[i]=(short)source[i];
7931         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7932         break;
7933       case LOADLR:
7934         // LWL/LWR only load part of the register,
7935         // therefore the target register must be treated as a source too
7936         rs1[i]=(source[i]>>21)&0x1f;
7937         rs2[i]=(source[i]>>16)&0x1f;
7938         rt1[i]=(source[i]>>16)&0x1f;
7939         rt2[i]=0;
7940         imm[i]=(short)source[i];
7941         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7942         if(op==0x26) dep1[i]=rt1[i]; // LWR
7943         break;
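        // Annotation (a sketch of standard MIPS semantics): LWL/LWR and
        // LDL/LDR replace only some bytes of rt (which bytes depends on the
        // address alignment and endianness), so the previous contents of rt
        // are partially preserved.  That is why the destination is also
        // recorded as a source (rs2) above.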
7944       case IMM16:
7945         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7946         else rs1[i]=(source[i]>>21)&0x1f;
7947         rs2[i]=0;
7948         rt1[i]=(source[i]>>16)&0x1f;
7949         rt2[i]=0;
7950         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7951           imm[i]=(unsigned short)source[i];
7952         }else{
7953           imm[i]=(short)source[i];
7954         }
7955         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7956         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7957         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7958         break;
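        // Immediate example: ANDI/ORI/XORI zero-extend their 16-bit field,
        // so 0x8000 becomes 0x00008000, while ADDIU/SLTI/etc. sign-extend it
        // to 0xFFFF8000; the (unsigned short) vs (short) casts above encode
        // exactly that distinction.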
7959       case UJUMP:
7960         rs1[i]=0;
7961         rs2[i]=0;
7962         rt1[i]=0;
7963         rt2[i]=0;
7964         // The JAL instruction writes to r31.
7965         if (op&1) {
7966           rt1[i]=31;
7967         }
7968         rs2[i]=CCREG;
7969         break;
7970       case RJUMP:
7971         rs1[i]=(source[i]>>21)&0x1f;
7972         rs2[i]=0;
7973         rt1[i]=0;
7974         rt2[i]=0;
7975         // JALR writes the return address to rd (assumed here to be r31).
7976         if (op2&1) {
7977           rt1[i]=31;   
7978         }
7979         rs2[i]=CCREG;
7980         break;
7981       case CJUMP:
7982         rs1[i]=(source[i]>>21)&0x1f;
7983         rs2[i]=(source[i]>>16)&0x1f;
7984         rt1[i]=0;
7985         rt2[i]=0;
7986         if(op&2) { // BGTZ/BLEZ
7987           rs2[i]=0;
7988         }
7989         us1[i]=rs1[i];
7990         us2[i]=rs2[i];
7991         likely[i]=op>>4;
7992         break;
7993       case SJUMP:
7994         rs1[i]=(source[i]>>21)&0x1f;
7995         rs2[i]=CCREG;
7996         rt1[i]=0;
7997         rt2[i]=0;
7998         us1[i]=rs1[i];
7999         if(op2&0x10) { // BxxAL
8000           rt1[i]=31;
8001           // NOTE: If the branch is not taken, r31 is still overwritten
8002         }
8003         likely[i]=(op2&2)>>1;
8004         break;
8005       case FJUMP:
8006         rs1[i]=FSREG;
8007         rs2[i]=CSREG;
8008         rt1[i]=0;
8009         rt2[i]=0;
8010         likely[i]=((source[i])>>17)&1;
8011         break;
8012       case ALU:
8013         rs1[i]=(source[i]>>21)&0x1f; // source
8014         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8015         rt1[i]=(source[i]>>11)&0x1f; // destination
8016         rt2[i]=0;
8017         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8018           us1[i]=rs1[i];us2[i]=rs2[i];
8019         }
8020         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8021           dep1[i]=rs1[i];dep2[i]=rs2[i];
8022         }
8023         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8024           dep1[i]=rs1[i];dep2[i]=rs2[i];
8025         }
8026         break;
8027       case MULTDIV:
8028         rs1[i]=(source[i]>>21)&0x1f; // source
8029         rs2[i]=(source[i]>>16)&0x1f; // divisor
8030         rt1[i]=HIREG;
8031         rt2[i]=LOREG;
8032         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8033           us1[i]=rs1[i];us2[i]=rs2[i];
8034         }
8035         break;
8036       case MOV:
8037         rs1[i]=0;
8038         rs2[i]=0;
8039         rt1[i]=0;
8040         rt2[i]=0;
8041         if(op2==0x10) rs1[i]=HIREG; // MFHI
8042         if(op2==0x11) rt1[i]=HIREG; // MTHI
8043         if(op2==0x12) rs1[i]=LOREG; // MFLO
8044         if(op2==0x13) rt1[i]=LOREG; // MTLO
8045         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8046         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8047         dep1[i]=rs1[i];
8048         break;
8049       case SHIFT:
8050         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8051         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8052         rt1[i]=(source[i]>>11)&0x1f; // destination
8053         rt2[i]=0;
8054         // DSLLV/DSRLV/DSRAV are 64-bit
8055         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8056         break;
8057       case SHIFTIMM:
8058         rs1[i]=(source[i]>>16)&0x1f;
8059         rs2[i]=0;
8060         rt1[i]=(source[i]>>11)&0x1f;
8061         rt2[i]=0;
8062         imm[i]=(source[i]>>6)&0x1f;
8063         // DSxx32 instructions
8064         if(op2>=0x3c) imm[i]|=0x20;
8065         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8066         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8067         break;
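        // Example: DSRL32 rd,rt,4 encodes a shift amount of 4, and the
        // imm[i]|=0x20 above turns it into the effective shift of 36
        // (the "+32" implied by the DSxx32 forms).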
8068       case COP0:
8069         rs1[i]=0;
8070         rs2[i]=0;
8071         rt1[i]=0;
8072         rt2[i]=0;
8073         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8074         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8075         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8076         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8077         break;
8078       case COP1:
8079         rs1[i]=0;
8080         rs2[i]=0;
8081         rt1[i]=0;
8082         rt2[i]=0;
8083         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8084         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8085         if(op2==5) us1[i]=rs1[i]; // DMTC1
8086         rs2[i]=CSREG;
8087         break;
8088       case C1LS:
8089         rs1[i]=(source[i]>>21)&0x1F;
8090         rs2[i]=CSREG;
8091         rt1[i]=0;
8092         rt2[i]=0;
8093         imm[i]=(short)source[i];
8094         break;
8095       case FLOAT:
8096       case FCONV:
8097         rs1[i]=0;
8098         rs2[i]=CSREG;
8099         rt1[i]=0;
8100         rt2[i]=0;
8101         break;
8102       case FCOMP:
8103         rs1[i]=FSREG;
8104         rs2[i]=CSREG;
8105         rt1[i]=FSREG;
8106         rt2[i]=0;
8107         break;
8108       case SYSCALL:
8109       case HLECALL:
8110         rs1[i]=CCREG;
8111         rs2[i]=0;
8112         rt1[i]=0;
8113         rt2[i]=0;
8114         break;
8115       default:
8116         rs1[i]=0;
8117         rs2[i]=0;
8118         rt1[i]=0;
8119         rt2[i]=0;
8120     }
8121     /* Calculate branch target addresses */
8122     if(type==UJUMP)
8123       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8124     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8125       ba[i]=start+i*4+8; // Ignore never taken branch
8126     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8127       ba[i]=start+i*4+8; // Ignore never taken branch
8128     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8129       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8130     else ba[i]=-1;
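    // Worked example of the arithmetic above: for J/JAL the 26-bit index is
    // scaled by 4 ((source<<6)>>4 drops the opcode bits and shifts left by 2)
    // and combined with the top 4 bits of the delay slot address.  For
    // BEQ/BNE/BLEZ/etc. the 16-bit offset is sign-extended and scaled by 4,
    // so an offset of 0xFFFF (-1) gives ba = start+i*4, i.e. the branch
    // itself (a typical idle loop), and 0x0001 targets the instruction just
    // past the delay slot.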
8131     /* Is this the end of the block? */
8132     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8133       if(rt1[i-1]!=31) { // Continue past subroutine call (JAL)
8134         done=1;
8135         // Does the block continue due to a branch?
8136         for(j=i-1;j>=0;j--)
8137         {
8138           if(ba[j]==start+i*4+4) done=j=0;
8139           if(ba[j]==start+i*4+8) done=j=0;
8140         }
8141       }
8142       else {
8143         if(stop_after_jal) done=1;
8144         // Stop on BREAK
8145         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8146       }
8147       // Don't recompile stuff that's already compiled
8148       if(check_addr(start+i*4+4)) done=1;
8149       // Don't get too close to the limit
8150       if(i>MAXBLOCK/2) done=1;
8151     }
8152     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8153     if(i>0&&itype[i-1]==HLECALL) done=1;
8154     assert(i<MAXBLOCK-1);
8155     if(start+i*4==pagelimit-4) done=1;
8156     assert(start+i*4<pagelimit);
8157     if (i==MAXBLOCK-1) done=1;
8158     // Stop if we're compiling junk
8159     if(itype[i]==NI&&opcode[i]==0x11) {
8160       done=stop_after_jal=1;
8161       printf("Disabled speculative precompilation\n");
8162     }
8163   }
8164   slen=i;
8165   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8166     if(start+i*4==pagelimit) {
8167       itype[i-1]=SPAN;
8168     }
8169   }
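  // A branch sitting exactly at the page limit (so that its delay slot falls
  // into the next page) is retyped as SPAN and handled by the dedicated
  // page-spanning allocation path (pagespan_alloc below) instead of the
  // normal branch handling.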
8170   assert(slen>0);
8171
8172   /* Pass 2 - Register dependencies and branch targets */
8173
8174   unneeded_registers(0,slen-1,0);
8175   
8176   /* Pass 3 - Register allocation */
8177
8178   struct regstat current; // Current register allocations/status
8179   current.is32=1;
8180   current.dirty=0;
8181   current.u=unneeded_reg[0];
8182   current.uu=unneeded_reg_upper[0];
8183   clear_all_regs(current.regmap);
8184   alloc_reg(&current,0,CCREG);
8185   dirty_reg(&current,CCREG);
8186   current.isconst=0;
8187   current.wasconst=0;
8188   int ds=0;
8189   int cc=0;
8190   int hr;
8191   
8192   provisional_32bit();
8193   
8194   if((u_int)addr&1) {
8195     // First instruction is delay slot
8196     cc=-1;
8197     bt[1]=1;
8198     ds=1;
8199     unneeded_reg[0]=1;
8200     unneeded_reg_upper[0]=1;
8201     current.regmap[HOST_BTREG]=BTREG;
8202   }
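  // Note: bit 0 of the requested address is used as a flag meaning "this
  // block starts in a branch delay slot".  In that case the branch target is
  // presumably handed over from the parent block in BTREG (hence HOST_BTREG
  // is reserved here), and only $zero is marked unneeded for the first
  // instruction.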
8203   
8204   for(i=0;i<slen;i++)
8205   {
8206     if(bt[i])
8207     {
8208       int hr;
8209       for(hr=0;hr<HOST_REGS;hr++)
8210       {
8211         // Is this really necessary?
8212         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8213       }
8214       current.isconst=0;
8215     }
8216     if(i>1)
8217     {
8218       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8219       {
8220         if(rs1[i-2]==0||rs2[i-2]==0)
8221         {
8222           if(rs1[i-2]) {
8223             current.is32|=1LL<<rs1[i-2];
8224             int hr=get_reg(current.regmap,rs1[i-2]|64);
8225             if(hr>=0) current.regmap[hr]=-1;
8226           }
8227           if(rs2[i-2]) {
8228             current.is32|=1LL<<rs2[i-2];
8229             int hr=get_reg(current.regmap,rs2[i-2]|64);
8230             if(hr>=0) current.regmap[hr]=-1;
8231           }
8232         }
8233       }
8234     }
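    // The BNE/BNEL check above exploits the fall-through path: if a
    // BNE rX,$0 two instructions back was not taken, rX compared equal to
    // zero, so its value is 32-bit clean and the host register holding its
    // upper half can be released.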
8235     // If something jumps here with 64-bit values
8236     // then promote those registers to 64 bits
8237     if(bt[i])
8238     {
8239       uint64_t temp_is32=current.is32;
8240       for(j=i-1;j>=0;j--)
8241       {
8242         if(ba[j]==start+i*4) 
8243           temp_is32&=branch_regs[j].is32;
8244       }
8245       for(j=i;j<slen;j++)
8246       {
8247         if(ba[j]==start+i*4) 
8248           //temp_is32=1;
8249           temp_is32&=p32[j];
8250       }
8251       if(temp_is32!=current.is32) {
8252         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8253         #ifdef DESTRUCTIVE_WRITEBACK
8254         for(hr=0;hr<HOST_REGS;hr++)
8255         {
8256           int r=current.regmap[hr];
8257           if(r>0&&r<64)
8258           {
8259             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8260               temp_is32|=1LL<<r;
8261               //printf("restore %d\n",r);
8262             }
8263           }
8264         }
8265         #endif
8266         current.is32=temp_is32;
8267       }
8268     }
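    // temp_is32 above is ANDed with the is32 set of every branch that targets
    // this instruction (backward branches via branch_regs, forward ones via
    // the provisional p32 map), so a register keeps its 32-bit status only if
    // it is 32-bit on every incoming path; everything else is widened back to
    // 64 bits at this branch target.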
8269 #ifdef FORCE32
8270     memset(p32, 0xff, sizeof(p32));
8271     current.is32=-1LL;
8272 #endif
8273
8274     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8275     regs[i].wasconst=current.isconst;
8276     regs[i].was32=current.is32;
8277     regs[i].wasdirty=current.dirty;
8278     #ifdef DESTRUCTIVE_WRITEBACK
8279     // To change a dirty register from 32 to 64 bits, we must write
8280     // it out during the previous cycle (for branches, 2 cycles)
8281     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8282     {
8283       uint64_t temp_is32=current.is32;
8284       for(j=i-1;j>=0;j--)
8285       {
8286         if(ba[j]==start+i*4+4) 
8287           temp_is32&=branch_regs[j].is32;
8288       }
8289       for(j=i;j<slen;j++)
8290       {
8291         if(ba[j]==start+i*4+4) 
8292           //temp_is32=1;
8293           temp_is32&=p32[j];
8294       }
8295       if(temp_is32!=current.is32) {
8296         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8297         for(hr=0;hr<HOST_REGS;hr++)
8298         {
8299           int r=current.regmap[hr];
8300           if(r>0)
8301           {
8302             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8303               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8304               {
8305                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8306                 {
8307                   //printf("dump %d/r%d\n",hr,r);
8308                   current.regmap[hr]=-1;
8309                   if(get_reg(current.regmap,r|64)>=0) 
8310                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8311                 }
8312               }
8313             }
8314           }
8315         }
8316       }
8317     }
8318     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8319     {
8320       uint64_t temp_is32=current.is32;
8321       for(j=i-1;j>=0;j--)
8322       {
8323         if(ba[j]==start+i*4+8) 
8324           temp_is32&=branch_regs[j].is32;
8325       }
8326       for(j=i;j<slen;j++)
8327       {
8328         if(ba[j]==start+i*4+8) 
8329           //temp_is32=1;
8330           temp_is32&=p32[j];
8331       }
8332       if(temp_is32!=current.is32) {
8333         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8334         for(hr=0;hr<HOST_REGS;hr++)
8335         {
8336           int r=current.regmap[hr];
8337           if(r>0)
8338           {
8339             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8340               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8341               {
8342                 //printf("dump %d/r%d\n",hr,r);
8343                 current.regmap[hr]=-1;
8344                 if(get_reg(current.regmap,r|64)>=0) 
8345                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8346               }
8347             }
8348           }
8349         }
8350       }
8351     }
8352     #endif
8353     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8354       if(i+1<slen) {
8355         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8356         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8357         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8358         current.u|=1;
8359         current.uu|=1;
8360       } else {
8361         current.u=1;
8362         current.uu=1;
8363       }
8364     } else {
8365       if(i+1<slen) {
8366         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8367         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8368         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8369         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8370         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8371         current.u|=1;
8372         current.uu|=1;
8373       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8374     }
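    // current.u / current.uu are "unneeded" bitmasks over the 32 GPRs and
    // their upper halves: bit r set means r's value does not have to be kept
    // past this point.  Sources of the instruction (and, for a branch, of its
    // delay slot) are cleared from the masks because they are still read, and
    // bit 0 stays set since $zero never needs to be preserved.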
8375     is_ds[i]=ds;
8376     if(ds) {
8377       ds=0; // Skip delay slot, already allocated as part of branch
8378       // ...but we need to alloc it in case something jumps here
8379       if(i+1<slen) {
8380         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8381         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8382       }else{
8383         current.u=branch_unneeded_reg[i-1];
8384         current.uu=branch_unneeded_reg_upper[i-1];
8385       }
8386       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8387       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8388       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8389       current.u|=1;
8390       current.uu|=1;
8391       struct regstat temp;
8392       memcpy(&temp,&current,sizeof(current));
8393       temp.wasdirty=temp.dirty;
8394       temp.was32=temp.is32;
8395       // TODO: Take into account unconditional branches, as below
8396       delayslot_alloc(&temp,i);
8397       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8398       regs[i].wasdirty=temp.wasdirty;
8399       regs[i].was32=temp.was32;
8400       regs[i].dirty=temp.dirty;
8401       regs[i].is32=temp.is32;
8402       regs[i].isconst=0;
8403       regs[i].wasconst=0;
8404       current.isconst=0;
8405       // Create entry (branch target) regmap
8406       for(hr=0;hr<HOST_REGS;hr++)
8407       {
8408         int r=temp.regmap[hr];
8409         if(r>=0) {
8410           if(r!=regmap_pre[i][hr]) {
8411             regs[i].regmap_entry[hr]=-1;
8412           }
8413           else
8414           {
8415             if(r<64){
8416               if((current.u>>r)&1) {
8417                 regs[i].regmap_entry[hr]=-1;
8418                 regs[i].regmap[hr]=-1;
8419                 // Don't clear regs in the delay slot as the branch might need them
8420                 //current.regmap[hr]=-1;
8421               }else
8422                 regs[i].regmap_entry[hr]=r;
8423             }
8424             else {
8425               if((current.uu>>(r&63))&1) {
8426                 regs[i].regmap_entry[hr]=-1;
8427                 regs[i].regmap[hr]=-1;
8428                 // Don't clear regs in the delay slot as the branch might need them
8429                 //current.regmap[hr]=-1;
8430               }else
8431                 regs[i].regmap_entry[hr]=r;
8432             }
8433           }
8434         } else {
8435           // First instruction expects CCREG to be allocated
8436           if(i==0&&hr==HOST_CCREG) 
8437             regs[i].regmap_entry[hr]=CCREG;
8438           else
8439             regs[i].regmap_entry[hr]=-1;
8440         }
8441       }
8442     }
8443     else { // Not delay slot
8444       switch(itype[i]) {
8445         case UJUMP:
8446           //current.isconst=0; // DEBUG
8447           //current.wasconst=0; // DEBUG
8448           //regs[i].wasconst=0; // DEBUG
8449           clear_const(&current,rt1[i]);
8450           alloc_cc(&current,i);
8451           dirty_reg(&current,CCREG);
8452           if (rt1[i]==31) {
8453             alloc_reg(&current,i,31);
8454             dirty_reg(&current,31);
8455             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8456             #ifdef REG_PREFETCH
8457             alloc_reg(&current,i,PTEMP);
8458             #endif
8459             //current.is32|=1LL<<rt1[i];
8460           }
8461           delayslot_alloc(&current,i+1);
8462           //current.isconst=0; // DEBUG
8463           ds=1;
8464           //printf("i=%d, isconst=%x\n",i,current.isconst);
8465           break;
8466         case RJUMP:
8467           //current.isconst=0;
8468           //current.wasconst=0;
8469           //regs[i].wasconst=0;
8470           clear_const(&current,rs1[i]);
8471           clear_const(&current,rt1[i]);
8472           alloc_cc(&current,i);
8473           dirty_reg(&current,CCREG);
8474           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8475             alloc_reg(&current,i,rs1[i]);
8476             if (rt1[i]==31) {
8477               alloc_reg(&current,i,31);
8478               dirty_reg(&current,31);
8479               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8480               #ifdef REG_PREFETCH
8481               alloc_reg(&current,i,PTEMP);
8482               #endif
8483             }
8484             #ifdef USE_MINI_HT
8485             if(rs1[i]==31) { // JALR
8486               alloc_reg(&current,i,RHASH);
8487               #ifndef HOST_IMM_ADDR32
8488               alloc_reg(&current,i,RHTBL);
8489               #endif
8490             }
8491             #endif
8492             delayslot_alloc(&current,i+1);
8493           } else {
8494             // The delay slot overwrites our source register,
8495             // so allocate a temporary register to hold the old value.
8496             current.isconst=0;
8497             current.wasconst=0;
8498             regs[i].wasconst=0;
8499             delayslot_alloc(&current,i+1);
8500             current.isconst=0;
8501             alloc_reg(&current,i,RTEMP);
8502           }
8503           //current.isconst=0; // DEBUG
8504           ds=1;
8505           break;
8506         case CJUMP:
8507           //current.isconst=0;
8508           //current.wasconst=0;
8509           //regs[i].wasconst=0;
8510           clear_const(&current,rs1[i]);
8511           clear_const(&current,rs2[i]);
8512           if((opcode[i]&0x3E)==4) // BEQ/BNE
8513           {
8514             alloc_cc(&current,i);
8515             dirty_reg(&current,CCREG);
8516             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8517             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8518             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8519             {
8520               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8521               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8522             }
8523             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8524                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8525               // The delay slot overwrites one of our conditions.
8526               // Allocate the branch condition registers instead.
8527               // Note that such a sequence of instructions could
8528               // be considered a bug since the branch cannot be
8529               // re-executed if an exception occurs.
8530               current.isconst=0;
8531               current.wasconst=0;
8532               regs[i].wasconst=0;
8533               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8534               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8535               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8536               {
8537                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8538                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8539               }
8540             }
8541             else delayslot_alloc(&current,i+1);
8542           }
8543           else
8544           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8545           {
8546             alloc_cc(&current,i);
8547             dirty_reg(&current,CCREG);
8548             alloc_reg(&current,i,rs1[i]);
8549             if(!(current.is32>>rs1[i]&1))
8550             {
8551               alloc_reg64(&current,i,rs1[i]);
8552             }
8553             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8554               // The delay slot overwrites one of our conditions.
8555               // Allocate the branch condition registers instead.
8556               // Note that such a sequence of instructions could
8557               // be considered a bug since the branch cannot be
8558               // re-executed if an exception occurs.
8559               current.isconst=0;
8560               current.wasconst=0;
8561               regs[i].wasconst=0;
8562               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8563               if(!((current.is32>>rs1[i])&1))
8564               {
8565                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8566               }
8567             }
8568             else delayslot_alloc(&current,i+1);
8569           }
8570           else
8571           // Don't alloc the delay slot yet because we might not execute it
8572           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8573           {
8574             current.isconst=0;
8575             current.wasconst=0;
8576             regs[i].wasconst=0;
8577             alloc_cc(&current,i);
8578             dirty_reg(&current,CCREG);
8579             alloc_reg(&current,i,rs1[i]);
8580             alloc_reg(&current,i,rs2[i]);
8581             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8582             {
8583               alloc_reg64(&current,i,rs1[i]);
8584               alloc_reg64(&current,i,rs2[i]);
8585             }
8586           }
8587           else
8588           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8589           {
8590             current.isconst=0;
8591             current.wasconst=0;
8592             regs[i].wasconst=0;
8593             alloc_cc(&current,i);
8594             dirty_reg(&current,CCREG);
8595             alloc_reg(&current,i,rs1[i]);
8596             if(!(current.is32>>rs1[i]&1))
8597             {
8598               alloc_reg64(&current,i,rs1[i]);
8599             }
8600           }
8601           ds=1;
8602           //current.isconst=0;
8603           break;
8604         case SJUMP:
8605           //current.isconst=0;
8606           //current.wasconst=0;
8607           //regs[i].wasconst=0;
8608           clear_const(&current,rs1[i]);
8609           clear_const(&current,rt1[i]);
8610           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8611           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8612           {
8613             alloc_cc(&current,i);
8614             dirty_reg(&current,CCREG);
8615             alloc_reg(&current,i,rs1[i]);
8616             if(!(current.is32>>rs1[i]&1))
8617             {
8618               alloc_reg64(&current,i,rs1[i]);
8619             }
8620             if (rt1[i]==31) { // BLTZAL/BGEZAL
8621               alloc_reg(&current,i,31);
8622               dirty_reg(&current,31);
8623               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8624               //#ifdef REG_PREFETCH
8625               //alloc_reg(&current,i,PTEMP);
8626               //#endif
8627               //current.is32|=1LL<<rt1[i];
8628             }
8629             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8630               // The delay slot overwrites the branch condition.
8631               // Allocate the branch condition registers instead.
8632               // Note that such a sequence of instructions could
8633               // be considered a bug since the branch cannot be
8634               // re-executed if an exception occurs.
8635               current.isconst=0;
8636               current.wasconst=0;
8637               regs[i].wasconst=0;
8638               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8639               if(!((current.is32>>rs1[i])&1))
8640               {
8641                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8642               }
8643             }
8644             else delayslot_alloc(&current,i+1);
8645           }
8646           else
8647           // Don't alloc the delay slot yet because we might not execute it
8648           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8649           {
8650             current.isconst=0;
8651             current.wasconst=0;
8652             regs[i].wasconst=0;
8653             alloc_cc(&current,i);
8654             dirty_reg(&current,CCREG);
8655             alloc_reg(&current,i,rs1[i]);
8656             if(!(current.is32>>rs1[i]&1))
8657             {
8658               alloc_reg64(&current,i,rs1[i]);
8659             }
8660           }
8661           ds=1;
8662           //current.isconst=0;
8663           break;
8664         case FJUMP:
8665           current.isconst=0;
8666           current.wasconst=0;
8667           regs[i].wasconst=0;
8668           if(likely[i]==0) // BC1F/BC1T
8669           {
8670             // TODO: Theoretically we can run out of registers here on x86.
8671             // The delay slot can allocate up to six, and we need to check
8672             // CSREG before executing the delay slot.  Possibly we can drop
8673             // the cycle count and then reload it after checking that the
8674             // FPU is in a usable state, or don't do out-of-order execution.
8675             alloc_cc(&current,i);
8676             dirty_reg(&current,CCREG);
8677             alloc_reg(&current,i,FSREG);
8678             alloc_reg(&current,i,CSREG);
8679             if(itype[i+1]==FCOMP) {
8680               // The delay slot overwrites the branch condition.
8681               // Allocate the branch condition registers instead.
8682               // Note that such a sequence of instructions could
8683               // be considered a bug since the branch cannot be
8684               // re-executed if an exception occurs.
8685               alloc_cc(&current,i);
8686               dirty_reg(&current,CCREG);
8687               alloc_reg(&current,i,CSREG);
8688               alloc_reg(&current,i,FSREG);
8689             }
8690             else {
8691               delayslot_alloc(&current,i+1);
8692               alloc_reg(&current,i+1,CSREG);
8693             }
8694           }
8695           else
8696           // Don't alloc the delay slot yet because we might not execute it
8697           if(likely[i]) // BC1FL/BC1TL
8698           {
8699             alloc_cc(&current,i);
8700             dirty_reg(&current,CCREG);
8701             alloc_reg(&current,i,CSREG);
8702             alloc_reg(&current,i,FSREG);
8703           }
8704           ds=1;
8705           current.isconst=0;
8706           break;
8707         case IMM16:
8708           imm16_alloc(&current,i);
8709           break;
8710         case LOAD:
8711         case LOADLR:
8712           load_alloc(&current,i);
8713           break;
8714         case STORE:
8715         case STORELR:
8716           store_alloc(&current,i);
8717           break;
8718         case ALU:
8719           alu_alloc(&current,i);
8720           break;
8721         case SHIFT:
8722           shift_alloc(&current,i);
8723           break;
8724         case MULTDIV:
8725           multdiv_alloc(&current,i);
8726           break;
8727         case SHIFTIMM:
8728           shiftimm_alloc(&current,i);
8729           break;
8730         case MOV:
8731           mov_alloc(&current,i);
8732           break;
8733         case COP0:
8734           cop0_alloc(&current,i);
8735           break;
8736         case COP1:
8737           cop1_alloc(&current,i);
8738           break;
8739         case C1LS:
8740           c1ls_alloc(&current,i);
8741           break;
8742         case FCONV:
8743           fconv_alloc(&current,i);
8744           break;
8745         case FLOAT:
8746           float_alloc(&current,i);
8747           break;
8748         case FCOMP:
8749           fcomp_alloc(&current,i);
8750           break;
8751         case SYSCALL:
8752         case HLECALL:
8753           syscall_alloc(&current,i);
8754           break;
8755         case SPAN:
8756           pagespan_alloc(&current,i);
8757           break;
8758       }
8759       
8760       // Drop the upper half of registers that have become 32-bit
8761       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8762       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8763         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8764         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8765         current.uu|=1;
8766       } else {
8767         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8768         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8769         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8770         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8771         current.uu|=1;
8772       }
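      // e.g. if rt1[i]==2 and bit 2 of is32 is set (the result fits in 32
      // bits), the OR above marks $2's upper half as unneeded in uu, so the
      // allocator is free to drop the 64-bit half register.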
8773
8774       // Create entry (branch target) regmap
8775       for(hr=0;hr<HOST_REGS;hr++)
8776       {
8777         int r,or,er;
8778         r=current.regmap[hr];
8779         if(r>=0) {
8780           if(r!=regmap_pre[i][hr]) {
8781             // TODO: delay slot (?)
8782             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8783             if(or<0||(r&63)>=TEMPREG){
8784               regs[i].regmap_entry[hr]=-1;
8785             }
8786             else
8787             {
8788               // Just move it to a different register
8789               regs[i].regmap_entry[hr]=r;
8790               // If it was dirty before, it's still dirty
8791               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8792             }
8793           }
8794           else
8795           {
8796             // Unneeded
8797             if(r==0){
8798               regs[i].regmap_entry[hr]=0;
8799             }
8800             else
8801             if(r<64){
8802               if((current.u>>r)&1) {
8803                 regs[i].regmap_entry[hr]=-1;
8804                 //regs[i].regmap[hr]=-1;
8805                 current.regmap[hr]=-1;
8806               }else
8807                 regs[i].regmap_entry[hr]=r;
8808             }
8809             else {
8810               if((current.uu>>(r&63))&1) {
8811                 regs[i].regmap_entry[hr]=-1;
8812                 //regs[i].regmap[hr]=-1;
8813                 current.regmap[hr]=-1;
8814               }else
8815                 regs[i].regmap_entry[hr]=r;
8816             }
8817           }
8818         } else {
8819           // Branches expect CCREG to be allocated at the target
8820           if(regmap_pre[i][hr]==CCREG) 
8821             regs[i].regmap_entry[hr]=CCREG;
8822           else
8823             regs[i].regmap_entry[hr]=-1;
8824         }
8825       }
8826       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8827     }
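    // regs[i].regmap_entry describes what a jump to this instruction must
    // already have loaded into each host register (the branch-target view),
    // while regs[i].regmap is the mapping in effect once this instruction's
    // own allocation has been done.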
8828     /* Branch post-alloc */
8829     if(i>0)
8830     {
8831       current.was32=current.is32;
8832       current.wasdirty=current.dirty;
8833       switch(itype[i-1]) {
8834         case UJUMP:
8835           memcpy(&branch_regs[i-1],&current,sizeof(current));
8836           branch_regs[i-1].isconst=0;
8837           branch_regs[i-1].wasconst=0;
8838           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8839           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8840           alloc_cc(&branch_regs[i-1],i-1);
8841           dirty_reg(&branch_regs[i-1],CCREG);
8842           if(rt1[i-1]==31) { // JAL
8843             alloc_reg(&branch_regs[i-1],i-1,31);
8844             dirty_reg(&branch_regs[i-1],31);
8845             branch_regs[i-1].is32|=1LL<<31;
8846           }
8847           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8848           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8849           break;
8850         case RJUMP:
8851           memcpy(&branch_regs[i-1],&current,sizeof(current));
8852           branch_regs[i-1].isconst=0;
8853           branch_regs[i-1].wasconst=0;
8854           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8855           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8856           alloc_cc(&branch_regs[i-1],i-1);
8857           dirty_reg(&branch_regs[i-1],CCREG);
8858           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8859           if(rt1[i-1]==31) { // JALR
8860             alloc_reg(&branch_regs[i-1],i-1,31);
8861             dirty_reg(&branch_regs[i-1],31);
8862             branch_regs[i-1].is32|=1LL<<31;
8863           }
8864           #ifdef USE_MINI_HT
8865           if(rs1[i-1]==31) { // JALR
8866             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8867             #ifndef HOST_IMM_ADDR32
8868             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8869             #endif
8870           }
8871           #endif
8872           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8873           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8874           break;
8875         case CJUMP:
8876           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8877           {
8878             alloc_cc(&current,i-1);
8879             dirty_reg(&current,CCREG);
8880             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8881                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8882               // The delay slot overwrote one of our conditions
8883               // Delay slot goes after the test (in order)
8884               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8885               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8886               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8887               current.u|=1;
8888               current.uu|=1;
8889               delayslot_alloc(&current,i);
8890               current.isconst=0;
8891             }
8892             else
8893             {
8894               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8895               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8896               // Alloc the branch condition registers
8897               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8898               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8899               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8900               {
8901                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8902                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8903               }
8904             }
8905             memcpy(&branch_regs[i-1],&current,sizeof(current));
8906             branch_regs[i-1].isconst=0;
8907             branch_regs[i-1].wasconst=0;
8908             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8909             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8910           }
8911           else
8912           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8913           {
8914             alloc_cc(&current,i-1);
8915             dirty_reg(&current,CCREG);
8916             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8917               // The delay slot overwrote the branch condition
8918               // Delay slot goes after the test (in order)
8919               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8920               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8921               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8922               current.u|=1;
8923               current.uu|=1;
8924               delayslot_alloc(&current,i);
8925               current.isconst=0;
8926             }
8927             else
8928             {
8929               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8930               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8931               // Alloc the branch condition register
8932               alloc_reg(&current,i-1,rs1[i-1]);
8933               if(!(current.is32>>rs1[i-1]&1))
8934               {
8935                 alloc_reg64(&current,i-1,rs1[i-1]);
8936               }
8937             }
8938             memcpy(&branch_regs[i-1],&current,sizeof(current));
8939             branch_regs[i-1].isconst=0;
8940             branch_regs[i-1].wasconst=0;
8941             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8942             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8943           }
8944           else
8945           // Alloc the delay slot in case the branch is taken
8946           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8947           {
8948             memcpy(&branch_regs[i-1],&current,sizeof(current));
8949             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8950             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8951             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8952             alloc_cc(&branch_regs[i-1],i);
8953             dirty_reg(&branch_regs[i-1],CCREG);
8954             delayslot_alloc(&branch_regs[i-1],i);
8955             branch_regs[i-1].isconst=0;
8956             alloc_reg(&current,i,CCREG); // Not taken path
8957             dirty_reg(&current,CCREG);
8958             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8959           }
8960           else
8961           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8962           {
8963             memcpy(&branch_regs[i-1],&current,sizeof(current));
8964             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8965             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8966             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8967             alloc_cc(&branch_regs[i-1],i);
8968             dirty_reg(&branch_regs[i-1],CCREG);
8969             delayslot_alloc(&branch_regs[i-1],i);
8970             branch_regs[i-1].isconst=0;
8971             alloc_reg(&current,i,CCREG); // Not taken path
8972             dirty_reg(&current,CCREG);
8973             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8974           }
8975           break;
8976         case SJUMP:
8977           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8978           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8979           {
8980             alloc_cc(&current,i-1);
8981             dirty_reg(&current,CCREG);
8982             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8983               // The delay slot overwrote the branch condition
8984               // Delay slot goes after the test (in order)
8985               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8986               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8987               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8988               current.u|=1;
8989               current.uu|=1;
8990               delayslot_alloc(&current,i);
8991               current.isconst=0;
8992             }
8993             else
8994             {
8995               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8996               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8997               // Alloc the branch condition register
8998               alloc_reg(&current,i-1,rs1[i-1]);
8999               if(!(current.is32>>rs1[i-1]&1))
9000               {
9001                 alloc_reg64(&current,i-1,rs1[i-1]);
9002               }
9003             }
9004             memcpy(&branch_regs[i-1],&current,sizeof(current));
9005             branch_regs[i-1].isconst=0;
9006             branch_regs[i-1].wasconst=0;
9007             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9008             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9009           }
9010           else
9011           // Alloc the delay slot in case the branch is taken
9012           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9013           {
9014             memcpy(&branch_regs[i-1],&current,sizeof(current));
9015             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9016             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9017             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9018             alloc_cc(&branch_regs[i-1],i);
9019             dirty_reg(&branch_regs[i-1],CCREG);
9020             delayslot_alloc(&branch_regs[i-1],i);
9021             branch_regs[i-1].isconst=0;
9022             alloc_reg(&current,i,CCREG); // Not taken path
9023             dirty_reg(&current,CCREG);
9024             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9025           }
9026           // FIXME: BLTZAL/BGEZAL
9027           if(opcode2[i-1]&0x10) { // BxxZAL
9028             alloc_reg(&branch_regs[i-1],i-1,31);
9029             dirty_reg(&branch_regs[i-1],31);
9030             branch_regs[i-1].is32|=1LL<<31;
9031           }
9032           break;
9033         case FJUMP:
9034           if(likely[i-1]==0) // BC1F/BC1T
9035           {
9036             alloc_cc(&current,i-1);
9037             dirty_reg(&current,CCREG);
9038             if(itype[i]==FCOMP) {
9039               // The delay slot overwrote the branch condition
9040               // Delay slot goes after the test (in order)
9041               delayslot_alloc(&current,i);
9042               current.isconst=0;
9043             }
9044             else
9045             {
9046               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9047               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9048               // Alloc the branch condition register
9049               alloc_reg(&current,i-1,FSREG);
9050             }
9051             memcpy(&branch_regs[i-1],&current,sizeof(current));
9052             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9053           }
9054           else // BC1FL/BC1TL
9055           {
9056             // Alloc the delay slot in case the branch is taken
9057             memcpy(&branch_regs[i-1],&current,sizeof(current));
9058             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9059             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9060             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9061             alloc_cc(&branch_regs[i-1],i);
9062             dirty_reg(&branch_regs[i-1],CCREG);
9063             delayslot_alloc(&branch_regs[i-1],i);
9064             branch_regs[i-1].isconst=0;
9065             alloc_reg(&current,i,CCREG); // Not taken path
9066             dirty_reg(&current,CCREG);
9067             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9068           }
9069           break;
9070       }
9071
9072       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9073       {
9074         if(rt1[i-1]==31) // JAL/JALR
9075         {
9076           // Subroutine call will return here, don't alloc any registers
9077           current.is32=1;
9078           current.dirty=0;
9079           clear_all_regs(current.regmap);
9080           alloc_reg(&current,i,CCREG);
9081           dirty_reg(&current,CCREG);
9082         }
9083         else if(i+1<slen)
9084         {
9085           // Internal branch will jump here, match registers to caller
9086           current.is32=0x3FFFFFFFFLL;
9087           current.dirty=0;
9088           clear_all_regs(current.regmap);
9089           alloc_reg(&current,i,CCREG);
9090           dirty_reg(&current,CCREG);
9091           for(j=i-1;j>=0;j--)
9092           {
9093             if(ba[j]==start+i*4+4) {
9094               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9095               current.is32=branch_regs[j].is32;
9096               current.dirty=branch_regs[j].dirty;
9097               break;
9098             }
9099           }
9100           while(j>=0) {
9101             if(ba[j]==start+i*4+4) {
9102               for(hr=0;hr<HOST_REGS;hr++) {
9103                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9104                   current.regmap[hr]=-1;
9105                 }
9106                 current.is32&=branch_regs[j].is32;
9107                 current.dirty&=branch_regs[j].dirty;
9108               }
9109             }
9110             j--;
9111           }
9112         }
9113       }
9114     }
9115
9116     // Count cycles in between branches
9117     ccadj[i]=cc;
9118     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9119     {
9120       cc=0;
9121     }
9122     else
9123     {
9124       cc++;
9125     }
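    // ccadj[i] records how many cycles have accumulated since the previous
    // branch or syscall; resetting cc after each control transfer means the
    // cycle counter (CCREG) presumably only has to be adjusted at branches
    // rather than after every single instruction.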
9126
9127     flush_dirty_uppers(&current);
9128     if(!is_ds[i]) {
9129       regs[i].is32=current.is32;
9130       regs[i].dirty=current.dirty;
9131       regs[i].isconst=current.isconst;
9132       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9133     }
9134     for(hr=0;hr<HOST_REGS;hr++) {
9135       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9136         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9137           regs[i].wasconst&=~(1<<hr);
9138         }
9139       }
9140     }
9141     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9142   }
9143   
9144   /* Pass 4 - Cull unused host registers */
9145   
9146   uint64_t nr=0;
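  // nr is a bitmask over host registers: bit hr set means the value currently
  // mapped to host register hr is still needed.  The scan runs backwards so
  // that needs propagate from later instructions to earlier ones before the
  // unneeded mappings are culled below.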
9147   
9148   for (i=slen-1;i>=0;i--)
9149   {
9150     int hr;
9151     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9152     {
9153       if(ba[i]<start || ba[i]>=(start+slen*4))
9154       {
9155         // Branch out of this block, don't need anything
9156         nr=0;
9157       }
9158       else
9159       {
9160         // Internal branch
9161         // Need whatever matches the target
9162         nr=0;
9163         int t=(ba[i]-start)>>2;
9164         for(hr=0;hr<HOST_REGS;hr++)
9165         {
9166           if(regs[i].regmap_entry[hr]>=0) {
9167             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9168           }
9169         }
9170       }
9171       // Conditional branch may need registers for following instructions
9172       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9173       {
9174         if(i<slen-2) {
9175           nr|=needed_reg[i+2];
9176           for(hr=0;hr<HOST_REGS;hr++)
9177           {
9178             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9179             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9180           }
9181         }
9182       }
9183       // Don't need stuff which is overwritten
9184       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9185       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9186       // Merge in delay slot
9187       for(hr=0;hr<HOST_REGS;hr++)
9188       {
9189         if(!likely[i]) {
9190           // These are overwritten unless the branch is "likely"
9191           // and the delay slot is nullified if not taken
9192           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9193           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9194         }
9195         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9196         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9197         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9198         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9199         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9200         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9201         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9202         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9203         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9204           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9205           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9206         }
9207         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9208           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9209           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9210         }
9211         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9212           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9213           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9214         }
9215       }
9216     }
9217     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
9218     {
9219       // SYSCALL instruction (software interrupt)
9220       nr=0;
9221     }
9222     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9223     {
9224       // ERET instruction (return from interrupt)
9225       nr=0;
9226     }
9227     else // Non-branch
9228     {
9229       if(i<slen-1) {
9230         for(hr=0;hr<HOST_REGS;hr++) {
9231           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9232           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9233           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9234           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9235         }
9236       }
9237     }
9238     for(hr=0;hr<HOST_REGS;hr++)
9239     {
9240       // Overwritten registers are not needed
9241       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9242       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9243       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9244       // Source registers are needed
9245       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9246       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9247       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9248       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9249       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9250       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9251       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9252       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9253       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9254         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9255         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9256       }
9257       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9258         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9259         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9260       }
9261       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9262         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9263         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9264       }
9265       // Don't store a register immediately after writing it,
9266       // as this may prevent dual-issue.
9267       // But do so if this is a branch target; otherwise we
9268       // might have to load the register before the branch.
9269       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9270         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9271            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9272           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9273           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9274         }
9275         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9276            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9277           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9278           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9279         }
9280       }
9281     }
9282     // Cycle count is needed at branches.  Assume it is needed at the target too.
9283     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9284       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9285       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9286     }
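    // (Note: nr is a bitmask over host registers; a set bit means the value
    // currently cached in that host register is still needed past this point.
    // CCREG is the guest cycle counter, so it is kept live at branches and,
    // conservatively, at branch targets, so the cycle/event test can use it.)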
9287     // Save it
9288     needed_reg[i]=nr;
9289     
9290     // Deallocate unneeded registers
9291     for(hr=0;hr<HOST_REGS;hr++)
9292     {
9293       if(!((nr>>hr)&1)) {
9294         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9295         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9296            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9297            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9298         {
9299           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9300           {
9301             if(likely[i]) {
9302               regs[i].regmap[hr]=-1;
9303               regs[i].isconst&=~(1<<hr);
9304               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9305             }
9306           }
9307         }
9308         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9309         {
9310           int d1=0,d2=0,map=0,temp=0;
9311           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9312           {
9313             d1=dep1[i+1];
9314             d2=dep2[i+1];
9315           }
9316           if(using_tlb) {
9317             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9318                itype[i+1]==STORE || itype[i+1]==STORELR ||
9319                itype[i+1]==C1LS )
9320             map=TLREG;
9321           } else
9322           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9323             map=INVCP;
9324           }
9325           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9326              itype[i+1]==C1LS )
9327             temp=FTEMP;
9328           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9329              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9330              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9331              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9332              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9333              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9334              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9335              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9336              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9337              regs[i].regmap[hr]!=map )
9338           {
9339             regs[i].regmap[hr]=-1;
9340             regs[i].isconst&=~(1<<hr);
9341             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9342                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9343                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9344                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9345                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9346                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9347                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9348                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9349                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9350                branch_regs[i].regmap[hr]!=map)
9351             {
9352               branch_regs[i].regmap[hr]=-1;
9353               branch_regs[i].regmap_entry[hr]=-1;
9354               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9355               {
9356                 if(!likely[i]&&i<slen-2) {
9357                   regmap_pre[i+2][hr]=-1;
9358                 }
9359               }
9360             }
9361           }
9362         }
9363         else
9364         {
9365           // Non-branch
9366           if(i>0)
9367           {
9368             int d1=0,d2=0,map=-1,temp=-1;
9369             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9370             {
9371               d1=dep1[i];
9372               d2=dep2[i];
9373             }
9374             if(using_tlb) {
9375               if(itype[i]==LOAD || itype[i]==LOADLR ||
9376                  itype[i]==STORE || itype[i]==STORELR ||
9377                  itype[i]==C1LS )
9378               map=TLREG;
9379             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9380               map=INVCP;
9381             }
9382             if(itype[i]==LOADLR || itype[i]==STORELR ||
9383                itype[i]==C1LS )
9384               temp=FTEMP;
9385             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9386                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9387                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9388                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9389                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9390                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9391             {
9392               if(i<slen-1&&!is_ds[i]) {
9393                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9394                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9395                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9396                 {
9397                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9398                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9399                 }
9400                 regmap_pre[i+1][hr]=-1;
9401                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9402               }
9403               regs[i].regmap[hr]=-1;
9404               regs[i].isconst&=~(1<<hr);
9405             }
9406           }
9407         }
9408       }
9409     }
9410   }
9411   
9412   /* Pass 5 - Pre-allocate registers */
9413   
9414   // If a register is allocated during a loop, try to allocate it for the
9415   // entire loop, if possible.  This avoids loading/storing registers
9416   // inside of the loop.
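  // (Roughly: for each backward branch -- i.e. a loop -- whose delay slot is
  //  a simple, non-branch instruction, take the host-register assignment that
  //  is live at the branch (f_regmap).  If the loop head does not already
  //  claim that host register, walk the loop body from the target to the
  //  branch checking the mapping stays free and consistent; if so, write it
  //  into regmap_entry/regmap/regmap_pre for the whole range, so the value
  //  stays in a register across iterations instead of being reloaded at the
  //  loop head.)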
9417
9418   signed char f_regmap[HOST_REGS];
9419   clear_all_regs(f_regmap);
9420   for(i=0;i<slen-1;i++)
9421   {
9422     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9423     {
9424       if(ba[i]>=start && ba[i]<(start+i*4)) 
9425       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9426       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9427       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9428       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9429       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9430       {
9431         int t=(ba[i]-start)>>2;
9432         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9433         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9434         for(hr=0;hr<HOST_REGS;hr++)
9435         {
9436           if(regs[i].regmap[hr]>64) {
9437             if(!((regs[i].dirty>>hr)&1))
9438               f_regmap[hr]=regs[i].regmap[hr];
9439             else f_regmap[hr]=-1;
9440           }
9441           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9442           if(branch_regs[i].regmap[hr]>64) {
9443             if(!((branch_regs[i].dirty>>hr)&1))
9444               f_regmap[hr]=branch_regs[i].regmap[hr];
9445             else f_regmap[hr]=-1;
9446           }
9447           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9448           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9449           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9450           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9451           {
9452             // Test both in case the delay slot is out-of-order (ooo);
9453             // this could be done better...
9454             if(count_free_regs(branch_regs[i].regmap)<2
9455              ||count_free_regs(regs[i].regmap)<2) 
9456               f_regmap[hr]=branch_regs[i].regmap[hr];
9457           }
9458           // Avoid dirty->clean transition
9459           // #ifdef DESTRUCTIVE_WRITEBACK here?
9460           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9461           if(f_regmap[hr]>0) {
9462             if(regs[t].regmap_entry[hr]<0) {
9463               int r=f_regmap[hr];
9464               for(j=t;j<=i;j++)
9465               {
9466                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9467                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9468                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9469                 if(r>63) {
9470                   // NB This can exclude the case where the upper-half
9471                   // register is lower numbered than the lower-half
9472                   // register.  Not sure if it's worth fixing...
9473                   if(get_reg(regs[j].regmap,r&63)<0) break;
9474                   if(regs[j].is32&(1LL<<(r&63))) break;
9475                 }
9476                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9477                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9478                   int k;
9479                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9480                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9481                     if(r>63) {
9482                       if(get_reg(regs[i].regmap,r&63)<0) break;
9483                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9484                     }
9485                     k=i;
9486                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9487                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9488                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9489                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9490                       ||itype[k-1]==FCOMP) {
9491                         if(count_free_regs(regs[k-1].regmap)<2) {
9492                           //printf("no free regs for store %x\n",start+(k-1)*4);
9493                           break;
9494                         }
9495                       }
9496                       else
9497                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9498                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9499                         //printf("no-match due to different register\n");
9500                         break;
9501                       }
9502                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9503                         //printf("no-match due to branch\n");
9504                         break;
9505                       }
9506                       // call/ret fast path assumes no registers allocated
9507                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9508                         break;
9509                       }
9510                       if(r>63) {
9511                         // NB This can exclude the case where the upper-half
9512                         // register is lower numbered than the lower-half
9513                         // register.  Not sure if it's worth fixing...
9514                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9515                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9516                       }
9517                       k--;
9518                     }
9519                     if(i<slen-1) {
9520                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9521                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9522                         //printf("bad match after branch\n");
9523                         break;
9524                       }
9525                     }
9526                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9527                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9528                       while(k<i) {
9529                         regs[k].regmap_entry[hr]=f_regmap[hr];
9530                         regs[k].regmap[hr]=f_regmap[hr];
9531                         regmap_pre[k+1][hr]=f_regmap[hr];
9532                         regs[k].wasdirty&=~(1<<hr);
9533                         regs[k].dirty&=~(1<<hr);
9534                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9535                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9536                         regs[k].wasconst&=~(1<<hr);
9537                         regs[k].isconst&=~(1<<hr);
9538                         k++;
9539                       }
9540                     }
9541                     else {
9542                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9543                       break;
9544                     }
9545                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9546                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9547                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9548                       regs[i].regmap_entry[hr]=f_regmap[hr];
9549                       regs[i].regmap[hr]=f_regmap[hr];
9550                       regs[i].wasdirty&=~(1<<hr);
9551                       regs[i].dirty&=~(1<<hr);
9552                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9553                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9554                       regs[i].wasconst&=~(1<<hr);
9555                       regs[i].isconst&=~(1<<hr);
9556                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9557                       branch_regs[i].wasdirty&=~(1<<hr);
9558                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9559                       branch_regs[i].regmap[hr]=f_regmap[hr];
9560                       branch_regs[i].dirty&=~(1<<hr);
9561                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9562                       branch_regs[i].wasconst&=~(1<<hr);
9563                       branch_regs[i].isconst&=~(1<<hr);
9564                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9565                         regmap_pre[i+2][hr]=f_regmap[hr];
9566                         regs[i+2].wasdirty&=~(1<<hr);
9567                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9568                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9569                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9570                       }
9571                     }
9572                   }
9573                   for(k=t;k<j;k++) {
9574                     regs[k].regmap_entry[hr]=f_regmap[hr];
9575                     regs[k].regmap[hr]=f_regmap[hr];
9576                     regmap_pre[k+1][hr]=f_regmap[hr];
9577                     regs[k+1].wasdirty&=~(1<<hr);
9578                     regs[k].dirty&=~(1<<hr);
9579                     regs[k].wasconst&=~(1<<hr);
9580                     regs[k].isconst&=~(1<<hr);
9581                   }
9582                   if(regs[j].regmap[hr]==f_regmap[hr])
9583                     regs[j].regmap_entry[hr]=f_regmap[hr];
9584                   break;
9585                 }
9586                 if(j==i) break;
9587                 if(regs[j].regmap[hr]>=0)
9588                   break;
9589                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9590                   //printf("no-match due to different register\n");
9591                   break;
9592                 }
9593                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9594                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9595                   break;
9596                 }
9597                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9598                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9599                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9600                   if(count_free_regs(regs[j].regmap)<2) {
9601                     //printf("No free regs for store %x\n",start+j*4);
9602                     break;
9603                   }
9604                 }
9605                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9606                 if(f_regmap[hr]>=64) {
9607                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9608                     break;
9609                   }
9610                   else
9611                   {
9612                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9613                       break;
9614                     }
9615                   }
9616                 }
9617               }
9618             }
9619           }
9620         }
9621       }
9622     }else{
9623       int count=0;
9624       for(hr=0;hr<HOST_REGS;hr++)
9625       {
9626         if(hr!=EXCLUDE_REG) {
9627           if(regs[i].regmap[hr]>64) {
9628             if(!((regs[i].dirty>>hr)&1))
9629               f_regmap[hr]=regs[i].regmap[hr];
9630           }
9631           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9632           else if(regs[i].regmap[hr]<0) count++;
9633         }
9634       }
9635       // Try to restore cycle count at branch targets
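      // (Scan forward from the target for the next point where CCREG is
      //  already in HOST_CCREG; if found, fill the gap so the cycle counter
      //  stays mapped from the target onwards.  Then try to extend the
      //  mapping backwards across the simple instructions preceding the
      //  target, so CCREG does not have to be spilled and reloaded here.)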
9636       if(bt[i]) {
9637         for(j=i;j<slen-1;j++) {
9638           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9639           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9640           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9641           ||itype[j]==FCOMP||itype[j]==FCONV) {
9642             if(count_free_regs(regs[j].regmap)<2) {
9643               //printf("no free regs for store %x\n",start+j*4);
9644               break;
9645             }
9646           }
9647           else
9648           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9649         }
9650         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9651           int k=i;
9652           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9653           while(k<j) {
9654             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9655             regs[k].regmap[HOST_CCREG]=CCREG;
9656             regmap_pre[k+1][HOST_CCREG]=CCREG;
9657             regs[k+1].wasdirty|=1<<HOST_CCREG;
9658             regs[k].dirty|=1<<HOST_CCREG;
9659             regs[k].wasconst&=~(1<<HOST_CCREG);
9660             regs[k].isconst&=~(1<<HOST_CCREG);
9661             k++;
9662           }
9663           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9664         }
9665         // Work backwards from the branch target
9666         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9667         {
9668           //printf("Extend backwards\n");
9669           int k;
9670           k=i;
9671           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9672             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9673             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9674             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9675               if(count_free_regs(regs[k-1].regmap)<2) {
9676                 //printf("no free regs for store %x\n",start+(k-1)*4);
9677                 break;
9678               }
9679             }
9680             else
9681             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9682             k--;
9683           }
9684           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9685             //printf("Extend CC, %x ->\n",start+k*4);
9686             while(k<=i) {
9687               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9688               regs[k].regmap[HOST_CCREG]=CCREG;
9689               regmap_pre[k+1][HOST_CCREG]=CCREG;
9690               regs[k+1].wasdirty|=1<<HOST_CCREG;
9691               regs[k].dirty|=1<<HOST_CCREG;
9692               regs[k].wasconst&=~(1<<HOST_CCREG);
9693               regs[k].isconst&=~(1<<HOST_CCREG);
9694               k++;
9695             }
9696           }
9697           else {
9698             //printf("Fail Extend CC, %x ->\n",start+k*4);
9699           }
9700         }
9701       }
9702       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9703          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9704          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9705          itype[i]!=FCONV&&itype[i]!=FCOMP)
9706       {
9707         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9708       }
9709     }
9710   }
9711   
9712   // This allocates registers (if possible) one instruction prior
9713   // to use, which can avoid a load-use penalty on certain CPUs.
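  // (The idea: if the next instruction's source register already has a host
  //  register assigned in regs[i+1], and that host register is unused at
  //  instruction i, copy the assignment back to i so the guest register is
  //  loaded one instruction early.  Address-generation and FTEMP/TLREG
  //  temporaries for loads and stores get the same treatment below.)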
9714   for(i=0;i<slen-1;i++)
9715   {
9716     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9717     {
9718       if(!bt[i+1])
9719       {
9720         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9721         {
9722           if(rs1[i+1]) {
9723             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9724             {
9725               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9726               {
9727                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9728                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9729                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9730                 regs[i].isconst&=~(1<<hr);
9731                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9732                 constmap[i][hr]=constmap[i+1][hr];
9733                 regs[i+1].wasdirty&=~(1<<hr);
9734                 regs[i].dirty&=~(1<<hr);
9735               }
9736             }
9737           }
9738           if(rs2[i+1]) {
9739             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9740             {
9741               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9742               {
9743                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9744                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9745                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9746                 regs[i].isconst&=~(1<<hr);
9747                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9748                 constmap[i][hr]=constmap[i+1][hr];
9749                 regs[i+1].wasdirty&=~(1<<hr);
9750                 regs[i].dirty&=~(1<<hr);
9751               }
9752             }
9753           }
9754           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9755             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9756             {
9757               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9758               {
9759                 regs[i].regmap[hr]=rs1[i+1];
9760                 regmap_pre[i+1][hr]=rs1[i+1];
9761                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9762                 regs[i].isconst&=~(1<<hr);
9763                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9764                 constmap[i][hr]=constmap[i+1][hr];
9765                 regs[i+1].wasdirty&=~(1<<hr);
9766                 regs[i].dirty&=~(1<<hr);
9767               }
9768             }
9769           }
9770           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9771             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9772             {
9773               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9774               {
9775                 regs[i].regmap[hr]=rs1[i+1];
9776                 regmap_pre[i+1][hr]=rs1[i+1];
9777                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9778                 regs[i].isconst&=~(1<<hr);
9779                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9780                 constmap[i][hr]=constmap[i+1][hr];
9781                 regs[i+1].wasdirty&=~(1<<hr);
9782                 regs[i].dirty&=~(1<<hr);
9783               }
9784             }
9785           }
9786           #ifndef HOST_IMM_ADDR32
9787           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9788             hr=get_reg(regs[i+1].regmap,TLREG);
9789             if(hr>=0) {
9790               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9791               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9792                 int nr;
9793                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9794                 {
9795                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9796                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9797                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9798                   regs[i].isconst&=~(1<<hr);
9799                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9800                   constmap[i][hr]=constmap[i+1][hr];
9801                   regs[i+1].wasdirty&=~(1<<hr);
9802                   regs[i].dirty&=~(1<<hr);
9803                 }
9804                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9805                 {
9806                   // move it to another register
9807                   regs[i+1].regmap[hr]=-1;
9808                   regmap_pre[i+2][hr]=-1;
9809                   regs[i+1].regmap[nr]=TLREG;
9810                   regmap_pre[i+2][nr]=TLREG;
9811                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9812                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9813                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9814                   regs[i].isconst&=~(1<<nr);
9815                   regs[i+1].isconst&=~(1<<nr);
9816                   regs[i].dirty&=~(1<<nr);
9817                   regs[i+1].wasdirty&=~(1<<nr);
9818                   regs[i+1].dirty&=~(1<<nr);
9819                   regs[i+2].wasdirty&=~(1<<nr);
9820                 }
9821               }
9822             }
9823           }
9824           #endif
9825           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9826             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9827               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9828               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9829               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9830               assert(hr>=0);
9831               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9832               {
9833                 regs[i].regmap[hr]=rs1[i+1];
9834                 regmap_pre[i+1][hr]=rs1[i+1];
9835                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9836                 regs[i].isconst&=~(1<<hr);
9837                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9838                 constmap[i][hr]=constmap[i+1][hr];
9839                 regs[i+1].wasdirty&=~(1<<hr);
9840                 regs[i].dirty&=~(1<<hr);
9841               }
9842             }
9843           }
9844           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9845             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9846               int nr;
9847               hr=get_reg(regs[i+1].regmap,FTEMP);
9848               assert(hr>=0);
9849               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9850               {
9851                 regs[i].regmap[hr]=rs1[i+1];
9852                 regmap_pre[i+1][hr]=rs1[i+1];
9853                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9854                 regs[i].isconst&=~(1<<hr);
9855                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9856                 constmap[i][hr]=constmap[i+1][hr];
9857                 regs[i+1].wasdirty&=~(1<<hr);
9858                 regs[i].dirty&=~(1<<hr);
9859               }
9860               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9861               {
9862                 // move it to another register
9863                 regs[i+1].regmap[hr]=-1;
9864                 regmap_pre[i+2][hr]=-1;
9865                 regs[i+1].regmap[nr]=FTEMP;
9866                 regmap_pre[i+2][nr]=FTEMP;
9867                 regs[i].regmap[nr]=rs1[i+1];
9868                 regmap_pre[i+1][nr]=rs1[i+1];
9869                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9870                 regs[i].isconst&=~(1<<nr);
9871                 regs[i+1].isconst&=~(1<<nr);
9872                 regs[i].dirty&=~(1<<nr);
9873                 regs[i+1].wasdirty&=~(1<<nr);
9874                 regs[i+1].dirty&=~(1<<nr);
9875                 regs[i+2].wasdirty&=~(1<<nr);
9876               }
9877             }
9878           }
9879           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9880             if(itype[i+1]==LOAD) 
9881               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9882             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9883               hr=get_reg(regs[i+1].regmap,FTEMP);
9884             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9885               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9886               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9887             }
9888             if(hr>=0&&regs[i].regmap[hr]<0) {
9889               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9890               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9891                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9892                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9893                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9894                 regs[i].isconst&=~(1<<hr);
9895                 regs[i+1].wasdirty&=~(1<<hr);
9896                 regs[i].dirty&=~(1<<hr);
9897               }
9898             }
9899           }
9900         }
9901       }
9902     }
9903   }
9904   
9905   /* Pass 6 - Optimize clean/dirty state */
9906   clean_registers(0,slen-1,1);
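  // (clean_registers() walks the block and works out where dirty guest
  //  registers actually need to be written back, so redundant write-backs
  //  can be dropped when a register is overwritten again before any exit.)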
9907   
9908   /* Pass 7 - Identify 32-bit registers */
9909   
9910   provisional_r32();
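  // (requires_32bit[] is computed backwards: r32 accumulates the guest
  //  registers whose values must be valid 32-bit, sign-extended quantities
  //  at each point, based on how later instructions consume them.  Block
  //  entry points with a non-zero mask are registered as "restricted"
  //  entries below, so they are only entered when those registers are
  //  known to be 32-bit.)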
9911
9912   u_int r32=0;
9913   
9914   for (i=slen-1;i>=0;i--)
9915   {
9916     int hr;
9917     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9918     {
9919       if(ba[i]<start || ba[i]>=(start+slen*4))
9920       {
9921         // Branch out of this block, don't need anything
9922         r32=0;
9923       }
9924       else
9925       {
9926         // Internal branch
9927         // Need whatever matches the target
9928         // (and doesn't get overwritten by the delay slot instruction)
9929         r32=0;
9930         int t=(ba[i]-start)>>2;
9931         if(ba[i]>start+i*4) {
9932           // Forward branch
9933           if(!(requires_32bit[t]&~regs[i].was32))
9934             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9935         }else{
9936           // Backward branch
9937           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9938           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9939           if(!(pr32[t]&~regs[i].was32))
9940             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9941         }
9942       }
9943       // Conditional branch may need registers for following instructions
9944       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9945       {
9946         if(i<slen-2) {
9947           r32|=requires_32bit[i+2];
9948           r32&=regs[i].was32;
9949           // Mark this address as a branch target since it may be called
9950           // upon return from interrupt
9951           bt[i+2]=1;
9952         }
9953       }
9954       // Merge in delay slot
9955       if(!likely[i]) {
9956         // These are overwritten unless the branch is "likely"
9957         // and the delay slot is nullified if not taken
9958         r32&=~(1LL<<rt1[i+1]);
9959         r32&=~(1LL<<rt2[i+1]);
9960       }
9961       // Assume these are needed (delay slot)
9962       if(us1[i+1]>0)
9963       {
9964         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9965       }
9966       if(us2[i+1]>0)
9967       {
9968         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9969       }
9970       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9971       {
9972         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9973       }
9974       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9975       {
9976         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9977       }
9978     }
9979     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
9980     {
9981       // SYSCALL instruction (software interrupt)
9982       r32=0;
9983     }
9984     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9985     {
9986       // ERET instruction (return from interrupt)
9987       r32=0;
9988     }
9989     // Check 32 bits
9990     r32&=~(1LL<<rt1[i]);
9991     r32&=~(1LL<<rt2[i]);
9992     if(us1[i]>0)
9993     {
9994       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9995     }
9996     if(us2[i]>0)
9997     {
9998       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9999     }
10000     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10001     {
10002       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10003     }
10004     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10005     {
10006       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10007     }
10008     requires_32bit[i]=r32;
10009     
10010     // Dirty registers which are 32-bit require 32-bit input,
10011     // as they will be written back as 32-bit values
10012     for(hr=0;hr<HOST_REGS;hr++)
10013     {
10014       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10015         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10016           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10017           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10018         }
10019       }
10020     }
10021     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10022   }
10023
10024   if(itype[slen-1]==SPAN) {
10025     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
10026   }
10027   
10028   /* Debug/disassembly */
10029   if((void*)assem_debug==(void*)printf) 
10030   for(i=0;i<slen;i++)
10031   {
10032     printf("U:");
10033     int r;
10034     for(r=1;r<=CCREG;r++) {
10035       if((unneeded_reg[i]>>r)&1) {
10036         if(r==HIREG) printf(" HI");
10037         else if(r==LOREG) printf(" LO");
10038         else printf(" r%d",r);
10039       }
10040     }
10041 #ifndef FORCE32
10042     printf(" UU:");
10043     for(r=1;r<=CCREG;r++) {
10044       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10045         if(r==HIREG) printf(" HI");
10046         else if(r==LOREG) printf(" LO");
10047         else printf(" r%d",r);
10048       }
10049     }
10050     printf(" 32:");
10051     for(r=0;r<=CCREG;r++) {
10052       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10053       if((regs[i].was32>>r)&1) {
10054         if(r==CCREG) printf(" CC");
10055         else if(r==HIREG) printf(" HI");
10056         else if(r==LOREG) printf(" LO");
10057         else printf(" r%d",r);
10058       }
10059     }
10060 #endif
10061     printf("\n");
10062     #if defined(__i386__) || defined(__x86_64__)
10063     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10064     #endif
10065     #ifdef __arm__
10066     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10067     #endif
10068     printf("needs: ");
10069     if(needed_reg[i]&1) printf("eax ");
10070     if((needed_reg[i]>>1)&1) printf("ecx ");
10071     if((needed_reg[i]>>2)&1) printf("edx ");
10072     if((needed_reg[i]>>3)&1) printf("ebx ");
10073     if((needed_reg[i]>>5)&1) printf("ebp ");
10074     if((needed_reg[i]>>6)&1) printf("esi ");
10075     if((needed_reg[i]>>7)&1) printf("edi ");
10076     printf("r:");
10077     for(r=0;r<=CCREG;r++) {
10078       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10079       if((requires_32bit[i]>>r)&1) {
10080         if(r==CCREG) printf(" CC");
10081         else if(r==HIREG) printf(" HI");
10082         else if(r==LOREG) printf(" LO");
10083         else printf(" r%d",r);
10084       }
10085     }
10086     printf("\n");
10087     /*printf("pr:");
10088     for(r=0;r<=CCREG;r++) {
10089       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10090       if((pr32[i]>>r)&1) {
10091         if(r==CCREG) printf(" CC");
10092         else if(r==HIREG) printf(" HI");
10093         else if(r==LOREG) printf(" LO");
10094         else printf(" r%d",r);
10095       }
10096     }
10097     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10098     printf("\n");*/
10099     #if defined(__i386__) || defined(__x86_64__)
10100     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10101     printf("dirty: ");
10102     if(regs[i].wasdirty&1) printf("eax ");
10103     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10104     if((regs[i].wasdirty>>2)&1) printf("edx ");
10105     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10106     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10107     if((regs[i].wasdirty>>6)&1) printf("esi ");
10108     if((regs[i].wasdirty>>7)&1) printf("edi ");
10109     #endif
10110     #ifdef __arm__
10111     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10112     printf("dirty: ");
10113     if(regs[i].wasdirty&1) printf("r0 ");
10114     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10115     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10116     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10117     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10118     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10119     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10120     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10121     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10122     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10123     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10124     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10125     #endif
10126     printf("\n");
10127     disassemble_inst(i);
10128     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10129     #if defined(__i386__) || defined(__x86_64__)
10130     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10131     if(regs[i].dirty&1) printf("eax ");
10132     if((regs[i].dirty>>1)&1) printf("ecx ");
10133     if((regs[i].dirty>>2)&1) printf("edx ");
10134     if((regs[i].dirty>>3)&1) printf("ebx ");
10135     if((regs[i].dirty>>5)&1) printf("ebp ");
10136     if((regs[i].dirty>>6)&1) printf("esi ");
10137     if((regs[i].dirty>>7)&1) printf("edi ");
10138     #endif
10139     #ifdef __arm__
10140     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10141     if(regs[i].dirty&1) printf("r0 ");
10142     if((regs[i].dirty>>1)&1) printf("r1 ");
10143     if((regs[i].dirty>>2)&1) printf("r2 ");
10144     if((regs[i].dirty>>3)&1) printf("r3 ");
10145     if((regs[i].dirty>>4)&1) printf("r4 ");
10146     if((regs[i].dirty>>5)&1) printf("r5 ");
10147     if((regs[i].dirty>>6)&1) printf("r6 ");
10148     if((regs[i].dirty>>7)&1) printf("r7 ");
10149     if((regs[i].dirty>>8)&1) printf("r8 ");
10150     if((regs[i].dirty>>9)&1) printf("r9 ");
10151     if((regs[i].dirty>>10)&1) printf("r10 ");
10152     if((regs[i].dirty>>12)&1) printf("r12 ");
10153     #endif
10154     printf("\n");
10155     if(regs[i].isconst) {
10156       printf("constants: ");
10157       #if defined(__i386__) || defined(__x86_64__)
10158       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10159       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10160       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10161       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10162       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10163       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10164       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10165       #endif
10166       #ifdef __arm__
10167       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10168       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10169       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10170       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10171       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10172       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10173       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10174       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10175       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10176       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10177       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10178       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10179       #endif
10180       printf("\n");
10181     }
10182 #ifndef FORCE32
10183     printf(" 32:");
10184     for(r=0;r<=CCREG;r++) {
10185       if((regs[i].is32>>r)&1) {
10186         if(r==CCREG) printf(" CC");
10187         else if(r==HIREG) printf(" HI");
10188         else if(r==LOREG) printf(" LO");
10189         else printf(" r%d",r);
10190       }
10191     }
10192     printf("\n");
10193 #endif
10194     /*printf(" p32:");
10195     for(r=0;r<=CCREG;r++) {
10196       if((p32[i]>>r)&1) {
10197         if(r==CCREG) printf(" CC");
10198         else if(r==HIREG) printf(" HI");
10199         else if(r==LOREG) printf(" LO");
10200         else printf(" r%d",r);
10201       }
10202     }
10203     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10204     else printf("\n");*/
10205     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10206       #if defined(__i386__) || defined(__x86_64__)
10207       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10208       if(branch_regs[i].dirty&1) printf("eax ");
10209       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10210       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10211       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10212       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10213       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10214       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10215       #endif
10216       #ifdef __arm__
10217       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10218       if(branch_regs[i].dirty&1) printf("r0 ");
10219       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10220       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10221       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10222       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10223       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10224       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10225       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10226       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10227       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10228       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10229       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10230       #endif
10231 #ifndef FORCE32
10232       printf(" 32:");
10233       for(r=0;r<=CCREG;r++) {
10234         if((branch_regs[i].is32>>r)&1) {
10235           if(r==CCREG) printf(" CC");
10236           else if(r==HIREG) printf(" HI");
10237           else if(r==LOREG) printf(" LO");
10238           else printf(" r%d",r);
10239         }
10240       }
10241       printf("\n");
10242 #endif
10243     }
10244   }
10245
10246   /* Pass 8 - Assembly */
10247   linkcount=0;stubcount=0;
10248   ds=0;is_delayslot=0;
10249   cop1_usable=0;
10250   uint64_t is32_pre=0;
10251   u_int dirty_pre=0;
10252   u_int beginning=(u_int)out;
10253   if((u_int)addr&1) {
10254     ds=1;
10255     pagespan_ds();
10256   }
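  // (Per instruction: write back any dirty values whose mapping is not
  //  carried forward, record the native entry point in instr_addr[], load
  //  the guest registers this instruction and -- for branches -- its delay
  //  slot need, regenerate constants, then dispatch to the per-type
  //  assembler.  Branch assemblers emit the delay slot themselves, so the
  //  next iteration skips it via the ds flag.)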
10257   for(i=0;i<slen;i++)
10258   {
10259     //if(ds) printf("ds: ");
10260     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10261     if(ds) {
10262       ds=0; // Skip delay slot
10263       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10264       instr_addr[i]=0;
10265     } else {
10266       #ifndef DESTRUCTIVE_WRITEBACK
10267       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10268       {
10269         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10270               unneeded_reg[i],unneeded_reg_upper[i]);
10271         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10272               unneeded_reg[i],unneeded_reg_upper[i]);
10273       }
10274       is32_pre=regs[i].is32;
10275       dirty_pre=regs[i].dirty;
10276       #endif
10277       // write back
10278       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10279       {
10280         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10281                       unneeded_reg[i],unneeded_reg_upper[i]);
10282         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10283       }
10284       // branch target entry point
10285       instr_addr[i]=(u_int)out;
10286       assem_debug("<->\n");
10287       // load regs
10288       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10289         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10290       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10291       address_generation(i,&regs[i],regs[i].regmap_entry);
10292       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10293       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10294       {
10295         // Load the delay slot registers if necessary
10296         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10297           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10298         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10299           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10300         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10301           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10302       }
10303       else if(i+1<slen)
10304       {
10305         // Preload registers for following instruction
10306         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10307           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10308             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10309         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10310           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10311             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10312       }
10313       // TODO: if(is_ooo(i)) address_generation(i+1);
10314       if(itype[i]==CJUMP||itype[i]==FJUMP)
10315         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10316       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10317         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10318       if(bt[i]) cop1_usable=0;
10319       // assemble
10320       switch(itype[i]) {
10321         case ALU:
10322           alu_assemble(i,&regs[i]);break;
10323         case IMM16:
10324           imm16_assemble(i,&regs[i]);break;
10325         case SHIFT:
10326           shift_assemble(i,&regs[i]);break;
10327         case SHIFTIMM:
10328           shiftimm_assemble(i,&regs[i]);break;
10329         case LOAD:
10330           load_assemble(i,&regs[i]);break;
10331         case LOADLR:
10332           loadlr_assemble(i,&regs[i]);break;
10333         case STORE:
10334           store_assemble(i,&regs[i]);break;
10335         case STORELR:
10336           storelr_assemble(i,&regs[i]);break;
10337         case COP0:
10338           cop0_assemble(i,&regs[i]);break;
10339         case COP1:
10340           cop1_assemble(i,&regs[i]);break;
10341         case C1LS:
10342           c1ls_assemble(i,&regs[i]);break;
10343         case FCONV:
10344           fconv_assemble(i,&regs[i]);break;
10345         case FLOAT:
10346           float_assemble(i,&regs[i]);break;
10347         case FCOMP:
10348           fcomp_assemble(i,&regs[i]);break;
10349         case MULTDIV:
10350           multdiv_assemble(i,&regs[i]);break;
10351         case MOV:
10352           mov_assemble(i,&regs[i]);break;
10353         case SYSCALL:
10354           syscall_assemble(i,&regs[i]);break;
10355         case HLECALL:
10356           hlecall_assemble(i,&regs[i]);break;
10357         case UJUMP:
10358           ujump_assemble(i,&regs[i]);ds=1;break;
10359         case RJUMP:
10360           rjump_assemble(i,&regs[i]);ds=1;break;
10361         case CJUMP:
10362           cjump_assemble(i,&regs[i]);ds=1;break;
10363         case SJUMP:
10364           sjump_assemble(i,&regs[i]);ds=1;break;
10365         case FJUMP:
10366           fjump_assemble(i,&regs[i]);ds=1;break;
10367         case SPAN:
10368           pagespan_assemble(i,&regs[i]);break;
10369       }
10370       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10371         literal_pool(1024);
10372       else
10373         literal_pool_jumpover(256);
10374     }
10375   }
10376   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10377   // If the block did not end with an unconditional branch,
10378   // add a jump to the next instruction.
10379   if(i>1) {
10380     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10381       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10382       assert(i==slen);
10383       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10384         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10385         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10386           emit_loadreg(CCREG,HOST_CCREG);
10387         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10388       }
10389       else if(!likely[i-2])
10390       {
10391         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10392         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10393       }
10394       else
10395       {
10396         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10397         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10398       }
10399       add_to_linker((int)out,start+i*4,0);
10400       emit_jmp(0);
10401     }
10402   }
10403   else
10404   {
10405     assert(i>0);
10406     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10407     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10408     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10409       emit_loadreg(CCREG,HOST_CCREG);
10410     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10411     add_to_linker((int)out,start+i*4,0);
10412     emit_jmp(0);
10413   }
10414
10415   // TODO: delay slot stubs?
10416   // Stubs
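  // (Stubs are the out-of-line slow paths recorded during assembly: memory
  //  read/write fall-backs, the cycle-count check, code invalidation,
  //  unaligned stores and the FPU-unusable exception.  Emitting them here
  //  keeps the main code path straight-line.)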
10417   for(i=0;i<stubcount;i++)
10418   {
10419     switch(stubs[i][0])
10420     {
10421       case LOADB_STUB:
10422       case LOADH_STUB:
10423       case LOADW_STUB:
10424       case LOADD_STUB:
10425       case LOADBU_STUB:
10426       case LOADHU_STUB:
10427         do_readstub(i);break;
10428       case STOREB_STUB:
10429       case STOREH_STUB:
10430       case STOREW_STUB:
10431       case STORED_STUB:
10432         do_writestub(i);break;
10433       case CC_STUB:
10434         do_ccstub(i);break;
10435       case INVCODE_STUB:
10436         do_invstub(i);break;
10437       case FP_STUB:
10438         do_cop1stub(i);break;
10439       case STORELR_STUB:
10440         do_unalignedwritestub(i);break;
10441     }
10442   }
10443
10444   /* Pass 9 - Linker */
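  // (Each recorded branch is patched here.  External branches get an
  //  emit_extjump() stub; if the target block already exists (check_addr),
  //  the branch is pointed straight at it and the stub is registered with
  //  add_link(), presumably so the branch can be redirected back to the
  //  stub if that block is later invalidated.  Internal branches are
  //  patched directly to instr_addr[target].)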
10445   for(i=0;i<linkcount;i++)
10446   {
10447     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10448     literal_pool(64);
10449     if(!link_addr[i][2])
10450     {
10451       void *stub=out;
10452       void *addr=check_addr(link_addr[i][1]);
10453       emit_extjump(link_addr[i][0],link_addr[i][1]);
10454       if(addr) {
10455         set_jump_target(link_addr[i][0],(int)addr);
10456         add_link(link_addr[i][1],stub);
10457       }
10458       else set_jump_target(link_addr[i][0],(int)stub);
10459     }
10460     else
10461     {
10462       // Internal branch
10463       int target=(link_addr[i][1]-start)>>2;
10464       assert(target>=0&&target<slen);
10465       assert(instr_addr[target]);
10466       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10467       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10468       //#else
10469       set_jump_target(link_addr[i][0],instr_addr[target]);
10470       //#endif
10471     }
10472   }
10473   // External Branch Targets (jump_in)
10474   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
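  // Register an entry point for every branch target in the block (and for the
  // block start).  jump_dirty points at the "dirty" stub emitted next, which
  // re-checks the block against the shadow copy made below before entering
  // it, while jump_in gets the address returned by do_dirty_stub() as the
  // regular entry point.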
10475   for(i=0;i<slen;i++)
10476   {
10477     if(bt[i]||i==0)
10478     {
10479       if(instr_addr[i]) // TODO - delay slots (=null)
10480       {
10481         u_int vaddr=start+i*4;
10482         u_int page=get_page(vaddr);
10483         u_int vpage=get_vpage(vaddr);
10484         literal_pool(256);
10485         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10486         if(!requires_32bit[i])
10487         {
10488           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10489           assem_debug("jump_in: %x\n",start+i*4);
10490           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10491           int entry_point=do_dirty_stub(i);
10492           ll_add(jump_in+page,vaddr,(void *)entry_point);
10493           // If there was an existing entry in the hash table,
10494           // replace it with the new address.
10495           // Don't add new entries.  We'll insert the
10496           // ones that actually get used in check_addr().
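          // Each hash bin holds two (vaddr, compiled entry) pairs:
          //   { ht_bin[0], ht_bin[1] } and { ht_bin[2], ht_bin[3] }.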
10497           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10498           if(ht_bin[0]==vaddr) {
10499             ht_bin[1]=entry_point;
10500           }
10501           if(ht_bin[2]==vaddr) {
10502             ht_bin[3]=entry_point;
10503           }
10504         }
10505         else
10506         {
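          // Restricted entry: presumably only valid for callers that keep the
          // registers in this mask in 32-bit (non-sign-extended) form.  The
          // 64-bit requires_32bit mask is folded into the u_int reg32 field;
          // bit 0 is set if any of the upper 32 bits were set.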
10507           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10508           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10509           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10510           //int entry_point=(int)out;
10511           ////assem_debug("entry_point: %x\n",entry_point);
10512           //load_regs_entry(i);
10513           //if(entry_point==(int)out)
10514           //  entry_point=instr_addr[i];
10515           //else
10516           //  emit_jmp(instr_addr[i]);
10517           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10518           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10519           int entry_point=do_dirty_stub(i);
10520           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10521         }
10522       }
10523     }
10524   }
10525   // Write out the literal pool if necessary
10526   literal_pool(0);
10527   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10528   // Align code to an 8-byte boundary
10529   if(((u_int)out)&7) emit_addnop(13);
10530   #endif
10531   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10532   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10533   memcpy(copy,source,slen*4);
10534   copy+=slen*4;
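  // Keep a shadow copy of the source MIPS code; the dirty stubs emitted above
  // compare RAM against this copy to detect modified code before re-entering
  // the block.  The copy pointer wraps around the shadow buffer (see the size
  // check before the jump_in loop above).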
10535   
10536   #ifdef __arm__
10537   __clear_cache((void *)beginning,out);
10538   #endif
10539   
10540   // If we're within MAX_OUTPUT_BLOCK_SIZE (256K) of the end of the buffer,
10541   // start over from the beginning; no block can exceed that size, so this is enough.
10542   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10543   
10544   // Trap writes to any of the pages we compiled
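  // invalid_code[page]=0 marks the page as containing compiled code; the
  // 0x40000000 bit in memory_map makes stores to that page take the slow,
  // invalidating path, so translations are discarded when the game writes
  // over its own code.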
10545   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10546     invalid_code[i]=0;
10547 #ifndef DISABLE_TLB
10548     memory_map[i]|=0x40000000;
10549     if((signed int)start>=(signed int)0xC0000000) {
10550       assert(using_tlb);
10551       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
10552       invalid_code[j]=0;
10553       memory_map[j]|=0x40000000;
10554       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
10555     }
10556 #endif
10557   }
10558   
10559   /* Pass 10 - Free memory by expiring oldest blocks */
10560   
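  // expirep is a 16-bit cursor over the translation cache:
  //   bits 13-15: which of 8 equal regions of the cache to sweep
  //   bits 11-12: which structure to sweep (the switch below)
  //   bits  0-10: bucket index within that structure
  // 'end' maps the current output pointer into the same 16-bit space plus a
  // quarter turn (16384), so old blocks keep being evicted until the sweep is
  // roughly two regions ahead of where new code is being written.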
10561   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10562   while(expirep!=end)
10563   {
10564     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10565     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10566     inv_debug("EXP: Phase %d\n",expirep);
10567     switch((expirep>>11)&3)
10568     {
10569       case 0:
10570         // Clear jump_in and jump_dirty
10571         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10572         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10573         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10574         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10575         break;
10576       case 1:
10577         // Clear pointers in jump_out that target code in the expiring region
10578         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10579         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10580         break;
10581       case 2:
10582         // Clear hash table
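        // Drop hash entries whose compiled address lies in the expiring
        // region; the (addr-MAX_OUTPUT_BLOCK_SIZE) test also catches entries
        // whose block body may begin inside the region even though the entry
        // point itself lies just past it.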
10583         for(i=0;i<32;i++) {
10584           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10585           if((ht_bin[3]>>shift)==(base>>shift) ||
10586              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10587             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10588             ht_bin[2]=ht_bin[3]=-1;
10589           }
10590           if((ht_bin[1]>>shift)==(base>>shift) ||
10591              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10592             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10593             ht_bin[0]=ht_bin[2];
10594             ht_bin[1]=ht_bin[3];
10595             ht_bin[2]=ht_bin[3]=-1;
10596           }
10597         }
10598         break;
10599       case 3:
10600         // Clear jump_out
10601         #ifdef __arm__
10602         if((expirep&2047)==0)
10603           __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
10604         #endif
10605         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10606         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10607         break;
10608     }
10609     expirep=(expirep+1)&65535;
10610   }
10611   return 0;
10612 }