1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "../recomp.h"
26 #include "../recomph.h" //include for function prototypes
27 #include "../macros.h"
28 #include "../r4300.h"
29 #include "../ops.h"
30 #include "../interupt.h"
31
32 #include "../../memory/memory.h"
33
34 #include <sys/mman.h>
35
36 #ifdef __i386__
37 #include "assem_x86.h"
38 #endif
39 #ifdef __x86_64__
40 #include "assem_x64.h"
41 #endif
42 #ifdef __arm__
43 #include "assem_arm.h"
44 #endif
45
46 #define MAXBLOCK 4096
47 #define MAX_OUTPUT_BLOCK_SIZE 262144
48 #define CLOCK_DIVIDER 2
49
50 struct regstat
51 {
52   signed char regmap_entry[HOST_REGS];
53   signed char regmap[HOST_REGS];
54   uint64_t was32;
55   uint64_t is32;
56   uint64_t wasdirty;
57   uint64_t dirty;
58   uint64_t u;
59   uint64_t uu;
60   u_int wasconst;
61   u_int isconst;
62   uint64_t constmap[HOST_REGS];
63 };
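// Per-instruction register-allocation state (an illustrative summary,
// inferred from how the fields are used later in this file):
//   regmap[hr]       guest register held in host register hr, or -1 if free;
//                    1-31 are MIPS GPRs, 32+ are the internal registers
//                    defined below (HIREG, LOREG, CCREG, ...); reg|64 denotes
//                    the upper half of a 64-bit guest register.
//   regmap_entry[]   the mapping expected on entry to the instruction.
//   is32/was32       bitmask of guest registers currently holding 32-bit
//                    sign-extended values (upper half is just the sign).
//   dirty/wasdirty   host registers whose contents still need write-back.
//   u/uu             guest registers whose lower/upper halves are not needed
//                    by any later instruction (see unneeded_reg[] below).
//   isconst/constmap constant propagation: if bit hr of isconst is set,
//                    host register hr is known to hold constmap[hr].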
64
65 struct ll_entry
66 {
67   u_int vaddr;
68   u_int reg32;
69   void *addr;
70   struct ll_entry *next;
71 };
72
73   u_int start;
74   u_int *source;
75   u_int pagelimit;
76   char insn[MAXBLOCK][10];
77   u_char itype[MAXBLOCK];
78   u_char opcode[MAXBLOCK];
79   u_char opcode2[MAXBLOCK];
80   u_char bt[MAXBLOCK];
81   u_char rs1[MAXBLOCK];
82   u_char rs2[MAXBLOCK];
83   u_char rt1[MAXBLOCK];
84   u_char rt2[MAXBLOCK];
85   u_char us1[MAXBLOCK];
86   u_char us2[MAXBLOCK];
87   u_char dep1[MAXBLOCK];
88   u_char dep2[MAXBLOCK];
89   u_char lt1[MAXBLOCK];
90   int imm[MAXBLOCK];
91   u_int ba[MAXBLOCK];
92   char likely[MAXBLOCK];
93   char is_ds[MAXBLOCK];
94   uint64_t unneeded_reg[MAXBLOCK];
95   uint64_t unneeded_reg_upper[MAXBLOCK];
96   uint64_t branch_unneeded_reg[MAXBLOCK];
97   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
98   uint64_t p32[MAXBLOCK];
99   uint64_t pr32[MAXBLOCK];
100   signed char regmap_pre[MAXBLOCK][HOST_REGS];
101   signed char regmap[MAXBLOCK][HOST_REGS];
102   signed char regmap_entry[MAXBLOCK][HOST_REGS];
103   uint64_t constmap[MAXBLOCK][HOST_REGS];
104   uint64_t known_value[HOST_REGS];
105   u_int known_reg;
106   struct regstat regs[MAXBLOCK];
107   struct regstat branch_regs[MAXBLOCK];
108   u_int needed_reg[MAXBLOCK];
109   uint64_t requires_32bit[MAXBLOCK];
110   u_int wont_dirty[MAXBLOCK];
111   u_int will_dirty[MAXBLOCK];
112   int ccadj[MAXBLOCK];
113   int slen;
114   u_int instr_addr[MAXBLOCK];
115   u_int link_addr[MAXBLOCK][3];
116   int linkcount;
117   u_int stubs[MAXBLOCK*3][8];
118   int stubcount;
119   u_int literals[1024][2];
120   int literalcount;
121   int is_delayslot;
122   int cop1_usable;
123   u_char *out;
124   struct ll_entry *jump_in[4096];
125   struct ll_entry *jump_out[4096];
126   struct ll_entry *jump_dirty[4096];
127   u_int hash_table[65536][4]  __attribute__((aligned(16)));
128   char shadow[1048576]  __attribute__((aligned(16)));
129   void *copy;
130   int expirep;
131   u_int using_tlb;
132   u_int stop_after_jal;
133   extern u_char restore_candidate[512];
134   extern int cycle_count;
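// Main lookup structures, as used by get_addr()/get_addr_ht() below
// (a descriptive sketch, not authoritative):
//   jump_in[page]    - linked list of compiled entry points for a 4K page
//   jump_out[page]   - locations in other blocks that link directly into this
//                      page, so the links can be undone on invalidation
//   jump_dirty[page] - compiled blocks whose source memory may have changed
//   hash_table[]     - 2-way cache keyed by ((vaddr>>16)^vaddr)&0xFFFF, each
//                      bin holding two {vaddr,addr} pairs: bin[0]/bin[1] is
//                      the most recent entry, bin[2]/bin[3] the older one.
// A hit therefore looks roughly like:
//   u_int *bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
//   if(bin[0]==vaddr) return (void *)bin[1];
//   if(bin[2]==vaddr) return (void *)bin[3];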
135
136   /* registers that may be allocated */
137   /* 1-31 gpr */
138 #define HIREG 32 // hi
139 #define LOREG 33 // lo
140 #define FSREG 34 // FPU status (FCSR)
141 #define CSREG 35 // Coprocessor status
142 #define CCREG 36 // Cycle count
143 #define INVCP 37 // Pointer to invalid_code
144 #define TEMPREG 38
145 #define FTEMP 38 // FPU temporary register
146 #define PTEMP 39 // Prefetch temporary register
147 #define TLREG 40 // TLB mapping offset
148 #define RHASH 41 // Return address hash
149 #define RHTBL 42 // Return address hash table address
150 #define RTEMP 43 // JR/JALR address register
151 #define MAXREG 43
152 #define AGEN1 44 // Address generation temporary register
153 #define AGEN2 45 // Address generation temporary register
154 #define MGEN1 46 // Maptable address generation temporary register
155 #define MGEN2 47 // Maptable address generation temporary register
156 #define BTREG 48 // Branch target temporary register
157
158   /* instruction types */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move 
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9 // Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert integer to float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22 // SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185
186   /* stubs */
187 #define CC_STUB 1
188 #define FP_STUB 2
189 #define LOADB_STUB 3
190 #define LOADH_STUB 4
191 #define LOADW_STUB 5
192 #define LOADD_STUB 6
193 #define LOADBU_STUB 7
194 #define LOADHU_STUB 8
195 #define STOREB_STUB 9
196 #define STOREH_STUB 10
197 #define STOREW_STUB 11
198 #define STORED_STUB 12
199 #define STORELR_STUB 13
200 #define INVCODE_STUB 14
201
202   /* branch codes */
203 #define TAKEN 1
204 #define NOTTAKEN 2
205 #define NULLDS 3
206
207 // asm linkage
208 int new_recompile_block(int addr);
209 void *get_addr_ht(u_int vaddr);
210 void invalidate_block(u_int block);
211 void invalidate_addr(u_int addr);
212 void remove_hash(int vaddr);
213 void jump_vaddr();
214 void dyna_linker();
215 void dyna_linker_ds();
216 void verify_code();
217 void verify_code_vm();
218 void verify_code_ds();
219 void cc_interrupt();
220 void fp_exception();
221 void fp_exception_ds();
222 void jump_syscall();
223 void jump_eret();
224
225 // TLB
226 void TLBWI_new();
227 void TLBWR_new();
228 void read_nomem_new();
229 void read_nomemb_new();
230 void read_nomemh_new();
231 void read_nomemd_new();
232 void write_nomem_new();
233 void write_nomemb_new();
234 void write_nomemh_new();
235 void write_nomemd_new();
236 void write_rdram_new();
237 void write_rdramb_new();
238 void write_rdramh_new();
239 void write_rdramd_new();
240 extern u_int memory_map[1048576];
241
242 // Needed by assembler
243 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
244 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
245 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
246 void load_all_regs(signed char i_regmap[]);
247 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
248 void load_regs_entry(int t);
249 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
250
251 int tracedebug=0;
252
253 //#define DEBUG_CYCLE_COUNT 1
254
255 void nullf() {}
256 //#define assem_debug printf
257 //#define inv_debug printf
258 #define assem_debug nullf
259 #define inv_debug nullf
260
261 void tlb_hacks()
262 {
263   // Goldeneye hack
264   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
265   {
266     u_int addr;
267     int n;
268     switch (ROM_HEADER->Country_code&0xFF) 
269     {
270       case 0x45: // U
271         addr=0x34b30;
272         break;                   
273       case 0x4A: // J 
274         addr=0x34b70;    
275         break;    
276       case 0x50: // E 
277         addr=0x329f0;
278         break;                        
279       default: 
280         // Unknown country code
281         addr=0;
282         break;
283     }
284     u_int rom_addr=(u_int)rom;
285     #ifdef ROM_COPY
286     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
287     // in the lower 4G of memory to use this hack.  Copy it if necessary.
288     if((void *)rom>(void *)0xffffffff) {
289       munmap(ROM_COPY, 67108864);
290       if(mmap(ROM_COPY, 12582912,
291               PROT_READ | PROT_WRITE,
292               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
293               -1, 0) <= 0) {printf("mmap() failed\n");}
294       memcpy(ROM_COPY,rom,12582912);
295       rom_addr=(u_int)ROM_COPY;
296     }
297     #endif
298     if(addr) {
299       for(n=0x7F000;n<0x80000;n++) {
300         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
301       }
302     }
303   }
304 }
305
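// Page-index convention used throughout (a descriptive note):
//   page=(vaddr^0x80000000)>>12 maps KSEG0/KSEG1 addresses to their 4K page,
//   e.g. vaddr 0x80123456 -> page 0x123.  Pages above 262143 are TLB-mapped;
//   they are translated through tlb_LUT_r[] and then folded into the range
//   2048-4095 (2048+(page&2047)), so the jump_in/jump_out arrays only need
//   4096 slots.  jump_dirty is indexed by a hash of the virtual page instead,
//   since the physical mapping behind it may have changed.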
306 // Get address from virtual address
307 // This is called from the recompiled JR/JALR instructions
308 void *get_addr(u_int vaddr)
309 {
310   u_int page=(vaddr^0x80000000)>>12;
311   u_int vpage=page;
312   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
313   if(page>2048) page=2048+(page&2047);
314   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
315   if(vpage>2048) vpage=2048+(vpage&2047);
316   struct ll_entry *head;
317   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
318   head=jump_in[page];
319   while(head!=NULL) {
320     if(head->vaddr==vaddr&&head->reg32==0) {
321   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
322       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
323       ht_bin[3]=ht_bin[1];
324       ht_bin[2]=ht_bin[0];
325       ht_bin[1]=(int)head->addr;
326       ht_bin[0]=vaddr;
327       return head->addr;
328     }
329     head=head->next;
330   }
331   head=jump_dirty[vpage];
332   while(head!=NULL) {
333     if(head->vaddr==vaddr&&head->reg32==0) {
334       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
335       // Don't restore blocks which are about to expire from the cache
336       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
337       if(verify_dirty(head->addr)) {
338         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
339         invalid_code[vaddr>>12]=0;
340         memory_map[vaddr>>12]|=0x40000000;
341         if(vpage<2048) {
342           if(tlb_LUT_r[vaddr>>12]) {
343             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
344             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
345           }
346           restore_candidate[vpage>>3]|=1<<(vpage&7);
347         }
348         else restore_candidate[page>>3]|=1<<(page&7);
349         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
350         if(ht_bin[0]==vaddr) {
351           ht_bin[1]=(int)head->addr; // Replace existing entry
352         }
353         else
354         {
355           ht_bin[3]=ht_bin[1];
356           ht_bin[2]=ht_bin[0];
357           ht_bin[1]=(int)head->addr;
358           ht_bin[0]=vaddr;
359         }
360         return head->addr;
361       }
362     }
363     head=head->next;
364   }
365   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
366   int r=new_recompile_block(vaddr);
367   if(r==0) return get_addr(vaddr);
368   // Execute in unmapped page, generate pagefault exception
369   Status|=2;
370   Cause=(vaddr<<31)|0x8;
371   EPC=(vaddr&1)?vaddr-5:vaddr;
372   BadVAddr=(vaddr&~1);
373   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
374   EntryHi=BadVAddr&0xFFFFE000;
375   return get_addr_ht(0x80000000);
376 }
377 // Look up address in hash table first
378 void *get_addr_ht(u_int vaddr)
379 {
380   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
381   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
382   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
383   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
384   return get_addr(vaddr);
385 }
386
387 void *get_addr_32(u_int vaddr,u_int flags)
388 {
389   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
390   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
391   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
392   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
393   u_int page=(vaddr^0x80000000)>>12;
394   u_int vpage=page;
395   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
396   if(page>2048) page=2048+(page&2047);
397   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
398   if(vpage>2048) vpage=2048+(vpage&2047);
399   struct ll_entry *head;
400   head=jump_in[page];
401   while(head!=NULL) {
402     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
403       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
404       if(head->reg32==0) {
405         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
406         if(ht_bin[0]==-1) {
407           ht_bin[1]=(int)head->addr;
408           ht_bin[0]=vaddr;
409         }else if(ht_bin[2]==-1) {
410           ht_bin[3]=(int)head->addr;
411           ht_bin[2]=vaddr;
412         }
413         //ht_bin[3]=ht_bin[1];
414         //ht_bin[2]=ht_bin[0];
415         //ht_bin[1]=(int)head->addr;
416         //ht_bin[0]=vaddr;
417       }
418       return head->addr;
419     }
420     head=head->next;
421   }
422   head=jump_dirty[vpage];
423   while(head!=NULL) {
424     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
425       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
426       // Don't restore blocks which are about to expire from the cache
427       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
428       if(verify_dirty(head->addr)) {
429         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
430         invalid_code[vaddr>>12]=0;
431         memory_map[vaddr>>12]|=0x40000000;
432         if(vpage<2048) {
433           if(tlb_LUT_r[vaddr>>12]) {
434             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
435             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
436           }
437           restore_candidate[vpage>>3]|=1<<(vpage&7);
438         }
439         else restore_candidate[page>>3]|=1<<(page&7);
440         if(head->reg32==0) {
441           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
442           if(ht_bin[0]==-1) {
443             ht_bin[1]=(int)head->addr;
444             ht_bin[0]=vaddr;
445           }else if(ht_bin[2]==-1) {
446             ht_bin[3]=(int)head->addr;
447             ht_bin[2]=vaddr;
448           }
449           //ht_bin[3]=ht_bin[1];
450           //ht_bin[2]=ht_bin[0];
451           //ht_bin[1]=(int)head->addr;
452           //ht_bin[0]=vaddr;
453         }
454         return head->addr;
455       }
456     }
457     head=head->next;
458   }
459   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
460   int r=new_recompile_block(vaddr);
461   if(r==0) return get_addr(vaddr);
462   // Execute in unmapped page, generate pagefault exception
463   Status|=2;
464   Cause=(vaddr<<31)|0x8;
465   EPC=(vaddr&1)?vaddr-5:vaddr;
466   BadVAddr=(vaddr&~1);
467   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
468   EntryHi=BadVAddr&0xFFFFE000;
469   return get_addr_ht(0x80000000);
470 }
471
472 void clear_all_regs(signed char regmap[])
473 {
474   int hr;
475   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
476 }
477
478 signed char get_reg(signed char regmap[],int r)
479 {
480   int hr;
481   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
482   return -1;
483 }
484
485 // Find a register that is available for two consecutive cycles
486 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
487 {
488   int hr;
489   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
490   return -1;
491 }
492
493 int count_free_regs(signed char regmap[])
494 {
495   int count=0;
496   int hr;
497   for(hr=0;hr<HOST_REGS;hr++)
498   {
499     if(hr!=EXCLUDE_REG) {
500       if(regmap[hr]<0) count++;
501     }
502   }
503   return count;
504 }
505
506 void dirty_reg(struct regstat *cur,signed char reg)
507 {
508   int hr;
509   if(!reg) return;
510   for (hr=0;hr<HOST_REGS;hr++) {
511     if((cur->regmap[hr]&63)==reg) {
512       cur->dirty|=1<<hr;
513     }
514   }
515 }
516
517 // If we dirty the lower half of a 64 bit register which is now being
518 // sign-extended, we need to dump the upper half.
519 // Note: Do this only after completion of the instruction, because
520 // some instructions may need to read the full 64-bit value even if
521 // overwriting it (eg SLTI, DSRA32).
522 static void flush_dirty_uppers(struct regstat *cur)
523 {
524   int hr,reg;
525   for (hr=0;hr<HOST_REGS;hr++) {
526     if((cur->dirty>>hr)&1) {
527       reg=cur->regmap[hr];
528       if(reg>=64) 
529         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
530     }
531   }
532 }
533
534 void set_const(struct regstat *cur,signed char reg,uint64_t value)
535 {
536   int hr;
537   if(!reg) return;
538   for (hr=0;hr<HOST_REGS;hr++) {
539     if(cur->regmap[hr]==reg) {
540       cur->isconst|=1<<hr;
541       cur->constmap[hr]=value;
542     }
543     else if((cur->regmap[hr]^64)==reg) {
544       cur->isconst|=1<<hr;
545       cur->constmap[hr]=value>>32;
546     }
547   }
548 }
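// Constant propagation note: a 64-bit constant is tracked per host register.
// set_const() stores the full value in the register mapped to 'reg' and
// value>>32 in the register mapped to reg^64 (the upper half), if one is
// allocated.  For example (illustrative only):
//   set_const(cur,2,0x123456789LL);
//   // host reg holding r2    -> constmap = 0x123456789
//   // host reg holding r2|64 -> constmap = 0x1  (i.e. value>>32)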
549
550 void clear_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if((cur->regmap[hr]&63)==reg) {
556       cur->isconst&=~(1<<hr);
557     }
558   }
559 }
560
561 int is_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(!reg) return 1;
565   for (hr=0;hr<HOST_REGS;hr++) {
566     if((cur->regmap[hr]&63)==reg) {
567       return (cur->isconst>>hr)&1;
568     }
569   }
570   return 0;
571 }
572 uint64_t get_const(struct regstat *cur,signed char reg)
573 {
574   int hr;
575   if(!reg) return 0;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if(cur->regmap[hr]==reg) {
578       return cur->constmap[hr];
579     }
580   }
581   printf("Unknown constant in r%d\n",reg);
582   exit(1);
583 }
584
585 // Least soon needed registers
586 // Look at the next ten instructions and see which registers
587 // will be used.  Try not to reallocate these.
588 void lsn(u_char hsn[], int i, int *preferred_reg)
589 {
590   int j;
591   int b=-1;
592   for(j=0;j<9;j++)
593   {
594     if(i+j>=slen) {
595       j=slen-i-1;
596       break;
597     }
598     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
599     {
600       // Don't go past an unconditional jump
601       j++;
602       break;
603     }
604   }
605   for(;j>=0;j--)
606   {
607     if(rs1[i+j]) hsn[rs1[i+j]]=j;
608     if(rs2[i+j]) hsn[rs2[i+j]]=j;
609     if(rt1[i+j]) hsn[rt1[i+j]]=j;
610     if(rt2[i+j]) hsn[rt2[i+j]]=j;
611     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
612       // Stores can allocate zero
613       hsn[rs1[i+j]]=j;
614       hsn[rs2[i+j]]=j;
615     }
616     // On some architectures stores need invc_ptr
617     #if defined(HOST_IMM8)
618     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
619       hsn[INVCP]=j;
620     }
621     #endif
622     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
623     {
624       hsn[CCREG]=j;
625       b=j;
626     }
627   }
628   if(b>=0)
629   {
630     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
631     {
632       // Follow first branch
633       int t=(ba[i+b]-start)>>2;
634       j=7-b;if(t+j>=slen) j=slen-t-1;
635       for(;j>=0;j--)
636       {
637         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
638         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
639         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
640         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
641       }
642     }
643     // TODO: preferred register based on backward branch
644   }
645   // Delay slot should preferably not overwrite branch conditions or cycle count
646   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
647     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
648     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
649     hsn[CCREG]=1;
650     // ...or hash tables
651     hsn[RHASH]=1;
652     hsn[RHTBL]=1;
653   }
654   // Coprocessor load/store needs FTEMP, even if not declared
655   if(itype[i]==C1LS) {
656     hsn[FTEMP]=0;
657   }
658   // Load L/R also uses FTEMP as a temporary register
659   if(itype[i]==LOADLR) {
660     hsn[FTEMP]=0;
661   }
662   // Also 64-bit SDL/SDR
663   if(opcode[i]==0x2c||opcode[i]==0x2d) {
664     hsn[FTEMP]=0;
665   }
666   // Don't remove the TLB registers either
667   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
668     hsn[TLREG]=0;
669   }
670   // Don't remove the miniht registers
671   if(itype[i]==UJUMP||itype[i]==RJUMP)
672   {
673     hsn[RHASH]=0;
674     hsn[RHTBL]=0;
675   }
676 }
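// hsn[] convention (presumably "how soon needed"), as filled in by lsn()
// above: hsn[r] is the distance in instructions from i to the next use of
// guest register r within the lookahead window (0 = used by this instruction).
// Callers such as needed_again() pre-fill the array with 10, meaning
// "not needed within the next ten instructions".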
677
678 // We only want to allocate registers if we're going to use them again soon
679 int needed_again(int r, int i)
680 {
681   int j;
682   int b=-1;
683   int rn=10;
684   int hr;
685   u_char hsn[MAXREG+1];
686   int preferred_reg;
687   
688   memset(hsn,10,sizeof(hsn));
689   lsn(hsn,i,&preferred_reg);
690   
691   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
692   {
693     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
694       return 0; // Don't need any registers if exiting the block
695   }
696   for(j=0;j<9;j++)
697   {
698     if(i+j>=slen) {
699       j=slen-i-1;
700       break;
701     }
702     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
703     {
704       // Don't go past an unconditional jump
705       j++;
706       break;
707     }
708     if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d))
709     {
710       break;
711     }
712   }
713   for(;j>=1;j--)
714   {
715     if(rs1[i+j]==r) rn=j;
716     if(rs2[i+j]==r) rn=j;
717     if((unneeded_reg[i+j]>>r)&1) rn=10;
718     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
719     {
720       b=j;
721     }
722   }
723   /*
724   if(b>=0)
725   {
726     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
727     {
728       // Follow first branch
729       int o=rn;
730       int t=(ba[i+b]-start)>>2;
731       j=7-b;if(t+j>=slen) j=slen-t-1;
732       for(;j>=0;j--)
733       {
734         if(!((unneeded_reg[t+j]>>r)&1)) {
735           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
736           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
737         }
738         else rn=o;
739       }
740     }
741   }*/
742   for(hr=0;hr<HOST_REGS;hr++) {
743     if(hr!=EXCLUDE_REG) {
744       if(rn<hsn[hr]) return 1;
745     }
746   }
747   return 0;
748 }
749
750 // Try to match register allocations at the end of a loop with those
751 // at the beginning
752 int loop_reg(int i, int r, int hr)
753 {
754   int j,k;
755   for(j=0;j<9;j++)
756   {
757     if(i+j>=slen) {
758       j=slen-i-1;
759       break;
760     }
761     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
762     {
763       // Don't go past an unconditional jump
764       j++;
765       break;
766     }
767   }
768   k=0;
769   if(i>0){
770     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
771       k--;
772   }
773   for(;k<j;k++)
774   {
775     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
776     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
777     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
778     {
779       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
780       {
781         int t=(ba[i+k]-start)>>2;
782         int reg=get_reg(regs[t].regmap_entry,r);
783         if(reg>=0) return reg;
784         //reg=get_reg(regs[t+1].regmap_entry,r);
785         //if(reg>=0) return reg;
786       }
787     }
788   }
789   return hr;
790 }
791
792
793 // Allocate every register, preserving source/target regs
794 void alloc_all(struct regstat *cur,int i)
795 {
796   int hr;
797   
798   for(hr=0;hr<HOST_REGS;hr++) {
799     if(hr!=EXCLUDE_REG) {
800       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
801          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
802       {
803         cur->regmap[hr]=-1;
804         cur->dirty&=~(1<<hr);
805       }
806       // Don't need zeros
807       if((cur->regmap[hr]&63)==0)
808       {
809         cur->regmap[hr]=-1;
810         cur->dirty&=~(1<<hr);
811       }
812     }
813   }
814 }
815
816
817 void div64(int64_t dividend,int64_t divisor)
818 {
819   lo=dividend/divisor;
820   hi=dividend%divisor;
821   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
822   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
823 }
824 void divu64(uint64_t dividend,uint64_t divisor)
825 {
826   lo=dividend/divisor;
827   hi=dividend%divisor;
828   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
829   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
830 }
831
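// The 64x64 -> 128-bit multiplies below (mult64/multu64) use the schoolbook
// decomposition  m1*m2 = (a*2^32 + b)*(c*2^32 + d)
//                      = a*c*2^64 + (a*d + b*c)*2^32 + b*d
// with each partial product computed in 64 bits and the carries folded in
// column by column; the low 64 bits end up in lo and the high 64 bits in hi.
// For the signed variant the magnitudes are multiplied and the result is
// negated afterwards.  Worked example:
//   multu64(0xFFFFFFFFFFFFFFFFULL, 2) leaves lo=0xFFFFFFFFFFFFFFFE, hi=1.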
832 void mult64(uint64_t m1,uint64_t m2)
833 {
834    unsigned long long int op1, op2, op3, op4;
835    unsigned long long int result1, result2, result3, result4;
836    unsigned long long int temp1, temp2, temp3, temp4;
837    int sign = 0;
838    
839    if ((int64_t)m1 < 0) // m1 is unsigned; cast so the sign test is meaningful
840      {
841     op2 = -m1;
842     sign = 1 - sign;
843      }
844    else op2 = m1;
845    if ((int64_t)m2 < 0) // likewise, test the sign of the signed value
846      {
847     op4 = -m2;
848     sign = 1 - sign;
849      }
850    else op4 = m2;
851    
852    op1 = op2 & 0xFFFFFFFF;
853    op2 = (op2 >> 32) & 0xFFFFFFFF;
854    op3 = op4 & 0xFFFFFFFF;
855    op4 = (op4 >> 32) & 0xFFFFFFFF;
856    
857    temp1 = op1 * op3;
858    temp2 = (temp1 >> 32) + op1 * op4;
859    temp3 = op2 * op3;
860    temp4 = (temp3 >> 32) + op2 * op4;
861    
862    result1 = temp1 & 0xFFFFFFFF;
863    result2 = temp2 + (temp3 & 0xFFFFFFFF);
864    result3 = (result2 >> 32) + temp4;
865    result4 = (result3 >> 32);
866    
867    lo = result1 | (result2 << 32);
868    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
869    if (sign)
870      {
871     hi = ~hi;
872     if (!lo) hi++;
873     else lo = ~lo + 1;
874      }
875 }
876
877 void multu64(uint64_t m1,uint64_t m2)
878 {
879    unsigned long long int op1, op2, op3, op4;
880    unsigned long long int result1, result2, result3, result4;
881    unsigned long long int temp1, temp2, temp3, temp4;
882    
883    op1 = m1 & 0xFFFFFFFF;
884    op2 = (m1 >> 32) & 0xFFFFFFFF;
885    op3 = m2 & 0xFFFFFFFF;
886    op4 = (m2 >> 32) & 0xFFFFFFFF;
887    
888    temp1 = op1 * op3;
889    temp2 = (temp1 >> 32) + op1 * op4;
890    temp3 = op2 * op3;
891    temp4 = (temp3 >> 32) + op2 * op4;
892    
893    result1 = temp1 & 0xFFFFFFFF;
894    result2 = temp2 + (temp3 & 0xFFFFFFFF);
895    result3 = (result2 >> 32) + temp4;
896    result4 = (result3 >> 32);
897    
898    lo = result1 | (result2 << 32);
899    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
900    
901   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
902   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
903 }
904
905 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
906 {
907   if(bits) {
908     original<<=64-bits;
909     original>>=64-bits;
910     loaded<<=bits;
911     original|=loaded;
912   }
913   else original=loaded;
914   return original;
915 }
916 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
917 {
918   if(bits^56) {
919     original>>=64-(bits^56);
920     original<<=64-(bits^56);
921     loaded>>=bits^56;
922     original|=loaded;
923   }
924   else original=loaded;
925   return original;
926 }
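// LDL/LDR merge helpers for the unaligned 64-bit loads: 'bits' is the shift
// amount derived from the unaligned address.  ldl_merge() keeps the low
// 'bits' bits of the original register and fills the rest from 'loaded'
// shifted left by 'bits'; ldr_merge() is the mirror image using bits^56.
// For example (illustrative):
//   ldl_merge(0x1122334455667788, 0xAABBCCDDEEFF0011, 16)
//     == (0xAABBCCDDEEFF0011 << 16) | 0x7788
//     == 0xCCDDEEFF00117788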
927
928 #ifdef __i386__
929 #include "assem_x86.c"
930 #endif
931 #ifdef __x86_64__
932 #include "assem_x64.c"
933 #endif
934 #ifdef __arm__
935 #include "assem_arm.c"
936 #endif
937
938 // Add virtual address mapping to linked list
939 void ll_add(struct ll_entry **head,int vaddr,void *addr)
940 {
941   struct ll_entry *new_entry;
942   new_entry=malloc(sizeof(struct ll_entry));
943   assert(new_entry!=NULL);
944   new_entry->vaddr=vaddr;
945   new_entry->reg32=0;
946   new_entry->addr=addr;
947   new_entry->next=*head;
948   *head=new_entry;
949 }
950
951 // Add virtual address mapping for 32-bit compiled block
952 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
953 {
954   struct ll_entry *new_entry;
955   new_entry=malloc(sizeof(struct ll_entry));
956   assert(new_entry!=NULL);
957   new_entry->vaddr=vaddr;
958   new_entry->reg32=reg32;
959   new_entry->addr=addr;
960   new_entry->next=*head;
961   *head=new_entry;
962 }
963
964 // Check if an address is already compiled
965 // but don't return addresses which are about to expire from the cache
966 void *check_addr(u_int vaddr)
967 {
968   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
969   if(ht_bin[0]==vaddr) {
970     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
971       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
972   }
973   if(ht_bin[2]==vaddr) {
974     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
975       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
976   }
977   u_int page=(vaddr^0x80000000)>>12;
978   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
979   if(page>2048) page=2048+(page&2047);
980   struct ll_entry *head;
981   head=jump_in[page];
982   while(head!=NULL) {
983     if(head->vaddr==vaddr&&head->reg32==0) {
984       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
985         // Update existing entry with current address
986         if(ht_bin[0]==vaddr) {
987           ht_bin[1]=(int)head->addr;
988           return head->addr;
989         }
990         if(ht_bin[2]==vaddr) {
991           ht_bin[3]=(int)head->addr;
992           return head->addr;
993         }
994         // Insert into hash table with low priority.
995         // Don't evict existing entries, as they are probably
996         // addresses that are being accessed frequently.
997         if(ht_bin[0]==-1) {
998           ht_bin[1]=(int)head->addr;
999           ht_bin[0]=vaddr;
1000         }else if(ht_bin[2]==-1) {
1001           ht_bin[3]=(int)head->addr;
1002           ht_bin[2]=vaddr;
1003         }
1004         return head->addr;
1005       }
1006     }
1007     head=head->next;
1008   }
1009   return 0;
1010 }
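// A rough reading of the expiry test used above and in get_addr(): the
// translation cache behaves like a circular buffer, and
// (((u_int)addr-(u_int)out)<<(32-TARGET_SIZE_2)) appears to measure how far
// ahead of the current output pointer a block lives, modulo the cache size.
// Blocks falling inside the window that is about to be overwritten are
// treated as already gone rather than returned or restored.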
1011
1012 void remove_hash(int vaddr)
1013 {
1014   //printf("remove hash: %x\n",vaddr);
1015   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1016   if(ht_bin[2]==vaddr) {
1017     ht_bin[2]=ht_bin[3]=-1;
1018   }
1019   if(ht_bin[0]==vaddr) {
1020     ht_bin[0]=ht_bin[2];
1021     ht_bin[1]=ht_bin[3];
1022     ht_bin[2]=ht_bin[3]=-1;
1023   }
1024 }
1025
1026 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1027 {
1028   struct ll_entry *next;
1029   while(*head) {
1030     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1031        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1032     {
1033       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1034       remove_hash((*head)->vaddr);
1035       next=(*head)->next;
1036       free(*head);
1037       *head=next;
1038     }
1039     else
1040     {
1041       head=&((*head)->next);
1042     }
1043   }
1044 }
1045
1046 // Remove all entries from linked list
1047 void ll_clear(struct ll_entry **head)
1048 {
1049   struct ll_entry *cur;
1050   struct ll_entry *next;
1051   if((cur=*head)!=NULL) {
1052     *head=0;
1053     while(cur) {
1054       next=cur->next;
1055       free(cur);
1056       cur=next;
1057     }
1058   }
1059 }
1060
1061 // Dereference the pointers and remove if it matches
1062 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1063 {
1064   while(head) {
1065     int ptr=get_pointer(head->addr);
1066     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1067     if(((ptr>>shift)==(addr>>shift)) ||
1068        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1069     {
1070       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1071       kill_pointer(head->addr);
1072     }
1073     head=head->next;
1074   }
1075 }
1076
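// Invalidation (descriptive overview of invalidate_page/invalidate_block/
// invalidate_addr below): a write to a page containing compiled code lands
// here.  invalidate_page() drops the page's jump_in entries and unlinks every
// jump_out pointer into it, invalid_code[] is set so the blocks stay dead, and
// write protection is removed from memory_map[] so further writes to the same
// page no longer trap.  The blocks themselves are not freed here; they remain
// on jump_dirty and may be revived later by clean_blocks() if the source code
// turns out to be unchanged.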
1077 // This is called when we write to a compiled block (see do_invstub)
1078 int invalidate_page(u_int page)
1079 {
1080   int modified=0;
1081   struct ll_entry *head;
1082   struct ll_entry *next;
1083   head=jump_in[page];
1084   jump_in[page]=0;
1085   while(head!=NULL) {
1086     inv_debug("INVALIDATE: %x\n",head->vaddr);
1087     remove_hash(head->vaddr);
1088     next=head->next;
1089     free(head);
1090     head=next;
1091   }
1092   head=jump_out[page];
1093   jump_out[page]=0;
1094   while(head!=NULL) {
1095     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1096     kill_pointer(head->addr);
1097     modified=1;
1098     next=head->next;
1099     free(head);
1100     head=next;
1101   }
1102   return modified;
1103 }
1104 void invalidate_block(u_int block)
1105 {
1106   int modified;
1107   u_int page,vpage;
1108   page=vpage=block^0x80000;
1109   if(page>262143&&tlb_LUT_r[block]) page=(tlb_LUT_r[block]^0x80000000)>>12;
1110   if(page>2048) page=2048+(page&2047);
1111   if(vpage>262143&&tlb_LUT_r[block]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
1112   if(vpage>2048) vpage=2048+(vpage&2047);
1113   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1114   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1115   u_int first,last;
1116   first=last=page;
1117   struct ll_entry *head;
1118   head=jump_dirty[vpage];
1119   //printf("page=%d vpage=%d\n",page,vpage);
1120   while(head!=NULL) {
1121     u_int start,end;
1122     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1123       get_bounds((int)head->addr,&start,&end);
1124       //printf("start: %x end: %x\n",start,end);
1125       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1126         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1127           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1128           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1129         }
1130       }
1131       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1132         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1133           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1134           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1135         }
1136       }
1137     }
1138     head=head->next;
1139   }
1140   //printf("first=%d last=%d\n",first,last);
1141   modified=invalidate_page(page);
1142   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1143   assert(last<page+5);
1144   // Invalidate the adjacent pages if a block crosses a 4K boundary
1145   while(first<page) {
1146     invalidate_page(first);
1147     first++;
1148   }
1149   for(first=page+1;first<last;first++) {
1150     invalidate_page(first);
1151   }
1152   
1153   // Don't trap writes
1154   invalid_code[block]=1;
1155   // If there is a valid TLB entry for this page, remove write protect
1156   if(tlb_LUT_w[block]) {
1157     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1158     // CHECK: Is this right?
1159     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1160     u_int real_block=tlb_LUT_w[block]>>12;
1161     invalid_code[real_block]=1;
1162     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1163   }
1164   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1165   #ifdef __arm__
1166   if(modified)
1167     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1168   #endif
1169   #ifdef USE_MINI_HT
1170   memset(mini_ht,-1,sizeof(mini_ht));
1171   #endif
1172 }
1173 void invalidate_addr(u_int addr)
1174 {
1175   invalidate_block(addr>>12);
1176 }
1177 void invalidate_all_pages()
1178 {
1179   u_int page,n;
1180   for(page=0;page<4096;page++)
1181     invalidate_page(page);
1182   for(page=0;page<1048576;page++)
1183     if(!invalid_code[page]) {
1184       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1185       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1186     }
1187   #ifdef __arm__
1188   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1189   #endif
1190   #ifdef USE_MINI_HT
1191   memset(mini_ht,-1,sizeof(mini_ht));
1192   #endif
1193   // TLB
1194   for(page=0;page<0x100000;page++) {
1195     if(tlb_LUT_r[page]) {
1196       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1197       if(!tlb_LUT_w[page]||!invalid_code[page])
1198         memory_map[page]|=0x40000000; // Write protect
1199     }
1200     else memory_map[page]=-1;
1201     if(page==0x80000) page=0xC0000;
1202   }
1203   tlb_hacks();
1204 }
1205
1206 // Add an entry to jump_out after making a link
1207 void add_link(u_int vaddr,void *src)
1208 {
1209   u_int page=(vaddr^0x80000000)>>12;
1210   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
1211   if(page>4095) page=2048+(page&2047);
1212   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1213   ll_add(jump_out+page,vaddr,src);
1214   //int ptr=get_pointer(src);
1215   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1216 }
1217
1218 // If a code block was found to be unmodified (bit was set in
1219 // restore_candidate) and it remains unmodified (bit is clear
1220 // in invalid_code) then move the entries for that 4K page from
1221 // the dirty list to the clean list.
1222 void clean_blocks(u_int page)
1223 {
1224   struct ll_entry *head;
1225   inv_debug("INV: clean_blocks page=%d\n",page);
1226   head=jump_dirty[page];
1227   while(head!=NULL) {
1228     if(!invalid_code[head->vaddr>>12]) {
1229       // Don't restore blocks which are about to expire from the cache
1230       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1231         u_int start,end;
1232         if(verify_dirty((int)head->addr)) {
1233           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1234           u_int i;
1235           u_int inv=0;
1236           get_bounds((int)head->addr,&start,&end);
1237           if(start-(u_int)rdram<0x800000) {
1238             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1239               inv|=invalid_code[i];
1240             }
1241           }
1242           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1243             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1244             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1245             if(addr<start||addr>=end) inv=1;
1246           }
1247           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1248             inv=1;
1249           }
1250           if(!inv) {
1251             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1252             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1253               u_int ppage=page;
1254               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1255               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1256               //printf("page=%x, addr=%x\n",page,head->vaddr);
1257               //assert(head->vaddr>>12==(page|0x80000));
1258               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1259               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1260               if(!head->reg32) {
1261                 if(ht_bin[0]==head->vaddr) {
1262                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1263                 }
1264                 if(ht_bin[2]==head->vaddr) {
1265                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1266                 }
1267               }
1268             }
1269           }
1270         }
1271       }
1272     }
1273     head=head->next;
1274   }
1275 }
1276
1277
1278 void mov_alloc(struct regstat *current,int i)
1279 {
1280   // Note: Don't need to actually alloc the source registers
1281   if((~current->is32>>rs1[i])&1) {
1282     //alloc_reg64(current,i,rs1[i]);
1283     alloc_reg64(current,i,rt1[i]);
1284     current->is32&=~(1LL<<rt1[i]);
1285   } else {
1286     //alloc_reg(current,i,rs1[i]);
1287     alloc_reg(current,i,rt1[i]);
1288     current->is32|=(1LL<<rt1[i]);
1289   }
1290   clear_const(current,rs1[i]);
1291   clear_const(current,rt1[i]);
1292   dirty_reg(current,rt1[i]);
1293 }
1294
1295 void shiftimm_alloc(struct regstat *current,int i)
1296 {
1297   clear_const(current,rs1[i]);
1298   clear_const(current,rt1[i]);
1299   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1300   {
1301     if(rt1[i]) {
1302       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1303       else lt1[i]=rs1[i];
1304       alloc_reg(current,i,rt1[i]);
1305       current->is32|=1LL<<rt1[i];
1306       dirty_reg(current,rt1[i]);
1307     }
1308   }
1309   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1310   {
1311     if(rt1[i]) {
1312       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1313       alloc_reg64(current,i,rt1[i]);
1314       current->is32&=~(1LL<<rt1[i]);
1315       dirty_reg(current,rt1[i]);
1316     }
1317   }
1318   if(opcode2[i]==0x3c) // DSLL32
1319   {
1320     if(rt1[i]) {
1321       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1322       alloc_reg64(current,i,rt1[i]);
1323       current->is32&=~(1LL<<rt1[i]);
1324       dirty_reg(current,rt1[i]);
1325     }
1326   }
1327   if(opcode2[i]==0x3e) // DSRL32
1328   {
1329     if(rt1[i]) {
1330       alloc_reg64(current,i,rs1[i]);
1331       if(imm[i]==32) {
1332         alloc_reg64(current,i,rt1[i]);
1333         current->is32&=~(1LL<<rt1[i]);
1334       } else {
1335         alloc_reg(current,i,rt1[i]);
1336         current->is32|=1LL<<rt1[i];
1337       }
1338       dirty_reg(current,rt1[i]);
1339     }
1340   }
1341   if(opcode2[i]==0x3f) // DSRA32
1342   {
1343     if(rt1[i]) {
1344       alloc_reg64(current,i,rs1[i]);
1345       alloc_reg(current,i,rt1[i]);
1346       current->is32|=1LL<<rt1[i];
1347       dirty_reg(current,rt1[i]);
1348     }
1349   }
1350 }
1351
1352 void shift_alloc(struct regstat *current,int i)
1353 {
1354   if(rt1[i]) {
1355     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1356     {
1357       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1358       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1359       alloc_reg(current,i,rt1[i]);
1360       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1361       current->is32|=1LL<<rt1[i];
1362     } else { // DSLLV/DSRLV/DSRAV
1363       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1364       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1365       alloc_reg64(current,i,rt1[i]);
1366       current->is32&=~(1LL<<rt1[i]);
1367       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1368         alloc_reg_temp(current,i,-1);
1369     }
1370     clear_const(current,rs1[i]);
1371     clear_const(current,rs2[i]);
1372     clear_const(current,rt1[i]);
1373     dirty_reg(current,rt1[i]);
1374   }
1375 }
1376
1377 void alu_alloc(struct regstat *current,int i)
1378 {
1379   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1380     if(rt1[i]) {
1381       if(rs1[i]&&rs2[i]) {
1382         alloc_reg(current,i,rs1[i]);
1383         alloc_reg(current,i,rs2[i]);
1384       }
1385       else {
1386         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1387         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1388       }
1389       alloc_reg(current,i,rt1[i]);
1390     }
1391     current->is32|=1LL<<rt1[i];
1392   }
1393   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1394     if(rt1[i]) {
1395       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1396       {
1397         alloc_reg64(current,i,rs1[i]);
1398         alloc_reg64(current,i,rs2[i]);
1399         alloc_reg(current,i,rt1[i]);
1400       } else {
1401         alloc_reg(current,i,rs1[i]);
1402         alloc_reg(current,i,rs2[i]);
1403         alloc_reg(current,i,rt1[i]);
1404       }
1405     }
1406     current->is32|=1LL<<rt1[i];
1407   }
1408   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1409     if(rt1[i]) {
1410       if(rs1[i]&&rs2[i]) {
1411         alloc_reg(current,i,rs1[i]);
1412         alloc_reg(current,i,rs2[i]);
1413       }
1414       else
1415       {
1416         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1417         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1418       }
1419       alloc_reg(current,i,rt1[i]);
1420       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1421       {
1422         if(!((current->uu>>rt1[i])&1)) {
1423           alloc_reg64(current,i,rt1[i]);
1424         }
1425         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1426           if(rs1[i]&&rs2[i]) {
1427             alloc_reg64(current,i,rs1[i]);
1428             alloc_reg64(current,i,rs2[i]);
1429           }
1430           else
1431           {
1432             // Is it really worth it to keep 64-bit values in registers?
1433             #ifdef NATIVE_64BIT
1434             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1436             #endif
1437           }
1438         }
1439         current->is32&=~(1LL<<rt1[i]);
1440       } else {
1441         current->is32|=1LL<<rt1[i];
1442       }
1443     }
1444   }
1445   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1446     if(rt1[i]) {
1447       if(rs1[i]&&rs2[i]) {
1448         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1449           alloc_reg64(current,i,rs1[i]);
1450           alloc_reg64(current,i,rs2[i]);
1451           alloc_reg64(current,i,rt1[i]);
1452         } else {
1453           alloc_reg(current,i,rs1[i]);
1454           alloc_reg(current,i,rs2[i]);
1455           alloc_reg(current,i,rt1[i]);
1456         }
1457       }
1458       else {
1459         alloc_reg(current,i,rt1[i]);
1460         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1461           // DADD used as move, or zeroing
1462           // If we have a 64-bit source, then make the target 64 bits too
1463           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1464             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1465             alloc_reg64(current,i,rt1[i]);
1466           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1467             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1468             alloc_reg64(current,i,rt1[i]);
1469           }
1470           if(opcode2[i]>=0x2e&&rs2[i]) {
1471             // DSUB used as negation - 64-bit result
1472             // If we have a 32-bit register, extend it to 64 bits
1473             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1474             alloc_reg64(current,i,rt1[i]);
1475           }
1476         }
1477       }
1478       if(rs1[i]&&rs2[i]) {
1479         current->is32&=~(1LL<<rt1[i]);
1480       } else if(rs1[i]) {
1481         current->is32&=~(1LL<<rt1[i]);
1482         if((current->is32>>rs1[i])&1)
1483           current->is32|=1LL<<rt1[i];
1484       } else if(rs2[i]) {
1485         current->is32&=~(1LL<<rt1[i]);
1486         if((current->is32>>rs2[i])&1)
1487           current->is32|=1LL<<rt1[i];
1488       } else {
1489         current->is32|=1LL<<rt1[i];
1490       }
1491     }
1492   }
1493   clear_const(current,rs1[i]);
1494   clear_const(current,rs2[i]);
1495   clear_const(current,rt1[i]);
1496   dirty_reg(current,rt1[i]);
1497 }
1498
1499 void imm16_alloc(struct regstat *current,int i)
1500 {
1501   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1502   else lt1[i]=rs1[i];
1503   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1504   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1505     current->is32&=~(1LL<<rt1[i]);
1506     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1507       // TODO: Could preserve the 32-bit flag if the immediate is zero
1508       alloc_reg64(current,i,rt1[i]);
1509       alloc_reg64(current,i,rs1[i]);
1510     }
1511     clear_const(current,rs1[i]);
1512     clear_const(current,rt1[i]);
1513   }
1514   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1515     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1516     current->is32|=1LL<<rt1[i];
1517     clear_const(current,rs1[i]);
1518     clear_const(current,rt1[i]);
1519   }
1520   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1521     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1522       if(rs1[i]!=rt1[i]) {
1523         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1524         alloc_reg64(current,i,rt1[i]);
1525         current->is32&=~(1LL<<rt1[i]);
1526       }
1527     }
1528     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1529     if(is_const(current,rs1[i])) {
1530       int v=get_const(current,rs1[i]);
1531       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1532       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1533       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1534     }
1535     else clear_const(current,rt1[i]);
1536   }
1537   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1538     if(is_const(current,rs1[i])) {
1539       int v=get_const(current,rs1[i]);
1540       set_const(current,rt1[i],v+imm[i]);
1541     }
1542     else clear_const(current,rt1[i]);
1543     current->is32|=1LL<<rt1[i];
1544   }
1545   else {
1546     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1547     current->is32|=1LL<<rt1[i];
1548   }
1549   dirty_reg(current,rt1[i]);
1550 }
1551
1552 void load_alloc(struct regstat *current,int i)
1553 {
1554   clear_const(current,rt1[i]);
1555   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1556   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1557   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1558   if(rt1[i]) {
1559     alloc_reg(current,i,rt1[i]);
1560     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1561     {
1562       current->is32&=~(1LL<<rt1[i]);
1563       alloc_reg64(current,i,rt1[i]);
1564     }
1565     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1566     {
1567       current->is32&=~(1LL<<rt1[i]);
1568       alloc_reg64(current,i,rt1[i]);
1569       alloc_all(current,i);
1570       alloc_reg64(current,i,FTEMP);
1571     }
1572     else current->is32|=1LL<<rt1[i];
1573     dirty_reg(current,rt1[i]);
1574     // If using TLB, need a register for pointer to the mapping table
1575     if(using_tlb) alloc_reg(current,i,TLREG);
1576     // LWL/LWR need a temporary register for the old value
1577     if(opcode[i]==0x22||opcode[i]==0x26)
1578     {
1579       alloc_reg(current,i,FTEMP);
1580       alloc_reg_temp(current,i,-1);
1581     }
1582   }
1583   else
1584   {
1585     // Load to r0 (dummy load)
1586     // but we still need a register to calculate the address
1587     alloc_reg_temp(current,i,-1);
1588   }
1589 }
1590
1591 void store_alloc(struct regstat *current,int i)
1592 {
1593   clear_const(current,rs2[i]);
1594   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1595   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1596   alloc_reg(current,i,rs2[i]);
1597   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1598     alloc_reg64(current,i,rs2[i]);
1599     if(rs2[i]) alloc_reg(current,i,FTEMP);
1600   }
1601   // If using TLB, need a register for pointer to the mapping table
1602   if(using_tlb) alloc_reg(current,i,TLREG);
1603   #if defined(HOST_IMM8)
1604   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1605   else alloc_reg(current,i,INVCP);
1606   #endif
1607   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1608     alloc_reg(current,i,FTEMP);
1609   }
1610   // We need a temporary register for address generation
1611   alloc_reg_temp(current,i,-1);
1612 }
1613
1614 void c1ls_alloc(struct regstat *current,int i)
1615 {
1616   //clear_const(current,rs1[i]); // FIXME
1617   clear_const(current,rt1[i]);
1618   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1619   alloc_reg(current,i,CSREG); // Status
1620   alloc_reg(current,i,FTEMP);
1621   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1622     alloc_reg64(current,i,FTEMP);
1623   }
1624   // If using TLB, need a register for pointer to the mapping table
1625   if(using_tlb) alloc_reg(current,i,TLREG);
1626   #if defined(HOST_IMM8)
1627   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1628   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1629     alloc_reg(current,i,INVCP);
1630   #endif
1631   // We need a temporary register for address generation
1632   alloc_reg_temp(current,i,-1);
1633 }
1634
1635 #ifndef multdiv_alloc
1636 void multdiv_alloc(struct regstat *current,int i)
1637 {
1638   //  case 0x18: MULT
1639   //  case 0x19: MULTU
1640   //  case 0x1A: DIV
1641   //  case 0x1B: DIVU
1642   //  case 0x1C: DMULT
1643   //  case 0x1D: DMULTU
1644   //  case 0x1E: DDIV
1645   //  case 0x1F: DDIVU
1646   clear_const(current,rs1[i]);
1647   clear_const(current,rs2[i]);
1648   if(rs1[i]&&rs2[i])
1649   {
1650     if((opcode2[i]&4)==0) // 32-bit
1651     {
1652       current->u&=~(1LL<<HIREG);
1653       current->u&=~(1LL<<LOREG);
1654       alloc_reg(current,i,HIREG);
1655       alloc_reg(current,i,LOREG);
1656       alloc_reg(current,i,rs1[i]);
1657       alloc_reg(current,i,rs2[i]);
1658       current->is32|=1LL<<HIREG;
1659       current->is32|=1LL<<LOREG;
1660       dirty_reg(current,HIREG);
1661       dirty_reg(current,LOREG);
1662     }
1663     else // 64-bit
1664     {
1665       current->u&=~(1LL<<HIREG);
1666       current->u&=~(1LL<<LOREG);
1667       current->uu&=~(1LL<<HIREG);
1668       current->uu&=~(1LL<<LOREG);
1669       alloc_reg64(current,i,HIREG);
1670       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1671       alloc_reg64(current,i,rs1[i]);
1672       alloc_reg64(current,i,rs2[i]);
1673       alloc_all(current,i);
1674       current->is32&=~(1LL<<HIREG);
1675       current->is32&=~(1LL<<LOREG);
1676       dirty_reg(current,HIREG);
1677       dirty_reg(current,LOREG);
1678     }
1679   }
1680   else
1681   {
1682     // Multiply by zero is zero.
1683     // MIPS does not have a divide-by-zero exception;
1684     // the result is undefined, so we return zero.
1685     alloc_reg(current,i,HIREG);
1686     alloc_reg(current,i,LOREG);
1687     current->is32|=1LL<<HIREG;
1688     current->is32|=1LL<<LOREG;
1689     dirty_reg(current,HIREG);
1690     dirty_reg(current,LOREG);
1691   }
1692 }
1693 #endif
1694
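// Register allocation for COP0 moves and TLB/ERET instructions.  All of
// these go through alloc_all, apparently so that the out-of-line
// system-control handler sees a consistent register state.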
1695 void cop0_alloc(struct regstat *current,int i)
1696 {
1697   if(opcode2[i]==0) // MFC0
1698   {
1699     if(rt1[i]) {
1700       clear_const(current,rt1[i]);
1701       alloc_all(current,i);
1702       alloc_reg(current,i,rt1[i]);
1703       current->is32|=1LL<<rt1[i];
1704       dirty_reg(current,rt1[i]);
1705     }
1706   }
1707   else if(opcode2[i]==4) // MTC0
1708   {
1709     if(rs1[i]){
1710       clear_const(current,rs1[i]);
1711       alloc_reg(current,i,rs1[i]);
1712       alloc_all(current,i);
1713     }
1714     else {
1715       alloc_all(current,i); // FIXME: Keep r0
1716       current->u&=~1LL;
1717       alloc_reg(current,i,0);
1718     }
1719   }
1720   else
1721   {
1722     // TLBR/TLBWI/TLBWR/TLBP/ERET
1723     assert(opcode2[i]==0x10);
1724     alloc_all(current,i);
1725   }
1726 }
1727
1728 void cop1_alloc(struct regstat *current,int i)
1729 {
1730   alloc_reg(current,i,CSREG); // Load status
1731   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1732   {
1733     assert(rt1[i]);
1734     clear_const(current,rt1[i]);
1735     if(opcode2[i]==1) {
1736       alloc_reg64(current,i,rt1[i]); // DMFC1
1737       current->is32&=~(1LL<<rt1[i]);
1738     }else{
1739       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1740       current->is32|=1LL<<rt1[i];
1741     }
1742     dirty_reg(current,rt1[i]);
1743     alloc_reg_temp(current,i,-1);
1744   }
1745   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1746   {
1747     if(rs1[i]){
1748       clear_const(current,rs1[i]);
1749       if(opcode2[i]==5)
1750         alloc_reg64(current,i,rs1[i]); // DMTC1
1751       else
1752         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1753       alloc_reg_temp(current,i,-1);
1754     }
1755     else {
1756       current->u&=~1LL;
1757       alloc_reg(current,i,0);
1758       alloc_reg_temp(current,i,-1);
1759     }
1760   }
1761 }
1762 void fconv_alloc(struct regstat *current,int i)
1763 {
1764   alloc_reg(current,i,CSREG); // Load status
1765   alloc_reg_temp(current,i,-1);
1766 }
1767 void float_alloc(struct regstat *current,int i)
1768 {
1769   alloc_reg(current,i,CSREG); // Load status
1770   alloc_reg_temp(current,i,-1);
1771 }
1772 void fcomp_alloc(struct regstat *current,int i)
1773 {
1774   alloc_reg(current,i,CSREG); // Load status
1775   alloc_reg(current,i,FSREG); // Load flags
1776   dirty_reg(current,FSREG); // Flag will be modified
1777   alloc_reg_temp(current,i,-1);
1778 }
1779
1780 void syscall_alloc(struct regstat *current,int i)
1781 {
1782   alloc_cc(current,i);
1783   dirty_reg(current,CCREG);
1784   alloc_all(current,i);
1785   current->isconst=0;
1786 }
1787
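// Register allocation for the instruction sitting in a branch delay
// slot: dispatch to the allocator for its instruction type.  A jump in
// the delay slot is not supported and disables further speculative
// precompilation (stop_after_jal).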
1788 void delayslot_alloc(struct regstat *current,int i)
1789 {
1790   switch(itype[i]) {
1791     case UJUMP:
1792     case CJUMP:
1793     case SJUMP:
1794     case RJUMP:
1795     case FJUMP:
1796     case SYSCALL:
1797     case SPAN:
1798       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1799       printf("Disabled speculative precompilation\n");
1800       stop_after_jal=1;
1801       break;
1802     case IMM16:
1803       imm16_alloc(current,i);
1804       break;
1805     case LOAD:
1806     case LOADLR:
1807       load_alloc(current,i);
1808       break;
1809     case STORE:
1810     case STORELR:
1811       store_alloc(current,i);
1812       break;
1813     case ALU:
1814       alu_alloc(current,i);
1815       break;
1816     case SHIFT:
1817       shift_alloc(current,i);
1818       break;
1819     case MULTDIV:
1820       multdiv_alloc(current,i);
1821       break;
1822     case SHIFTIMM:
1823       shiftimm_alloc(current,i);
1824       break;
1825     case MOV:
1826       mov_alloc(current,i);
1827       break;
1828     case COP0:
1829       cop0_alloc(current,i);
1830       break;
1831     case COP1:
1832       cop1_alloc(current,i);
1833       break;
1834     case C1LS:
1835       c1ls_alloc(current,i);
1836       break;
1837     case FCONV:
1838       fconv_alloc(current,i);
1839       break;
1840     case FLOAT:
1841       float_alloc(current,i);
1842       break;
1843     case FCOMP:
1844       fcomp_alloc(current,i);
1845       break;
1846   }
1847 }
1848
1849 // Special case where a branch and delay slot span two pages in virtual memory
1850 static void pagespan_alloc(struct regstat *current,int i)
1851 {
1852   current->isconst=0;
1853   current->wasconst=0;
1854   regs[i].wasconst=0;
1855   alloc_all(current,i);
1856   alloc_cc(current,i);
1857   dirty_reg(current,CCREG);
1858   if(opcode[i]==3) // JAL
1859   {
1860     alloc_reg(current,i,31);
1861     dirty_reg(current,31);
1862   }
1863   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1864   {
1865     alloc_reg(current,i,rs1[i]);
1866     if (rt1[i]==31) {
1867       alloc_reg(current,i,31);
1868       dirty_reg(current,31);
1869     }
1870   }
1871   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1872   {
1873     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1874     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1875     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1876     {
1877       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1878       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1879     }
1880   }
1881   else
1882   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1883   {
1884     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1885     if(!((current->is32>>rs1[i])&1))
1886     {
1887       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1888     }
1889   }
1890   else
1891   if(opcode[i]==0x11) // BC1
1892   {
1893     alloc_reg(current,i,FSREG);
1894     alloc_reg(current,i,CSREG);
1895   }
1896   //else ...
1897 }
1898
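// Queue a fix-up ("stub") to be resolved after the main block has been
// assembled: type selects the stub kind (LOADB_STUB, STORELR_STUB, ...),
// addr is the location of the branch to patch, retaddr is where the
// stub returns to, and a..e carry stub-specific arguments such as the
// instruction index, address register, regstat pointer, cycle count
// adjustment and live register list.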
1899 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1900 {
1901   stubs[stubcount][0]=type;
1902   stubs[stubcount][1]=addr;
1903   stubs[stubcount][2]=retaddr;
1904   stubs[stubcount][3]=a;
1905   stubs[stubcount][4]=b;
1906   stubs[stubcount][5]=c;
1907   stubs[stubcount][6]=d;
1908   stubs[stubcount][7]=e;
1909   stubcount++;
1910 }
1911
1912 // Write out a single register
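// (A regmap entry of r|64 denotes the upper 32 bits of 64-bit guest
// register r; when the value is flagged as 32-bit in is32, the upper
// half is reconstructed by sign-extending the lower half instead.)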
1913 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1914 {
1915   int hr;
1916   for(hr=0;hr<HOST_REGS;hr++) {
1917     if(hr!=EXCLUDE_REG) {
1918       if((regmap[hr]&63)==r) {
1919         if((dirty>>hr)&1) {
1920           if(regmap[hr]<64) {
1921             emit_storereg(r,hr);
1922             if((is32>>regmap[hr])&1) {
1923               emit_sarimm(hr,31,hr);
1924               emit_storereg(r|64,hr);
1925             }
1926           }else{
1927             emit_storereg(r|64,hr);
1928           }
1929         }
1930       }
1931     }
1932   }
1933 }
1934
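// Debug helpers: checksums over RDRAM, the GPR file and the FPR file,
// plus a register dump, used by the (normally disabled) memdebug trace
// below.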
1935 int mchecksum()
1936 {
1937   //if(!tracedebug) return 0;
1938   int i;
1939   int sum=0;
1940   for(i=0;i<2097152;i++) {
1941     unsigned int temp=sum;
1942     sum<<=1;
1943     sum|=(~temp)>>31;
1944     sum^=((u_int *)rdram)[i];
1945   }
1946   return sum;
1947 }
1948 int rchecksum()
1949 {
1950   int i;
1951   int sum=0;
1952   for(i=0;i<64;i++)
1953     sum^=((u_int *)reg)[i];
1954   return sum;
1955 }
1956 int fchecksum()
1957 {
1958   int i;
1959   int sum=0;
1960   for(i=0;i<64;i++)
1961     sum^=((u_int *)reg_cop1_fgr_64)[i];
1962   return sum;
1963 }
1964 void rlist()
1965 {
1966   int i;
1967   printf("TRACE: ");
1968   for(i=0;i<32;i++)
1969     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1970   printf("\n");
1971   printf("TRACE: ");
1972   for(i=0;i<32;i++)
1973     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1974   printf("\n");
1975 }
1976
1977 void enabletrace()
1978 {
1979   tracedebug=1;
1980 }
1981
1982 void memdebug(int i)
1983 {
1984   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1985   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1986   //rlist();
1987   //if(tracedebug) {
1988   //if(Count>=-2084597794) {
1989   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1990   //if(0) {
1991     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1992     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1993     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1994     rlist();
1995     #ifdef __i386__
1996     printf("TRACE: %x\n",(&i)[-1]);
1997     #endif
1998     #ifdef __arm__
1999     int j;
2000     printf("TRACE: %x \n",(&j)[10]);
2001     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2002     #endif
2003     //fflush(stdout);
2004   }
2005   //printf("TRACE: %x\n",(&i)[-1]);
2006 }
2007
2008 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2009 {
2010   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2011 }
2012
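// Emit native code for register-register ALU ops.  32-bit ADD/SUB map
// directly; the 64-bit DADD/DSUB forms work on lo/hi register pairs and
// propagate the carry/borrow into the upper half; SLT/SLTU and the
// logical ops choose 32- or 64-bit sequences depending on whether the
// sources were known to be 32-bit (was32).  A source of r0 is treated
// as the constant zero.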
2013 void alu_assemble(int i,struct regstat *i_regs)
2014 {
2015   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2016     if(rt1[i]) {
2017       signed char s1,s2,t;
2018       t=get_reg(i_regs->regmap,rt1[i]);
2019       if(t>=0) {
2020         s1=get_reg(i_regs->regmap,rs1[i]);
2021         s2=get_reg(i_regs->regmap,rs2[i]);
2022         if(rs1[i]&&rs2[i]) {
2023           assert(s1>=0);
2024           assert(s2>=0);
2025           if(opcode2[i]&2) emit_sub(s1,s2,t);
2026           else emit_add(s1,s2,t);
2027         }
2028         else if(rs1[i]) {
2029           if(s1>=0) emit_mov(s1,t);
2030           else emit_loadreg(rs1[i],t);
2031         }
2032         else if(rs2[i]) {
2033           if(s2>=0) {
2034             if(opcode2[i]&2) emit_neg(s2,t);
2035             else emit_mov(s2,t);
2036           }
2037           else {
2038             emit_loadreg(rs2[i],t);
2039             if(opcode2[i]&2) emit_neg(t,t);
2040           }
2041         }
2042         else emit_zeroreg(t);
2043       }
2044     }
2045   }
2046   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2047     if(rt1[i]) {
2048       signed char s1l,s2l,s1h,s2h,tl,th;
2049       tl=get_reg(i_regs->regmap,rt1[i]);
2050       th=get_reg(i_regs->regmap,rt1[i]|64);
2051       if(tl>=0) {
2052         s1l=get_reg(i_regs->regmap,rs1[i]);
2053         s2l=get_reg(i_regs->regmap,rs2[i]);
2054         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2055         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2056         if(rs1[i]&&rs2[i]) {
2057           assert(s1l>=0);
2058           assert(s2l>=0);
2059           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2060           else emit_adds(s1l,s2l,tl);
2061           if(th>=0) {
2062             #ifdef INVERTED_CARRY
2063             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2064             #else
2065             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2066             #endif
2067             else emit_add(s1h,s2h,th);
2068           }
2069         }
2070         else if(rs1[i]) {
2071           if(s1l>=0) emit_mov(s1l,tl);
2072           else emit_loadreg(rs1[i],tl);
2073           if(th>=0) {
2074             if(s1h>=0) emit_mov(s1h,th);
2075             else emit_loadreg(rs1[i]|64,th);
2076           }
2077         }
2078         else if(rs2[i]) {
2079           if(s2l>=0) {
2080             if(opcode2[i]&2) emit_negs(s2l,tl);
2081             else emit_mov(s2l,tl);
2082           }
2083           else {
2084             emit_loadreg(rs2[i],tl);
2085             if(opcode2[i]&2) emit_negs(tl,tl);
2086           }
2087           if(th>=0) {
2088             #ifdef INVERTED_CARRY
2089             if(s2h>=0) emit_mov(s2h,th);
2090             else emit_loadreg(rs2[i]|64,th);
2091             if(opcode2[i]&2) {
2092               emit_adcimm(-1,th); // x86 has inverted carry flag
2093               emit_not(th,th);
2094             }
2095             #else
2096             if(opcode2[i]&2) {
2097               if(s2h>=0) emit_rscimm(s2h,0,th);
2098               else {
2099                 emit_loadreg(rs2[i]|64,th);
2100                 emit_rscimm(th,0,th);
2101               }
2102             }else{
2103               if(s2h>=0) emit_mov(s2h,th);
2104               else emit_loadreg(rs2[i]|64,th);
2105             }
2106             #endif
2107           }
2108         }
2109         else {
2110           emit_zeroreg(tl);
2111           if(th>=0) emit_zeroreg(th);
2112         }
2113       }
2114     }
2115   }
2116   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2117     if(rt1[i]) {
2118       signed char s1l,s1h,s2l,s2h,t;
2119       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2120       {
2121         t=get_reg(i_regs->regmap,rt1[i]);
2122         //assert(t>=0);
2123         if(t>=0) {
2124           s1l=get_reg(i_regs->regmap,rs1[i]);
2125           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2126           s2l=get_reg(i_regs->regmap,rs2[i]);
2127           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2128           if(rs2[i]==0) // rx<r0
2129           {
2130             assert(s1h>=0);
2131             if(opcode2[i]==0x2a) // SLT
2132               emit_shrimm(s1h,31,t);
2133             else // SLTU (unsigned cannot be less than zero)
2134               emit_zeroreg(t);
2135           }
2136           else if(rs1[i]==0) // r0<rx
2137           {
2138             assert(s2h>=0);
2139             if(opcode2[i]==0x2a) // SLT
2140               emit_set_gz64_32(s2h,s2l,t);
2141             else // SLTU (set if not zero)
2142               emit_set_nz64_32(s2h,s2l,t);
2143           }
2144           else {
2145             assert(s1l>=0);assert(s1h>=0);
2146             assert(s2l>=0);assert(s2h>=0);
2147             if(opcode2[i]==0x2a) // SLT
2148               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2149             else // SLTU
2150               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2151           }
2152         }
2153       } else {
2154         t=get_reg(i_regs->regmap,rt1[i]);
2155         //assert(t>=0);
2156         if(t>=0) {
2157           s1l=get_reg(i_regs->regmap,rs1[i]);
2158           s2l=get_reg(i_regs->regmap,rs2[i]);
2159           if(rs2[i]==0) // rx<r0
2160           {
2161             assert(s1l>=0);
2162             if(opcode2[i]==0x2a) // SLT
2163               emit_shrimm(s1l,31,t);
2164             else // SLTU (unsigned cannot be less than zero)
2165               emit_zeroreg(t);
2166           }
2167           else if(rs1[i]==0) // r0<rx
2168           {
2169             assert(s2l>=0);
2170             if(opcode2[i]==0x2a) // SLT
2171               emit_set_gz32(s2l,t);
2172             else // SLTU (set if not zero)
2173               emit_set_nz32(s2l,t);
2174           }
2175           else{
2176             assert(s1l>=0);assert(s2l>=0);
2177             if(opcode2[i]==0x2a) // SLT
2178               emit_set_if_less32(s1l,s2l,t);
2179             else // SLTU
2180               emit_set_if_carry32(s1l,s2l,t);
2181           }
2182         }
2183       }
2184     }
2185   }
2186   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2187     if(rt1[i]) {
2188       signed char s1l,s1h,s2l,s2h,th,tl;
2189       tl=get_reg(i_regs->regmap,rt1[i]);
2190       th=get_reg(i_regs->regmap,rt1[i]|64);
2191       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2192       {
2193         assert(tl>=0);
2194         if(tl>=0) {
2195           s1l=get_reg(i_regs->regmap,rs1[i]);
2196           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2197           s2l=get_reg(i_regs->regmap,rs2[i]);
2198           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2199           if(rs1[i]&&rs2[i]) {
2200             assert(s1l>=0);assert(s1h>=0);
2201             assert(s2l>=0);assert(s2h>=0);
2202             if(opcode2[i]==0x24) { // AND
2203               emit_and(s1l,s2l,tl);
2204               emit_and(s1h,s2h,th);
2205             } else
2206             if(opcode2[i]==0x25) { // OR
2207               emit_or(s1l,s2l,tl);
2208               emit_or(s1h,s2h,th);
2209             } else
2210             if(opcode2[i]==0x26) { // XOR
2211               emit_xor(s1l,s2l,tl);
2212               emit_xor(s1h,s2h,th);
2213             } else
2214             if(opcode2[i]==0x27) { // NOR
2215               emit_or(s1l,s2l,tl);
2216               emit_or(s1h,s2h,th);
2217               emit_not(tl,tl);
2218               emit_not(th,th);
2219             }
2220           }
2221           else
2222           {
2223             if(opcode2[i]==0x24) { // AND
2224               emit_zeroreg(tl);
2225               emit_zeroreg(th);
2226             } else
2227             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2228               if(rs1[i]){
2229                 if(s1l>=0) emit_mov(s1l,tl);
2230                 else emit_loadreg(rs1[i],tl);
2231                 if(s1h>=0) emit_mov(s1h,th);
2232                 else emit_loadreg(rs1[i]|64,th);
2233               }
2234               else
2235               if(rs2[i]){
2236                 if(s2l>=0) emit_mov(s2l,tl);
2237                 else emit_loadreg(rs2[i],tl);
2238                 if(s2h>=0) emit_mov(s2h,th);
2239                 else emit_loadreg(rs2[i]|64,th);
2240               }
2241               else{
2242                 emit_zeroreg(tl);
2243                 emit_zeroreg(th);
2244               }
2245             } else
2246             if(opcode2[i]==0x27) { // NOR
2247               if(rs1[i]){
2248                 if(s1l>=0) emit_not(s1l,tl);
2249                 else{
2250                   emit_loadreg(rs1[i],tl);
2251                   emit_not(tl,tl);
2252                 }
2253                 if(s1h>=0) emit_not(s1h,th);
2254                 else{
2255                   emit_loadreg(rs1[i]|64,th);
2256                   emit_not(th,th);
2257                 }
2258               }
2259               else
2260               if(rs2[i]){
2261                 if(s2l>=0) emit_not(s2l,tl);
2262                 else{
2263                   emit_loadreg(rs2[i],tl);
2264                   emit_not(tl,tl);
2265                 }
2266                 if(s2h>=0) emit_not(s2h,th);
2267                 else{
2268                   emit_loadreg(rs2[i]|64,th);
2269                   emit_not(th,th);
2270                 }
2271               }
2272               else {
2273                 emit_movimm(-1,tl);
2274                 emit_movimm(-1,th);
2275               }
2276             }
2277           }
2278         }
2279       }
2280       else
2281       {
2282         // 32 bit
2283         if(tl>=0) {
2284           s1l=get_reg(i_regs->regmap,rs1[i]);
2285           s2l=get_reg(i_regs->regmap,rs2[i]);
2286           if(rs1[i]&&rs2[i]) {
2287             assert(s1l>=0);
2288             assert(s2l>=0);
2289             if(opcode2[i]==0x24) { // AND
2290               emit_and(s1l,s2l,tl);
2291             } else
2292             if(opcode2[i]==0x25) { // OR
2293               emit_or(s1l,s2l,tl);
2294             } else
2295             if(opcode2[i]==0x26) { // XOR
2296               emit_xor(s1l,s2l,tl);
2297             } else
2298             if(opcode2[i]==0x27) { // NOR
2299               emit_or(s1l,s2l,tl);
2300               emit_not(tl,tl);
2301             }
2302           }
2303           else
2304           {
2305             if(opcode2[i]==0x24) { // AND
2306               emit_zeroreg(tl);
2307             } else
2308             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2309               if(rs1[i]){
2310                 if(s1l>=0) emit_mov(s1l,tl);
2311                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2312               }
2313               else
2314               if(rs2[i]){
2315                 if(s2l>=0) emit_mov(s2l,tl);
2316                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2317               }
2318               else emit_zeroreg(tl);
2319             } else
2320             if(opcode2[i]==0x27) { // NOR
2321               if(rs1[i]){
2322                 if(s1l>=0) emit_not(s1l,tl);
2323                 else {
2324                   emit_loadreg(rs1[i],tl);
2325                   emit_not(tl,tl);
2326                 }
2327               }
2328               else
2329               if(rs2[i]){
2330                 if(s2l>=0) emit_not(s2l,tl);
2331                 else {
2332                   emit_loadreg(rs2[i],tl);
2333                   emit_not(tl,tl);
2334                 }
2335               }
2336               else emit_movimm(-1,tl);
2337             }
2338           }
2339         }
2340       }
2341     }
2342   }
2343 }
2344
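// Emit code for immediate-operand instructions (LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI).  If the source register held
// a known constant (wasconst), the result is folded into a single
// move-immediate using constmap.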
2345 void imm16_assemble(int i,struct regstat *i_regs)
2346 {
2347   if (opcode[i]==0x0f) { // LUI
2348     if(rt1[i]) {
2349       signed char t;
2350       t=get_reg(i_regs->regmap,rt1[i]);
2351       //assert(t>=0);
2352       if(t>=0) {
2353         if(!((i_regs->isconst>>t)&1))
2354           emit_movimm(imm[i]<<16,t);
2355       }
2356     }
2357   }
2358   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2359     if(rt1[i]) {
2360       signed char s,t;
2361       t=get_reg(i_regs->regmap,rt1[i]);
2362       s=get_reg(i_regs->regmap,rs1[i]);
2363       if(rs1[i]) {
2364         //assert(t>=0);
2365         //assert(s>=0);
2366         if(t>=0) {
2367           if(!((i_regs->isconst>>t)&1)) {
2368             if(s<0) {
2369               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2370               emit_addimm(t,imm[i],t);
2371             }else{
2372               if(!((i_regs->wasconst>>s)&1))
2373                 emit_addimm(s,imm[i],t);
2374               else
2375                 emit_movimm(constmap[i][s]+imm[i],t);
2376             }
2377           }
2378         }
2379       } else {
2380         if(t>=0) {
2381           if(!((i_regs->isconst>>t)&1))
2382             emit_movimm(imm[i],t);
2383         }
2384       }
2385     }
2386   }
2387   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2388     if(rt1[i]) {
2389       signed char sh,sl,th,tl;
2390       th=get_reg(i_regs->regmap,rt1[i]|64);
2391       tl=get_reg(i_regs->regmap,rt1[i]);
2392       sh=get_reg(i_regs->regmap,rs1[i]|64);
2393       sl=get_reg(i_regs->regmap,rs1[i]);
2394       if(tl>=0) {
2395         if(rs1[i]) {
2396           assert(sh>=0);
2397           assert(sl>=0);
2398           if(th>=0) {
2399             emit_addimm64_32(sh,sl,imm[i],th,tl);
2400           }
2401           else {
2402             emit_addimm(sl,imm[i],tl);
2403           }
2404         } else {
2405           emit_movimm(imm[i],tl);
2406           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2407         }
2408       }
2409     }
2410   }
2411   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2412     if(rt1[i]) {
2413       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2414       signed char sh,sl,t;
2415       t=get_reg(i_regs->regmap,rt1[i]);
2416       sh=get_reg(i_regs->regmap,rs1[i]|64);
2417       sl=get_reg(i_regs->regmap,rs1[i]);
2418       //assert(t>=0);
2419       if(t>=0) {
2420         if(rs1[i]>0) {
2421           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2422           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2423             if(opcode[i]==0x0a) { // SLTI
2424               if(sl<0) {
2425                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2426                 emit_slti32(t,imm[i],t);
2427               }else{
2428                 emit_slti32(sl,imm[i],t);
2429               }
2430             }
2431             else { // SLTIU
2432               if(sl<0) {
2433                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2434                 emit_sltiu32(t,imm[i],t);
2435               }else{
2436                 emit_sltiu32(sl,imm[i],t);
2437               }
2438             }
2439           }else{ // 64-bit
2440             assert(sl>=0);
2441             if(opcode[i]==0x0a) // SLTI
2442               emit_slti64_32(sh,sl,imm[i],t);
2443             else // SLTIU
2444               emit_sltiu64_32(sh,sl,imm[i],t);
2445           }
2446         }else{
2447           // SLTI(U) with r0 is just stupid,
2448           // but examples can nonetheless be found
2449           if(opcode[i]==0x0a) { // SLTI
2450             if(imm[i]>0) emit_movimm(1,t);
2451             else emit_zeroreg(t);
2452           } else // SLTIU
2453           {
2454             if(imm[i]) emit_movimm(1,t);
2455             else emit_zeroreg(t);
2456           }
2457         }
2458       }
2459     }
2460   }
2461   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2462     if(rt1[i]) {
2463       signed char sh,sl,th,tl;
2464       th=get_reg(i_regs->regmap,rt1[i]|64);
2465       tl=get_reg(i_regs->regmap,rt1[i]);
2466       sh=get_reg(i_regs->regmap,rs1[i]|64);
2467       sl=get_reg(i_regs->regmap,rs1[i]);
2468       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2469         if(opcode[i]==0x0c) //ANDI
2470         {
2471           if(rs1[i]) {
2472             if(sl<0) {
2473               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2474               emit_andimm(tl,imm[i],tl);
2475             }else{
2476               if(!((i_regs->wasconst>>sl)&1))
2477                 emit_andimm(sl,imm[i],tl);
2478               else
2479                 emit_movimm(constmap[i][sl]&imm[i],tl);
2480             }
2481           }
2482           else
2483             emit_zeroreg(tl);
2484           if(th>=0) emit_zeroreg(th);
2485         }
2486         else
2487         {
2488           if(rs1[i]) {
2489             if(sl<0) {
2490               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2491             }
2492             if(th>=0) {
2493               if(sh<0) {
2494                 emit_loadreg(rs1[i]|64,th);
2495               }else{
2496                 emit_mov(sh,th);
2497               }
2498             }
2499             if(opcode[i]==0x0d) //ORI
2500             if(sl<0) {
2501               emit_orimm(tl,imm[i],tl);
2502             }else{
2503               if(!((i_regs->wasconst>>sl)&1))
2504                 emit_orimm(sl,imm[i],tl);
2505               else
2506                 emit_movimm(constmap[i][sl]|imm[i],tl);
2507             }
2508             if(opcode[i]==0x0e) //XORI
2509             if(sl<0) {
2510               emit_xorimm(tl,imm[i],tl);
2511             }else{
2512               if(!((i_regs->wasconst>>sl)&1))
2513                 emit_xorimm(sl,imm[i],tl);
2514               else
2515                 emit_movimm(constmap[i][sl]^imm[i],tl);
2516             }
2517           }
2518           else {
2519             emit_movimm(imm[i],tl);
2520             if(th>=0) emit_zeroreg(th);
2521           }
2522         }
2523       }
2524     }
2525   }
2526 }
2527
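// Emit code for shift-by-immediate instructions.  The doubleword shifts
// (DSLL/DSRL/DSRA and their *32 variants) operate on the lo/hi register
// pair; a shift amount of zero degenerates to a plain register move.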
2528 void shiftimm_assemble(int i,struct regstat *i_regs)
2529 {
2530   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2531   {
2532     if(rt1[i]) {
2533       signed char s,t;
2534       t=get_reg(i_regs->regmap,rt1[i]);
2535       s=get_reg(i_regs->regmap,rs1[i]);
2536       //assert(t>=0);
2537       if(t>=0){
2538         if(rs1[i]==0)
2539         {
2540           emit_zeroreg(t);
2541         }
2542         else
2543         {
2544           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2545           if(imm[i]) {
2546             if(opcode2[i]==0) // SLL
2547             {
2548               emit_shlimm(s<0?t:s,imm[i],t);
2549             }
2550             if(opcode2[i]==2) // SRL
2551             {
2552               emit_shrimm(s<0?t:s,imm[i],t);
2553             }
2554             if(opcode2[i]==3) // SRA
2555             {
2556               emit_sarimm(s<0?t:s,imm[i],t);
2557             }
2558           }else{
2559             // Shift by zero
2560             if(s>=0 && s!=t) emit_mov(s,t);
2561           }
2562         }
2563       }
2564       //emit_storereg(rt1[i],t); //DEBUG
2565     }
2566   }
2567   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2568   {
2569     if(rt1[i]) {
2570       signed char sh,sl,th,tl;
2571       th=get_reg(i_regs->regmap,rt1[i]|64);
2572       tl=get_reg(i_regs->regmap,rt1[i]);
2573       sh=get_reg(i_regs->regmap,rs1[i]|64);
2574       sl=get_reg(i_regs->regmap,rs1[i]);
2575       if(tl>=0) {
2576         if(rs1[i]==0)
2577         {
2578           emit_zeroreg(tl);
2579           if(th>=0) emit_zeroreg(th);
2580         }
2581         else
2582         {
2583           assert(sl>=0);
2584           assert(sh>=0);
2585           if(imm[i]) {
2586             if(opcode2[i]==0x38) // DSLL
2587             {
2588               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2589               emit_shlimm(sl,imm[i],tl);
2590             }
2591             if(opcode2[i]==0x3a) // DSRL
2592             {
2593               emit_shrdimm(sl,sh,imm[i],tl);
2594               if(th>=0) emit_shrimm(sh,imm[i],th);
2595             }
2596             if(opcode2[i]==0x3b) // DSRA
2597             {
2598               emit_shrdimm(sl,sh,imm[i],tl);
2599               if(th>=0) emit_sarimm(sh,imm[i],th);
2600             }
2601           }else{
2602             // Shift by zero
2603             if(sl!=tl) emit_mov(sl,tl);
2604             if(th>=0&&sh!=th) emit_mov(sh,th);
2605           }
2606         }
2607       }
2608     }
2609   }
2610   if(opcode2[i]==0x3c) // DSLL32
2611   {
2612     if(rt1[i]) {
2613       signed char sl,tl,th;
2614       tl=get_reg(i_regs->regmap,rt1[i]);
2615       th=get_reg(i_regs->regmap,rt1[i]|64);
2616       sl=get_reg(i_regs->regmap,rs1[i]);
2617       if(th>=0||tl>=0){
2618         assert(tl>=0);
2619         assert(th>=0);
2620         assert(sl>=0);
2621         emit_mov(sl,th);
2622         emit_zeroreg(tl);
2623         if(imm[i]>32)
2624         {
2625           emit_shlimm(th,imm[i]&31,th);
2626         }
2627       }
2628     }
2629   }
2630   if(opcode2[i]==0x3e) // DSRL32
2631   {
2632     if(rt1[i]) {
2633       signed char sh,tl,th;
2634       tl=get_reg(i_regs->regmap,rt1[i]);
2635       th=get_reg(i_regs->regmap,rt1[i]|64);
2636       sh=get_reg(i_regs->regmap,rs1[i]|64);
2637       if(tl>=0){
2638         assert(sh>=0);
2639         emit_mov(sh,tl);
2640         if(th>=0) emit_zeroreg(th);
2641         if(imm[i]>32)
2642         {
2643           emit_shrimm(tl,imm[i]&31,tl);
2644         }
2645       }
2646     }
2647   }
2648   if(opcode2[i]==0x3f) // DSRA32
2649   {
2650     if(rt1[i]) {
2651       signed char sh,tl;
2652       tl=get_reg(i_regs->regmap,rt1[i]);
2653       sh=get_reg(i_regs->regmap,rs1[i]|64);
2654       if(tl>=0){
2655         assert(sh>=0);
2656         emit_mov(sh,tl);
2657         if(imm[i]>32)
2658         {
2659           emit_sarimm(tl,imm[i]&31,tl);
2660         }
2661       }
2662     }
2663   }
2664 }
2665
2666 #ifndef shift_assemble
2667 void shift_assemble(int i,struct regstat *i_regs)
2668 {
2669   printf("Need shift_assemble for this architecture.\n");
2670   exit(1);
2671 }
2672 #endif
2673
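// Emit code for loads.  Without the TLB, the generated code checks that
// the address falls within RDRAM (constant addresses are resolved at
// compile time via c/memtarget, limit 0x80800000) and branches to an
// out-of-line stub otherwise; with the TLB, the address is translated
// through the mapping table in TLREG.  Byte and halfword accesses XOR
// the low address bits, apparently because RDRAM is kept word-swapped
// (native-endian 32-bit words) on the host.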
2674 void load_assemble(int i,struct regstat *i_regs)
2675 {
2676   int s,th,tl,addr,map=-1;
2677   int offset;
2678   int jaddr=0;
2679   int memtarget,c=0;
2680   u_int hr,reglist=0;
2681   th=get_reg(i_regs->regmap,rt1[i]|64);
2682   tl=get_reg(i_regs->regmap,rt1[i]);
2683   s=get_reg(i_regs->regmap,rs1[i]);
2684   offset=imm[i];
2685   for(hr=0;hr<HOST_REGS;hr++) {
2686     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2687   }
2688   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2689   if(s>=0) {
2690     c=(i_regs->wasconst>>s)&1;
2691     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2692     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2693   }
2694   if(offset||s<0||c) addr=tl;
2695   else addr=s;
2696   //printf("load_assemble: c=%d\n",c);
2697   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2698   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2699   if(tl>=0) {
2700     //assert(tl>=0);
2701     //assert(rt1[i]);
2702     reglist&=~(1<<tl);
2703     if(th>=0) reglist&=~(1<<th);
2704     if(!using_tlb) {
2705       if(!c) {
2706 //#define R29_HACK 1
2707         #ifdef R29_HACK
2708         // Strmnnrmn's speed hack
2709         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2710         #endif
2711         {
2712           emit_cmpimm(addr,0x800000);
2713           jaddr=(int)out;
2714           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2715           // Hint to branch predictor that the branch is unlikely to be taken
2716           if(rs1[i]>=28)
2717             emit_jno_unlikely(0);
2718           else
2719           #endif
2720           emit_jno(0);
2721         }
2722       }
2723     }else{ // using tlb
2724       int x=0;
2725       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2726       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2727       map=get_reg(i_regs->regmap,TLREG);
2728       assert(map>=0);
2729       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2730       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2731     }
2732     if (opcode[i]==0x20) { // LB
2733       if(!c||memtarget) {
2734         #ifdef HOST_IMM_ADDR32
2735         if(c)
2736           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2737         else
2738         #endif
2739         {
2740           //emit_xorimm(addr,3,tl);
2741           //gen_tlb_addr_r(tl,map);
2742           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2743           int x=0;
2744           if(!c) emit_xorimm(addr,3,tl);
2745           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2746           emit_movsbl_indexed_tlb(x,tl,map,tl);
2747         }
2748         if(jaddr)
2749           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2750       }
2751       else
2752         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2753     }
2754     if (opcode[i]==0x21) { // LH
2755       if(!c||memtarget) {
2756         #ifdef HOST_IMM_ADDR32
2757         if(c)
2758           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2759         else
2760         #endif
2761         {
2762           int x=0;
2763           if(!c) emit_xorimm(addr,2,tl);
2764           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2765           //#ifdef
2766           //emit_movswl_indexed_tlb(x,tl,map,tl);
2767           //else
2768           if(map>=0) {
2769             gen_tlb_addr_r(tl,map);
2770             emit_movswl_indexed(x,tl,tl);
2771           }else
2772             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2773         }
2774         if(jaddr)
2775           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2776       }
2777       else
2778         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2779     }
2780     if (opcode[i]==0x23) { // LW
2781       if(!c||memtarget) {
2782         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2783         #ifdef HOST_IMM_ADDR32
2784         if(c)
2785           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2786         else
2787         #endif
2788         emit_readword_indexed_tlb(0,addr,map,tl);
2789         if(jaddr)
2790           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2791       }
2792       else
2793         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2794     }
2795     if (opcode[i]==0x24) { // LBU
2796       if(!c||memtarget) {
2797         #ifdef HOST_IMM_ADDR32
2798         if(c)
2799           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2800         else
2801         #endif
2802         {
2803           //emit_xorimm(addr,3,tl);
2804           //gen_tlb_addr_r(tl,map);
2805           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2806           int x=0;
2807           if(!c) emit_xorimm(addr,3,tl);
2808           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2809           emit_movzbl_indexed_tlb(x,tl,map,tl);
2810         }
2811         if(jaddr)
2812           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2813       }
2814       else
2815         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2816     }
2817     if (opcode[i]==0x25) { // LHU
2818       if(!c||memtarget) {
2819         #ifdef HOST_IMM_ADDR32
2820         if(c)
2821           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2822         else
2823         #endif
2824         {
2825           int x=0;
2826           if(!c) emit_xorimm(addr,2,tl);
2827           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2828           //#ifdef
2829           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2830           //#else
2831           if(map>=0) {
2832             gen_tlb_addr_r(tl,map);
2833             emit_movzwl_indexed(x,tl,tl);
2834           }else
2835             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2836         }
2837         if(jaddr)
2838           add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2839       }
2840       else
2841         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2842     }
2843     if (opcode[i]==0x27) { // LWU
2844       assert(th>=0);
2845       if(!c||memtarget) {
2846         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2847         #ifdef HOST_IMM_ADDR32
2848         if(c)
2849           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2850         else
2851         #endif
2852         emit_readword_indexed_tlb(0,addr,map,tl);
2853         if(jaddr)
2854           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2855       }
2856       else {
2857         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2858       }
2859       emit_zeroreg(th);
2860     }
2861     if (opcode[i]==0x37) { // LD
2862       if(!c||memtarget) {
2863         //gen_tlb_addr_r(tl,map);
2864         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2865         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2866         #ifdef HOST_IMM_ADDR32
2867         if(c)
2868           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2869         else
2870         #endif
2871         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2872         if(jaddr)
2873           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2874       }
2875       else
2876         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2877     }
2878     //emit_storereg(rt1[i],tl); // DEBUG
2879   }
2880   //if(opcode[i]==0x23)
2881   //if(opcode[i]==0x24)
2882   //if(opcode[i]==0x23||opcode[i]==0x24)
2883   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2884   {
2885     //emit_pusha();
2886     save_regs(0x100f);
2887         emit_readword((int)&last_count,ECX);
2888         #ifdef __i386__
2889         if(get_reg(i_regs->regmap,CCREG)<0)
2890           emit_loadreg(CCREG,HOST_CCREG);
2891         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2892         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2893         emit_writeword(HOST_CCREG,(int)&Count);
2894         #endif
2895         #ifdef __arm__
2896         if(get_reg(i_regs->regmap,CCREG)<0)
2897           emit_loadreg(CCREG,0);
2898         else
2899           emit_mov(HOST_CCREG,0);
2900         emit_add(0,ECX,0);
2901         emit_addimm(0,2*ccadj[i],0);
2902         emit_writeword(0,(int)&Count);
2903         #endif
2904     emit_call((int)memdebug);
2905     //emit_popa();
2906     restore_regs(0x100f);
2907   }/**/
2908 }
2909
2910 #ifndef loadlr_assemble
2911 void loadlr_assemble(int i,struct regstat *i_regs)
2912 {
2913   printf("Need loadlr_assemble for this architecture.\n");
2914   exit(1);
2915 }
2916 #endif
2917
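// Emit code for stores.  The RAM-range/TLB handling mirrors
// load_assemble; in addition, after a store to RAM the generated code
// consults the invalid_code table (page-granular, judging by the
// *_indexedsr12 helpers) and jumps to an INVCODE_STUB so that any
// previously compiled block covering the written page is invalidated
// (self-modifying code handling).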
2918 void store_assemble(int i,struct regstat *i_regs)
2919 {
2920   int s,th,tl,map=-1;
2921   int addr,temp;
2922   int offset;
2923   int jaddr=0,jaddr2,type;
2924   int memtarget,c=0;
2925   int agr=AGEN1+(i&1);
2926   u_int hr,reglist=0;
2927   th=get_reg(i_regs->regmap,rs2[i]|64);
2928   tl=get_reg(i_regs->regmap,rs2[i]);
2929   s=get_reg(i_regs->regmap,rs1[i]);
2930   temp=get_reg(i_regs->regmap,agr);
2931   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2932   offset=imm[i];
2933   if(s>=0) {
2934     c=(i_regs->wasconst>>s)&1;
2935     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2936     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2937   }
2938   assert(tl>=0);
2939   assert(temp>=0);
2940   for(hr=0;hr<HOST_REGS;hr++) {
2941     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2942   }
2943   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2944   if(offset||s<0||c) addr=temp;
2945   else addr=s;
2946   if(!using_tlb) {
2947     if(!c) {
2948       #ifdef R29_HACK
2949       // Strmnnrmn's speed hack
2950       memtarget=1;
2951       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2952       #endif
2953       emit_cmpimm(addr,0x800000);
2954       #ifdef DESTRUCTIVE_SHIFT
2955       if(s==addr) emit_mov(s,temp);
2956       #endif
2957       #ifdef R29_HACK
2958       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2959       #endif
2960       {
2961         jaddr=(int)out;
2962         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2963         // Hint to branch predictor that the branch is unlikely to be taken
2964         if(rs1[i]>=28)
2965           emit_jno_unlikely(0);
2966         else
2967         #endif
2968         emit_jno(0);
2969       }
2970     }
2971   }else{ // using tlb
2972     int x=0;
2973     if (opcode[i]==0x28) x=3; // SB
2974     if (opcode[i]==0x29) x=2; // SH
2975     map=get_reg(i_regs->regmap,TLREG);
2976     assert(map>=0);
2977     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
2978     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
2979   }
2980
2981   if (opcode[i]==0x28) { // SB
2982     if(!c||memtarget) {
2983       int x=0;
2984       if(!c) emit_xorimm(addr,3,temp);
2985       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2986       //gen_tlb_addr_w(temp,map);
2987       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2988       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
2989     }
2990     type=STOREB_STUB;
2991   }
2992   if (opcode[i]==0x29) { // SH
2993     if(!c||memtarget) {
2994       int x=0;
2995       if(!c) emit_xorimm(addr,2,temp);
2996       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2997       //#ifdef
2998       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2999       //#else
3000       if(map>=0) {
3001         gen_tlb_addr_w(temp,map);
3002         emit_writehword_indexed(tl,x,temp);
3003       }else
3004         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3005     }
3006     type=STOREH_STUB;
3007   }
3008   if (opcode[i]==0x2B) { // SW
3009     if(!c||memtarget)
3010       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3011       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3012     type=STOREW_STUB;
3013   }
3014   if (opcode[i]==0x3F) { // SD
3015     if(!c||memtarget) {
3016       if(rs2[i]) {
3017         assert(th>=0);
3018         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3019         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3020         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3021       }else{
3022         // Store zero
3023         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3024         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3025         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3026       }
3027     }
3028     type=STORED_STUB;
3029   }
3030   if(jaddr) {
3031     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3032   } else if(!memtarget) {
3033     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3034   }
3035   if(!using_tlb) {
3036     if(!c||memtarget) {
3037       #ifdef DESTRUCTIVE_SHIFT
3038       // The x86 shift operation is 'destructive'; it overwrites the
3039       // source register, so we need to make a copy first and use that.
3040       addr=temp;
3041       #endif
3042       #if defined(HOST_IMM8)
3043       int ir=get_reg(i_regs->regmap,INVCP);
3044       assert(ir>=0);
3045       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3046       #else
3047       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3048       #endif
3049       jaddr2=(int)out;
3050       emit_jne(0);
3051       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3052     }
3053   }
3054   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3055   //if(opcode[i]==0x2B || opcode[i]==0x28)
3056   //if(opcode[i]==0x2B || opcode[i]==0x29)
3057   //if(opcode[i]==0x2B)
3058   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3059   {
3060     //emit_pusha();
3061     save_regs(0x100f);
3062         emit_readword((int)&last_count,ECX);
3063         #ifdef __i386__
3064         if(get_reg(i_regs->regmap,CCREG)<0)
3065           emit_loadreg(CCREG,HOST_CCREG);
3066         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3067         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3068         emit_writeword(HOST_CCREG,(int)&Count);
3069         #endif
3070         #ifdef __arm__
3071         if(get_reg(i_regs->regmap,CCREG)<0)
3072           emit_loadreg(CCREG,0);
3073         else
3074           emit_mov(HOST_CCREG,0);
3075         emit_add(0,ECX,0);
3076         emit_addimm(0,2*ccadj[i],0);
3077         emit_writeword(0,(int)&Count);
3078         #endif
3079     emit_call((int)memdebug);
3080     //emit_popa();
3081     restore_regs(0x100f);
3082   }/**/
3083 }
3084
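// Emit code for the unaligned stores SWL/SWR/SDL/SDR.  These write only
// part of the register depending on addr&3, so the code tests the low
// two address bits and branches to one of four cases (0..3), with byte
// offsets adjusted for the word-swapped RDRAM layout.  For example, per
// big-endian MIPS semantics, SWL with addr&3==1 stores the three most
// significant bytes of rt at addr..addr+2 (handled by case 1 below).
// SDL/SDR additionally write the other word of the doubleword in the
// testimm(temp,4) blocks at the end.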
3085 void storelr_assemble(int i,struct regstat *i_regs)
3086 {
3087   int s,th,tl;
3088   int temp;
3089   int temp2;
3090   int offset;
3091   int jaddr=0,jaddr2;
3092   int case1,case2,case3;
3093   int done0,done1,done2;
3094   int memtarget,c=0;
3095   u_int hr,reglist=0;
3096   th=get_reg(i_regs->regmap,rs2[i]|64);
3097   tl=get_reg(i_regs->regmap,rs2[i]);
3098   s=get_reg(i_regs->regmap,rs1[i]);
3099   temp=get_reg(i_regs->regmap,-1);
3100   offset=imm[i];
3101   if(s>=0) {
3102     c=(i_regs->isconst>>s)&1;
3103     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3104     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3105   }
3106   assert(tl>=0);
3107   for(hr=0;hr<HOST_REGS;hr++) {
3108     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3109   }
3110   if(tl>=0) {
3111     assert(temp>=0);
3112     if(!using_tlb) {
3113       if(!c) {
3114         emit_cmpimm(s<0||offset?temp:s,0x800000);
3115         if(!offset&&s!=temp) emit_mov(s,temp);
3116         jaddr=(int)out;
3117         emit_jno(0);
3118       }
3119       else
3120       {
3121         if(!memtarget||!rs1[i]) {
3122           jaddr=(int)out;
3123           emit_jmp(0);
3124         }
3125       }
3126       if((u_int)rdram!=0x80000000) 
3127         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3128     }else{ // using tlb
3129       int map=get_reg(i_regs->regmap,TLREG);
3130       assert(map>=0);
3131       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3132       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3133       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3134       if(!jaddr&&!memtarget) {
3135         jaddr=(int)out;
3136         emit_jmp(0);
3137       }
3138       gen_tlb_addr_w(temp,map);
3139     }
3140
3141     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3142       temp2=get_reg(i_regs->regmap,FTEMP);
3143       if(!rs2[i]) temp2=th=tl;
3144     }
3145
3146     emit_testimm(temp,2);
3147     case2=(int)out;
3148     emit_jne(0);
3149     emit_testimm(temp,1);
3150     case1=(int)out;
3151     emit_jne(0);
3152     // 0
3153     if (opcode[i]==0x2A) { // SWL
3154       emit_writeword_indexed(tl,0,temp);
3155     }
3156     if (opcode[i]==0x2E) { // SWR
3157       emit_writebyte_indexed(tl,3,temp);
3158     }
3159     if (opcode[i]==0x2C) { // SDL
3160       emit_writeword_indexed(th,0,temp);
3161       if(rs2[i]) emit_mov(tl,temp2);
3162     }
3163     if (opcode[i]==0x2D) { // SDR
3164       emit_writebyte_indexed(tl,3,temp);
3165       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3166     }
3167     done0=(int)out;
3168     emit_jmp(0);
3169     // 1
3170     set_jump_target(case1,(int)out);
3171     if (opcode[i]==0x2A) { // SWL
3172       // Write 3 msb into three least significant bytes
3173       if(rs2[i]) emit_rorimm(tl,8,tl);
3174       emit_writehword_indexed(tl,-1,temp);
3175       if(rs2[i]) emit_rorimm(tl,16,tl);
3176       emit_writebyte_indexed(tl,1,temp);
3177       if(rs2[i]) emit_rorimm(tl,8,tl);
3178     }
3179     if (opcode[i]==0x2E) { // SWR
3180       // Write two lsb into two most significant bytes
3181       emit_writehword_indexed(tl,1,temp);
3182     }
3183     if (opcode[i]==0x2C) { // SDL
3184       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3185       // Write 3 msb into three least significant bytes
3186       if(rs2[i]) emit_rorimm(th,8,th);
3187       emit_writehword_indexed(th,-1,temp);
3188       if(rs2[i]) emit_rorimm(th,16,th);
3189       emit_writebyte_indexed(th,1,temp);
3190       if(rs2[i]) emit_rorimm(th,8,th);
3191     }
3192     if (opcode[i]==0x2D) { // SDR
3193       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3194       // Write two lsb into two most significant bytes
3195       emit_writehword_indexed(tl,1,temp);
3196     }
3197     done1=(int)out;
3198     emit_jmp(0);
3199     // 2
3200     set_jump_target(case2,(int)out);
3201     emit_testimm(temp,1);
3202     case3=(int)out;
3203     emit_jne(0);
3204     if (opcode[i]==0x2A) { // SWL
3205       // Write two msb into two least significant bytes
3206       if(rs2[i]) emit_rorimm(tl,16,tl);
3207       emit_writehword_indexed(tl,-2,temp);
3208       if(rs2[i]) emit_rorimm(tl,16,tl);
3209     }
3210     if (opcode[i]==0x2E) { // SWR
3211       // Write 3 lsb into three most significant bytes
3212       emit_writebyte_indexed(tl,-1,temp);
3213       if(rs2[i]) emit_rorimm(tl,8,tl);
3214       emit_writehword_indexed(tl,0,temp);
3215       if(rs2[i]) emit_rorimm(tl,24,tl);
3216     }
3217     if (opcode[i]==0x2C) { // SDL
3218       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3219       // Write two msb into two least significant bytes
3220       if(rs2[i]) emit_rorimm(th,16,th);
3221       emit_writehword_indexed(th,-2,temp);
3222       if(rs2[i]) emit_rorimm(th,16,th);
3223     }
3224     if (opcode[i]==0x2D) { // SDR
3225       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3226       // Write 3 lsb into three most significant bytes
3227       emit_writebyte_indexed(tl,-1,temp);
3228       if(rs2[i]) emit_rorimm(tl,8,tl);
3229       emit_writehword_indexed(tl,0,temp);
3230       if(rs2[i]) emit_rorimm(tl,24,tl);
3231     }
3232     done2=(int)out;
3233     emit_jmp(0);
3234     // 3
3235     set_jump_target(case3,(int)out);
3236     if (opcode[i]==0x2A) { // SWL
3237       // Write msb into least significant byte
3238       if(rs2[i]) emit_rorimm(tl,24,tl);
3239       emit_writebyte_indexed(tl,-3,temp);
3240       if(rs2[i]) emit_rorimm(tl,8,tl);
3241     }
3242     if (opcode[i]==0x2E) { // SWR
3243       // Write entire word
3244       emit_writeword_indexed(tl,-3,temp);
3245     }
3246     if (opcode[i]==0x2C) { // SDL
3247       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3248       // Write msb into least significant byte
3249       if(rs2[i]) emit_rorimm(th,24,th);
3250       emit_writebyte_indexed(th,-3,temp);
3251       if(rs2[i]) emit_rorimm(th,8,th);
3252     }
3253     if (opcode[i]==0x2D) { // SDR
3254       if(rs2[i]) emit_mov(th,temp2);
3255       // Write entire word
3256       emit_writeword_indexed(tl,-3,temp);
3257     }
3258     set_jump_target(done0,(int)out);
3259     set_jump_target(done1,(int)out);
3260     set_jump_target(done2,(int)out);
3261     if (opcode[i]==0x2C) { // SDL
3262       emit_testimm(temp,4);
3263       done0=(int)out;
3264       emit_jne(0);
3265       emit_andimm(temp,~3,temp);
3266       emit_writeword_indexed(temp2,4,temp);
3267       set_jump_target(done0,(int)out);
3268     }
3269     if (opcode[i]==0x2D) { // SDR
3270       emit_testimm(temp,4);
3271       done0=(int)out;
3272       emit_jeq(0);
3273       emit_andimm(temp,~3,temp);
3274       emit_writeword_indexed(temp2,-4,temp);
3275       set_jump_target(done0,(int)out);
3276     }
3277     if(!c||!memtarget)
3278       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3279   }
3280   if(!using_tlb) {
3281     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3282     #if defined(HOST_IMM8)
3283     int ir=get_reg(i_regs->regmap,INVCP);
3284     assert(ir>=0);
3285     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3286     #else
3287     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3288     #endif
3289     jaddr2=(int)out;
3290     emit_jne(0);
3291     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3292   }
3293   /*
3294     emit_pusha();
3295     //save_regs(0x100f);
3296         emit_readword((int)&last_count,ECX);
3297         if(get_reg(i_regs->regmap,CCREG)<0)
3298           emit_loadreg(CCREG,HOST_CCREG);
3299         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3300         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3301         emit_writeword(HOST_CCREG,(int)&Count);
3302     emit_call((int)memdebug);
3303     emit_popa();
3304     //restore_regs(0x100f);
3305   /**/
3306 }
3307
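// Emit code for the COP1 loads/stores LWC1/LDC1/SWC1/SDC1.  FPRs are
// accessed indirectly through the reg_cop1_simple/reg_cop1_double
// pointer tables, and a Status.CU1 test (bit 0x20000000) is emitted
// first so the FP_STUB path can handle the coprocessor-unusable case
// when COP1 is disabled.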
3308 void c1ls_assemble(int i,struct regstat *i_regs)
3309 {
3310   int s,th,tl;
3311   int temp,ar;
3312   int map=-1;
3313   int offset;
3314   int c=0;
3315   int jaddr,jaddr2=0,jaddr3,type;
3316   int agr=AGEN1+(i&1);
3317   u_int hr,reglist=0;
3318   th=get_reg(i_regs->regmap,FTEMP|64);
3319   tl=get_reg(i_regs->regmap,FTEMP);
3320   s=get_reg(i_regs->regmap,rs1[i]);
3321   temp=get_reg(i_regs->regmap,agr);
3322   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3323   offset=imm[i];
3324   assert(tl>=0);
3325   assert(rs1[i]>0);
3326   assert(temp>=0);
3327   for(hr=0;hr<HOST_REGS;hr++) {
3328     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3329   }
3330   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3331   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3332   {
3333     // Loads use a temporary register which we need to save
3334     reglist|=1<<temp;
3335   }
3336   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3337     ar=temp;
3338   else // LWC1/LDC1
3339     ar=tl;
3340   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3341   //else c=(i_regs->wasconst>>s)&1;
3342   if(s>=0) c=(i_regs->wasconst>>s)&1;
3343   // Check cop1 unusable
3344   if(!cop1_usable) {
3345     signed char rs=get_reg(i_regs->regmap,CSREG);
3346     assert(rs>=0);
3347     emit_testimm(rs,0x20000000);
3348     jaddr=(int)out;
3349     emit_jeq(0);
3350     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3351     cop1_usable=1;
3352   }
3353   if (opcode[i]==0x39) { // SWC1 (get float address)
3354     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3355   }
3356   if (opcode[i]==0x3D) { // SDC1 (get double address)
3357     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3358   }
3359   // Generate address + offset
3360   if(!using_tlb) {
3361     if(!c)
3362       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3363   }
3364   else
3365   {
3366     map=get_reg(i_regs->regmap,TLREG);
3367     assert(map>=0);
3368     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3369       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3370     }
3371     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3372       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3373     }
3374   }
3375   if (opcode[i]==0x39) { // SWC1 (read float)
3376     emit_readword_indexed(0,tl,tl);
3377   }
3378   if (opcode[i]==0x3D) { // SDC1 (read double)
3379     emit_readword_indexed(4,tl,th);
3380     emit_readword_indexed(0,tl,tl);
3381   }
3382   if (opcode[i]==0x31) { // LWC1 (get target address)
3383     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3384   }
3385   if (opcode[i]==0x35) { // LDC1 (get target address)
3386     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3387   }
3388   if(!using_tlb) {
3389     if(!c) {
3390       jaddr2=(int)out;
3391       emit_jno(0);
3392     }
3393     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3394       jaddr2=(int)out;
3395       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3396     }
3397     #ifdef DESTRUCTIVE_SHIFT
3398     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3399       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3400     }
3401     #endif
3402   }else{
3403     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3404       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3405     }
3406     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3407       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3408     }
3409   }
3410   if (opcode[i]==0x31) { // LWC1
3411     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3412     //gen_tlb_addr_r(ar,map);
3413     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3414     #ifdef HOST_IMM_ADDR32
3415     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3416     else
3417     #endif
3418     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3419     type=LOADW_STUB;
3420   }
3421   if (opcode[i]==0x35) { // LDC1
3422     assert(th>=0);
3423     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3424     //gen_tlb_addr_r(ar,map);
3425     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3426     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3427     #ifdef HOST_IMM_ADDR32
3428     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3429     else
3430     #endif
3431     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3432     type=LOADD_STUB;
3433   }
3434   if (opcode[i]==0x39) { // SWC1
3435     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3436     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3437     type=STOREW_STUB;
3438   }
3439   if (opcode[i]==0x3D) { // SDC1
3440     assert(th>=0);
3441     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3442     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3443     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3444     type=STORED_STUB;
3445   }
3446   if(!using_tlb) {
3447     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3448       #ifndef DESTRUCTIVE_SHIFT
3449       temp=offset||c||s<0?ar:s;
3450       #endif
3451       #if defined(HOST_IMM8)
3452       int ir=get_reg(i_regs->regmap,INVCP);
3453       assert(ir>=0);
3454       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3455       #else
3456       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3457       #endif
3458       jaddr3=(int)out;
3459       emit_jne(0);
3460       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3461     }
3462   }
3463   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3464   if (opcode[i]==0x31) { // LWC1 (write float)
3465     emit_writeword_indexed(tl,0,temp);
3466   }
3467   if (opcode[i]==0x35) { // LDC1 (write double)
3468     emit_writeword_indexed(th,4,temp);
3469     emit_writeword_indexed(tl,0,temp);
3470   }
3471   //if(opcode[i]==0x39)
3472   /*if(opcode[i]==0x39||opcode[i]==0x31)
3473   {
3474     emit_pusha();
3475         emit_readword((int)&last_count,ECX);
3476         if(get_reg(i_regs->regmap,CCREG)<0)
3477           emit_loadreg(CCREG,HOST_CCREG);
3478         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3479         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3480         emit_writeword(HOST_CCREG,(int)&Count);
3481     emit_call((int)memdebug);
3482     emit_popa();
3483   }/**/
3484 }
3485
3486 #ifndef multdiv_assemble
3487 void multdiv_assemble(int i,struct regstat *i_regs)
3488 {
3489   printf("Need multdiv_assemble for this architecture.\n");
3490   exit(1);
3491 }
3492 #endif
3493
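// Assemble MFHI/MFLO/MTHI/MTLO: copy the 64-bit source register into the
// destination, loading either half from memory if it is not mapped.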
3494 void mov_assemble(int i,struct regstat *i_regs)
3495 {
3496   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3497   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3498   assert(rt1[i]>0);
3499   if(rt1[i]) {
3500     signed char sh,sl,th,tl;
3501     th=get_reg(i_regs->regmap,rt1[i]|64);
3502     tl=get_reg(i_regs->regmap,rt1[i]);
3503     //assert(tl>=0);
3504     if(tl>=0) {
3505       sh=get_reg(i_regs->regmap,rs1[i]|64);
3506       sl=get_reg(i_regs->regmap,rs1[i]);
3507       if(sl>=0) emit_mov(sl,tl);
3508       else emit_loadreg(rs1[i],tl);
3509       if(th>=0) {
3510         if(sh>=0) emit_mov(sh,th);
3511         else emit_loadreg(rs1[i]|64,th);
3512       }
3513     }
3514   }
3515 }
3516
3517 #ifndef fconv_assemble
3518 void fconv_assemble(int i,struct regstat *i_regs)
3519 {
3520   printf("Need fconv_assemble for this architecture.\n");
3521   exit(1);
3522 }
3523 #endif
3524
3525 #if 0
3526 void float_assemble(int i,struct regstat *i_regs)
3527 {
3528   printf("Need float_assemble for this architecture.\n");
3529   exit(1);
3530 }
3531 #endif
3532
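// Assemble SYSCALL: put the PC in EAX, add the accumulated cycle count and
// jump to jump_syscall to raise the exception.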
3533 void syscall_assemble(int i,struct regstat *i_regs)
3534 {
3535   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3536   assert(ccreg==HOST_CCREG);
3537   assert(!is_delayslot);
3538   emit_movimm(start+i*4,EAX); // Get PC
3539   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3540   emit_jmp((int)jump_syscall);
3541 }
3542
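// Assemble the instruction in a branch delay slot.  is_delayslot is set so
// that any stubs generated here know they belong to a branch.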
3543 void ds_assemble(int i,struct regstat *i_regs)
3544 {
3545   is_delayslot=1;
3546   switch(itype[i]) {
3547     case ALU:
3548       alu_assemble(i,i_regs);break;
3549     case IMM16:
3550       imm16_assemble(i,i_regs);break;
3551     case SHIFT:
3552       shift_assemble(i,i_regs);break;
3553     case SHIFTIMM:
3554       shiftimm_assemble(i,i_regs);break;
3555     case LOAD:
3556       load_assemble(i,i_regs);break;
3557     case LOADLR:
3558       loadlr_assemble(i,i_regs);break;
3559     case STORE:
3560       store_assemble(i,i_regs);break;
3561     case STORELR:
3562       storelr_assemble(i,i_regs);break;
3563     case COP0:
3564       cop0_assemble(i,i_regs);break;
3565     case COP1:
3566       cop1_assemble(i,i_regs);break;
3567     case C1LS:
3568       c1ls_assemble(i,i_regs);break;
3569     case FCONV:
3570       fconv_assemble(i,i_regs);break;
3571     case FLOAT:
3572       float_assemble(i,i_regs);break;
3573     case FCOMP:
3574       fcomp_assemble(i,i_regs);break;
3575     case MULTDIV:
3576       multdiv_assemble(i,i_regs);break;
3577     case MOV:
3578       mov_assemble(i,i_regs);break;
3579     case SYSCALL:
3580     case SPAN:
3581     case UJUMP:
3582     case RJUMP:
3583     case CJUMP:
3584     case SJUMP:
3585     case FJUMP:
3586       printf("Jump in the delay slot.  This is probably a bug.\n");
3587   }
3588   is_delayslot=0;
3589 }
3590
3591 // Is the branch target a valid internal jump?
3592 int internal_branch(uint64_t i_is32,int addr)
3593 {
3594   if(addr&1) return 0; // Indirect (register) jump
3595   if(addr>=start && addr<start+slen*4-4)
3596   {
3597     int t=(addr-start)>>2;
3598     // Delay slots are not valid branch targets
3599     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3600     // 64 -> 32 bit transition requires a recompile
3601     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3602     {
3603       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3604       else printf("optimizable: yes\n");
3605     }*/
3606     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3607     if(requires_32bit[t]&~i_is32) return 0;
3608     else return 1;
3609   }
3610   return 0;
3611 }
3612
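// Write back dirty registers that lose their mapping between pre and entry
// (unless they are unneeded), and move values that merely change host
// registers without needing a writeback.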
3613 #ifndef wb_invalidate
3614 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3615   uint64_t u,uint64_t uu)
3616 {
3617   int hr;
3618   for(hr=0;hr<HOST_REGS;hr++) {
3619     if(hr!=EXCLUDE_REG) {
3620       if(pre[hr]!=entry[hr]) {
3621         if(pre[hr]>=0) {
3622           if((dirty>>hr)&1) {
3623             if(get_reg(entry,pre[hr])<0) {
3624               if(pre[hr]<64) {
3625                 if(!((u>>pre[hr])&1)) {
3626                   emit_storereg(pre[hr],hr);
3627                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3628                     emit_sarimm(hr,31,hr);
3629                     emit_storereg(pre[hr]|64,hr);
3630                   }
3631                 }
3632               }else{
3633                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3634                   emit_storereg(pre[hr],hr);
3635                 }
3636               }
3637             }
3638           }
3639         }
3640       }
3641     }
3642   }
3643   // Move from one register to another (no writeback)
3644   for(hr=0;hr<HOST_REGS;hr++) {
3645     if(hr!=EXCLUDE_REG) {
3646       if(pre[hr]!=entry[hr]) {
3647         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3648           int nr;
3649           if((nr=get_reg(entry,pre[hr]))>=0) {
3650             emit_mov(hr,nr);
3651           }
3652         }
3653       }
3654     }
3655   }
3656 }
3657 #endif
3658
3659 // Load the specified registers
3660 // This only loads the registers given as arguments because
3661 // we don't want to load things that will be overwritten
3662 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3663 {
3664   int hr;
3665   // Load 32-bit regs
3666   for(hr=0;hr<HOST_REGS;hr++) {
3667     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3668       if(entry[hr]!=regmap[hr]) {
3669         if(regmap[hr]==rs1||regmap[hr]==rs2)
3670         {
3671           if(regmap[hr]==0) {
3672             emit_zeroreg(hr);
3673           }
3674           else
3675           {
3676             emit_loadreg(regmap[hr],hr);
3677           }
3678         }
3679       }
3680     }
3681   }
3682   // Load 64-bit regs

3683   for(hr=0;hr<HOST_REGS;hr++) {
3684     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3685       if(entry[hr]!=regmap[hr]) {
3686         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3687         {
3688           assert(regmap[hr]!=64);
3689           if((is32>>(regmap[hr]&63))&1) {
3690             int lr=get_reg(regmap,regmap[hr]-64);
3691             if(lr>=0)
3692               emit_sarimm(lr,31,hr);
3693             else
3694               emit_loadreg(regmap[hr],hr);
3695           }
3696           else
3697           {
3698             emit_loadreg(regmap[hr],hr);
3699           }
3700         }
3701       }
3702     }
3703   }
3704 }
3705
3706 // Load registers prior to the start of a loop
3707 // so that they are not loaded within the loop
3708 static void loop_preload(signed char pre[],signed char entry[])
3709 {
3710   int hr;
3711   for(hr=0;hr<HOST_REGS;hr++) {
3712     if(hr!=EXCLUDE_REG) {
3713       if(pre[hr]!=entry[hr]) {
3714         if(entry[hr]>=0) {
3715           if(get_reg(pre,entry[hr])<0) {
3716             assem_debug("loop preload:\n");
3717             //printf("loop preload: %d\n",hr);
3718             if(entry[hr]==0) {
3719               emit_zeroreg(hr);
3720             }
3721             else if(entry[hr]<TEMPREG)
3722             {
3723               emit_loadreg(entry[hr],hr);
3724             }
3725             else if(entry[hr]-64<TEMPREG)
3726             {
3727               emit_loadreg(entry[hr],hr);
3728             }
3729           }
3730         }
3731       }
3732     }
3733   }
3734 }
3735
3736 // Generate address for load/store instruction
3737 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3738 {
3739   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3740     int ra;
3741     int agr=AGEN1+(i&1);
3742     int mgr=MGEN1+(i&1);
3743     if(itype[i]==LOAD) {
3744       ra=get_reg(i_regs->regmap,rt1[i]);
3745       //if(rt1[i]) assert(ra>=0);
3746     }
3747     if(itype[i]==LOADLR) {
3748       ra=get_reg(i_regs->regmap,FTEMP);
3749     }
3750     if(itype[i]==STORE||itype[i]==STORELR) {
3751       ra=get_reg(i_regs->regmap,agr);
3752       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3753     }
3754     if(itype[i]==C1LS) {
3755       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3756         ra=get_reg(i_regs->regmap,FTEMP);
3757       else { // SWC1/SDC1
3758         ra=get_reg(i_regs->regmap,agr);
3759         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3760       }
3761     }
3762     int rs=get_reg(i_regs->regmap,rs1[i]);
3763     int rm=get_reg(i_regs->regmap,TLREG);
3764     if(ra>=0) {
3765       int offset=imm[i];
3766       int c=(i_regs->wasconst>>rs)&1;
3767       if(rs1[i]==0) {
3768         // Using r0 as a base address
3769         /*if(rm>=0) {
3770           if(!entry||entry[rm]!=mgr) {
3771             generate_map_const(offset,rm);
3772           } // else did it in the previous cycle
3773         }*/
3774         if(!entry||entry[ra]!=agr) {
3775           if (opcode[i]==0x22||opcode[i]==0x26) {
3776             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3777           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3778             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3779           }else{
3780             emit_movimm(offset,ra);
3781           }
3782         } // else did it in the previous cycle
3783       }
3784       else if(rs<0) {
3785         if(!entry||entry[ra]!=rs1[i])
3786           emit_loadreg(rs1[i],ra);
3787         //if(!entry||entry[ra]!=rs1[i])
3788         //  printf("poor load scheduling!\n");
3789       }
3790       else if(c) {
3791         if(rm>=0) {
3792           if(!entry||entry[rm]!=mgr) {
3793             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3794               // Stores to memory go through the mapper to detect self-modifying
3795               // code, loads don't.
3796               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3797                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3798                 generate_map_const(constmap[i][rs]+offset,rm);
3799             }else{
3800               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3801                 generate_map_const(constmap[i][rs]+offset,rm);
3802             }
3803           }
3804         }
3805         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3806           if(!entry||entry[ra]!=agr) {
3807             if (opcode[i]==0x22||opcode[i]==0x26) {
3808               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3809             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3810               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3811             }else{
3812               #ifdef HOST_IMM_ADDR32
3813               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3814                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3815               #endif
3816               emit_movimm(constmap[i][rs]+offset,ra);
3817             }
3818           } // else did it in the previous cycle
3819         } // else load_consts already did it
3820       }
3821       if(offset&&!c&&rs1[i]) {
3822         if(rs>=0) {
3823           emit_addimm(rs,offset,ra);
3824         }else{
3825           emit_addimm(ra,offset,ra);
3826         }
3827       }
3828     }
3829   }
3830   // Preload constants for next instruction
3831   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3832     int agr,ra;
3833     #ifndef HOST_IMM_ADDR32
3834     // Mapper entry
3835     agr=MGEN1+((i+1)&1);
3836     ra=get_reg(i_regs->regmap,agr);
3837     if(ra>=0) {
3838       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3839       int offset=imm[i+1];
3840       int c=(regs[i+1].wasconst>>rs)&1;
3841       if(c) {
3842         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3843           // Stores to memory go through the mapper to detect self-modifying
3844           // code, loads don't.
3845           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3846              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3847             generate_map_const(constmap[i+1][rs]+offset,ra);
3848         }else{
3849           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3850             generate_map_const(constmap[i+1][rs]+offset,ra);
3851         }
3852       }
3853       /*else if(rs1[i]==0) {
3854         generate_map_const(offset,ra);
3855       }*/
3856     }
3857     #endif
3858     // Actual address
3859     agr=AGEN1+((i+1)&1);
3860     ra=get_reg(i_regs->regmap,agr);
3861     if(ra>=0) {
3862       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3863       int offset=imm[i+1];
3864       int c=(regs[i+1].wasconst>>rs)&1;
3865       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3866         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3867           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3868         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3869           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3870         }else{
3871           #ifdef HOST_IMM_ADDR32
3872           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3873              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3874           #endif
3875           emit_movimm(constmap[i+1][rs]+offset,ra);
3876         }
3877       }
3878       else if(rs1[i+1]==0) {
3879         // Using r0 as a base address
3880         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3881           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3882         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3883           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3884         }else{
3885           emit_movimm(offset,ra);
3886         }
3887       }
3888     }
3889   }
3890 }
3891
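// Follow the chain of instructions where host register hr keeps the same
// constant and determine the final value to load.  Returns 0 if the load
// can be skipped (the register becomes unneeded, or HOST_IMM_ADDR32 can
// encode the address directly).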
3892 int get_final_value(int hr, int i, int *value)
3893 {
3894   int reg=regs[i].regmap[hr];
3895   while(i<slen-1) {
3896     if(regs[i+1].regmap[hr]!=reg) break;
3897     if(!((regs[i+1].isconst>>hr)&1)) break;
3898     if(bt[i+1]) break;
3899     i++;
3900   }
3901   if(i<slen-1) {
3902     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3903       *value=constmap[i][hr];
3904       return 1;
3905     }
3906     if(!bt[i+1]) {
3907       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3908         // Load in delay slot, out-of-order execution
3909         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3910         {
3911           #ifdef HOST_IMM_ADDR32
3912           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3913           #endif
3914           // Precompute load address
3915           *value=constmap[i][hr]+imm[i+2];
3916           return 1;
3917         }
3918       }
3919       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3920       {
3921         #ifdef HOST_IMM_ADDR32
3922         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3923         #endif
3924         // Precompute load address
3925         *value=constmap[i][hr]+imm[i+1];
3926         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3927         return 1;
3928       }
3929     }
3930   }
3931   *value=constmap[i][hr];
3932   //printf("c=%x\n",(int)constmap[i][hr]);
3933   if(i==slen-1) return 1;
3934   if(reg<64) {
3935     return !((unneeded_reg[i+1]>>reg)&1);
3936   }else{
3937     return !((unneeded_reg_upper[i+1]>>(reg&63))&1); // mask off the |64 flag before shifting
3938   }
3939 }
3940
3941 // Load registers with known constants
3942 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3943 {
3944   int hr;
3945   // Load 32-bit regs
3946   for(hr=0;hr<HOST_REGS;hr++) {
3947     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3948       //if(entry[hr]!=regmap[hr]) {
3949       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3950         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3951           int value;
3952           if(get_final_value(hr,i,&value)) {
3953             if(value==0) {
3954               emit_zeroreg(hr);
3955             }
3956             else {
3957               emit_movimm(value,hr);
3958             }
3959           }
3960         }
3961       }
3962     }
3963   }
3964   // Load 64-bit regs
3965   for(hr=0;hr<HOST_REGS;hr++) {
3966     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3967       //if(entry[hr]!=regmap[hr]) {
3968       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3969         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3970           if((is32>>(regmap[hr]&63))&1) {
3971             int lr=get_reg(regmap,regmap[hr]-64);
3972             assert(lr>=0);
3973             emit_sarimm(lr,31,hr);
3974           }
3975           else
3976           {
3977             int value;
3978             if(get_final_value(hr,i,&value)) {
3979               if(value==0) {
3980                 emit_zeroreg(hr);
3981               }
3982               else {
3983                 emit_movimm(value,hr);
3984               }
3985             }
3986           }
3987         }
3988       }
3989     }
3990   }
3991 }
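// Load the known constants into every dirty register that holds one.
// Unlike load_consts, this does not look ahead for a final value.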
3992 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3993 {
3994   int hr;
3995   // Load 32-bit regs
3996   for(hr=0;hr<HOST_REGS;hr++) {
3997     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3998       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3999         int value=constmap[i][hr];
4000         if(value==0) {
4001           emit_zeroreg(hr);
4002         }
4003         else {
4004           emit_movimm(value,hr);
4005         }
4006       }
4007     }
4008   }
4009   // Load 64-bit regs
4010   for(hr=0;hr<HOST_REGS;hr++) {
4011     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4012       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4013         if((is32>>(regmap[hr]&63))&1) {
4014           int lr=get_reg(regmap,regmap[hr]-64);
4015           assert(lr>=0);
4016           emit_sarimm(lr,31,hr);
4017         }
4018         else
4019         {
4020           int value=constmap[i][hr];
4021           if(value==0) {
4022             emit_zeroreg(hr);
4023           }
4024           else {
4025             emit_movimm(value,hr);
4026           }
4027         }
4028       }
4029     }
4030   }
4031 }
4032
4033 // Write out all dirty registers (except cycle count)
4034 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4035 {
4036   int hr;
4037   for(hr=0;hr<HOST_REGS;hr++) {
4038     if(hr!=EXCLUDE_REG) {
4039       if(i_regmap[hr]>0) {
4040         if(i_regmap[hr]!=CCREG) {
4041           if((i_dirty>>hr)&1) {
4042             if(i_regmap[hr]<64) {
4043               emit_storereg(i_regmap[hr],hr);
4044               if( ((i_is32>>i_regmap[hr])&1) ) {
4045                 #ifdef DESTRUCTIVE_WRITEBACK
4046                 emit_sarimm(hr,31,hr);
4047                 emit_storereg(i_regmap[hr]|64,hr);
4048                 #else
4049                 emit_sarimm(hr,31,HOST_TEMPREG);
4050                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4051                 #endif
4052               }
4053             }else{
4054               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4055                 emit_storereg(i_regmap[hr],hr);
4056               }
4057             }
4058           }
4059         }
4060       }
4061     }
4062   }
4063 }
4064 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4065 // This writes the registers not written by store_regs_bt
4066 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4067 {
4068   int hr;
4069   int t=(addr-start)>>2;
4070   for(hr=0;hr<HOST_REGS;hr++) {
4071     if(hr!=EXCLUDE_REG) {
4072       if(i_regmap[hr]>0) {
4073         if(i_regmap[hr]!=CCREG) {
4074           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4075             if((i_dirty>>hr)&1) {
4076               if(i_regmap[hr]<64) {
4077                 emit_storereg(i_regmap[hr],hr);
4078                 if( ((i_is32>>i_regmap[hr])&1) ) {
4079                   #ifdef DESTRUCTIVE_WRITEBACK
4080                   emit_sarimm(hr,31,hr);
4081                   emit_storereg(i_regmap[hr]|64,hr);
4082                   #else
4083                   emit_sarimm(hr,31,HOST_TEMPREG);
4084                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4085                   #endif
4086                 }
4087               }else{
4088                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4089                   emit_storereg(i_regmap[hr],hr);
4090                 }
4091               }
4092             }
4093           }
4094         }
4095       }
4096     }
4097   }
4098 }
4099
4100 // Load all registers (except cycle count)
4101 void load_all_regs(signed char i_regmap[])
4102 {
4103   int hr;
4104   for(hr=0;hr<HOST_REGS;hr++) {
4105     if(hr!=EXCLUDE_REG) {
4106       if(i_regmap[hr]==0) {
4107         emit_zeroreg(hr);
4108       }
4109       else
4110       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4111       {
4112         emit_loadreg(i_regmap[hr],hr);
4113       }
4114     }
4115   }
4116 }
4117
4118 // Load all current registers also needed by next instruction
4119 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4120 {
4121   int hr;
4122   for(hr=0;hr<HOST_REGS;hr++) {
4123     if(hr!=EXCLUDE_REG) {
4124       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4125         if(i_regmap[hr]==0) {
4126           emit_zeroreg(hr);
4127         }
4128         else
4129         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4130         {
4131           emit_loadreg(i_regmap[hr],hr);
4132         }
4133       }
4134     }
4135   }
4136 }
4137
4138 // Load all regs, storing cycle count if necessary
4139 void load_regs_entry(int t)
4140 {
4141   int hr;
4142   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4143   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4144   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4145     emit_storereg(CCREG,HOST_CCREG);
4146   }
4147   // Load 32-bit regs
4148   for(hr=0;hr<HOST_REGS;hr++) {
4149     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4150       if(regs[t].regmap_entry[hr]==0) {
4151         emit_zeroreg(hr);
4152       }
4153       else if(regs[t].regmap_entry[hr]!=CCREG)
4154       {
4155         emit_loadreg(regs[t].regmap_entry[hr],hr);
4156       }
4157     }
4158   }
4159   // Load 64-bit regs
4160   for(hr=0;hr<HOST_REGS;hr++) {
4161     if(regs[t].regmap_entry[hr]>=64) {
4162       assert(regs[t].regmap_entry[hr]!=64);
4163       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4164         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4165         if(lr<0) {
4166           emit_loadreg(regs[t].regmap_entry[hr],hr);
4167         }
4168         else
4169         {
4170           emit_sarimm(lr,31,hr);
4171         }
4172       }
4173       else
4174       {
4175         emit_loadreg(regs[t].regmap_entry[hr],hr);
4176       }
4177     }
4178   }
4179 }
4180
4181 // Store dirty registers prior to branch
4182 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4183 {
4184   if(internal_branch(i_is32,addr))
4185   {
4186     int t=(addr-start)>>2;
4187     int hr;
4188     for(hr=0;hr<HOST_REGS;hr++) {
4189       if(hr!=EXCLUDE_REG) {
4190         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4191           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4192             if((i_dirty>>hr)&1) {
4193               if(i_regmap[hr]<64) {
4194                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4195                   emit_storereg(i_regmap[hr],hr);
4196                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4197                     #ifdef DESTRUCTIVE_WRITEBACK
4198                     emit_sarimm(hr,31,hr);
4199                     emit_storereg(i_regmap[hr]|64,hr);
4200                     #else
4201                     emit_sarimm(hr,31,HOST_TEMPREG);
4202                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4203                     #endif
4204                   }
4205                 }
4206               }else{
4207                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4208                   emit_storereg(i_regmap[hr],hr);
4209                 }
4210               }
4211             }
4212           }
4213         }
4214       }
4215     }
4216   }
4217   else
4218   {
4219     // Branch out of this block, write out all dirty regs
4220     wb_dirtys(i_regmap,i_is32,i_dirty);
4221   }
4222 }
4223
4224 // Load all needed registers for branch target
4225 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4226 {
4227   //if(addr>=start && addr<(start+slen*4))
4228   if(internal_branch(i_is32,addr))
4229   {
4230     int t=(addr-start)>>2;
4231     int hr;
4232     // Store the cycle count before loading something else
4233     if(i_regmap[HOST_CCREG]!=CCREG) {
4234       assert(i_regmap[HOST_CCREG]==-1);
4235     }
4236     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4237       emit_storereg(CCREG,HOST_CCREG);
4238     }
4239     // Load 32-bit regs
4240     for(hr=0;hr<HOST_REGS;hr++) {
4241       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4242         #ifdef DESTRUCTIVE_WRITEBACK
4243         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4244         #else
4245         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4246         #endif
4247           if(regs[t].regmap_entry[hr]==0) {
4248             emit_zeroreg(hr);
4249           }
4250           else if(regs[t].regmap_entry[hr]!=CCREG)
4251           {
4252             emit_loadreg(regs[t].regmap_entry[hr],hr);
4253           }
4254         }
4255       }
4256     }
4257     // Load 64-bit regs
4258     for(hr=0;hr<HOST_REGS;hr++) {
4259       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4260         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4261           assert(regs[t].regmap_entry[hr]!=64);
4262           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4263             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4264             if(lr<0) {
4265               emit_loadreg(regs[t].regmap_entry[hr],hr);
4266             }
4267             else
4268             {
4269               emit_sarimm(lr,31,hr);
4270             }
4271           }
4272           else
4273           {
4274             emit_loadreg(regs[t].regmap_entry[hr],hr);
4275           }
4276         }
4277         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4278           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4279           assert(lr>=0);
4280           emit_sarimm(lr,31,hr);
4281         }
4282       }
4283     }
4284   }
4285 }
4286
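// Return 1 if the current register mapping is compatible with the expected
// mapping at the branch target, so the branch can be linked directly
// without any writeback or reload.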
4287 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4288 {
4289   if(addr>=start && addr<start+slen*4-4)
4290   {
4291     int t=(addr-start)>>2;
4292     int hr;
4293     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4294     for(hr=0;hr<HOST_REGS;hr++)
4295     {
4296       if(hr!=EXCLUDE_REG)
4297       {
4298         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4299         {
4300           if(regs[t].regmap_entry[hr]!=-1)
4301           {
4302             return 0;
4303           }
4304           else 
4305           if((i_dirty>>hr)&1)
4306           {
4307             if(i_regmap[hr]<64)
4308             {
4309               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4310                 return 0;
4311             }
4312             else
4313             {
4314               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4315                 return 0;
4316             }
4317           }
4318         }
4319         else // Same register but is it 32-bit or dirty?
4320         if(i_regmap[hr]>=0)
4321         {
4322           if(!((regs[t].dirty>>hr)&1))
4323           {
4324             if((i_dirty>>hr)&1)
4325             {
4326               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4327               {
4328                 //printf("%x: dirty no match\n",addr);
4329                 return 0;
4330               }
4331             }
4332           }
4333           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4334           {
4335             //printf("%x: is32 no match\n",addr);
4336             return 0;
4337           }
4338         }
4339       }
4340     }
4341     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4342     if(requires_32bit[t]&~i_is32) return 0;
4343     // Delay slots are not valid branch targets
4344     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4345     // Delay slots require additional processing, so do not match
4346     if(is_ds[t]) return 0;
4347   }
4348   else
4349   {
4350     int hr;
4351     for(hr=0;hr<HOST_REGS;hr++)
4352     {
4353       if(hr!=EXCLUDE_REG)
4354       {
4355         if(i_regmap[hr]>=0)
4356         {
4357           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4358           {
4359             if((i_dirty>>hr)&1)
4360             {
4361               return 0;
4362             }
4363           }
4364         }
4365       }
4366     }
4367   }
4368   return 1;
4369 }
4370
4371 // Used when a branch jumps into the delay slot of another branch
4372 void ds_assemble_entry(int i)
4373 {
4374   int t=(ba[i]-start)>>2;
4375   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4376   assem_debug("Assemble delay slot at %x\n",ba[i]);
4377   assem_debug("<->\n");
4378   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4379     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4380   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4381   address_generation(t,&regs[t],regs[t].regmap_entry);
4382   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4383     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4384   cop1_usable=0;
4385   is_delayslot=0;
4386   switch(itype[t]) {
4387     case ALU:
4388       alu_assemble(t,&regs[t]);break;
4389     case IMM16:
4390       imm16_assemble(t,&regs[t]);break;
4391     case SHIFT:
4392       shift_assemble(t,&regs[t]);break;
4393     case SHIFTIMM:
4394       shiftimm_assemble(t,&regs[t]);break;
4395     case LOAD:
4396       load_assemble(t,&regs[t]);break;
4397     case LOADLR:
4398       loadlr_assemble(t,&regs[t]);break;
4399     case STORE:
4400       store_assemble(t,&regs[t]);break;
4401     case STORELR:
4402       storelr_assemble(t,&regs[t]);break;
4403     case COP0:
4404       cop0_assemble(t,&regs[t]);break;
4405     case COP1:
4406       cop1_assemble(t,&regs[t]);break;
4407     case C1LS:
4408       c1ls_assemble(t,&regs[t]);break;
4409     case FCONV:
4410       fconv_assemble(t,&regs[t]);break;
4411     case FLOAT:
4412       float_assemble(t,&regs[t]);break;
4413     case FCOMP:
4414       fcomp_assemble(t,&regs[t]);break;
4415     case MULTDIV:
4416       multdiv_assemble(t,&regs[t]);break;
4417     case MOV:
4418       mov_assemble(t,&regs[t]);break;
4419     case SYSCALL:
4420     case SPAN:
4421     case UJUMP:
4422     case RJUMP:
4423     case CJUMP:
4424     case SJUMP:
4425     case FJUMP:
4426       printf("Jump in the delay slot.  This is probably a bug.\n");
4427   }
4428   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4429   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4430   if(internal_branch(regs[t].is32,ba[i]+4))
4431     assem_debug("branch: internal\n");
4432   else
4433     assem_debug("branch: external\n");
4434   assert(internal_branch(regs[t].is32,ba[i]+4));
4435   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4436   emit_jmp(0);
4437 }
4438
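// Emit the cycle counter check for a branch: add the cycles consumed so far
// and, if the counter has expired (reached zero or above), jump to a CC_STUB
// which calls cc_interrupt.  Idle loops (a branch to itself with a nop in
// the delay slot) are detected and handled specially.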
4439 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4440 {
4441   int count;
4442   int jaddr;
4443   int idle=0;
4444   if(itype[i]==RJUMP)
4445   {
4446     *adj=0;
4447   }
4448   //if(ba[i]>=start && ba[i]<(start+slen*4))
4449   if(internal_branch(branch_regs[i].is32,ba[i]))
4450   {
4451     int t=(ba[i]-start)>>2;
4452     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4453     else *adj=ccadj[t];
4454   }
4455   else
4456   {
4457     *adj=0;
4458   }
4459   count=ccadj[i];
4460   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4461     // Idle loop
4462     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4463     idle=(int)out;
4464     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4465     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4466     jaddr=(int)out;
4467     emit_jmp(0);
4468   }
4469   else if(*adj==0||invert) {
4470     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4471     jaddr=(int)out;
4472     emit_jns(0);
4473   }
4474   else
4475   {
4476     emit_cmpimm(HOST_CCREG,-2*(count+2));
4477     jaddr=(int)out;
4478     emit_jns(0);
4479   }
4480   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4481 }
4482
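// Out-of-line handler for a CC_STUB: write back dirty registers, store the
// return PC in pcaddr (recomputing it from the branch condition if needed),
// call cc_interrupt, then reload registers and return to the compiled code.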
4483 void do_ccstub(int n)
4484 {
4485   literal_pool(256);
4486   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4487   set_jump_target(stubs[n][1],(int)out);
4488   int i=stubs[n][4];
4489   if(stubs[n][6]==NULLDS) {
4490     // Delay slot instruction is nullified ("likely" branch)
4491     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4492   }
4493   else if(stubs[n][6]!=TAKEN) {
4494     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4495   }
4496   else {
4497     if(internal_branch(branch_regs[i].is32,ba[i]))
4498       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4499   }
4500   if(stubs[n][5]!=-1)
4501   {
4502     // Save PC as return address
4503     emit_movimm(stubs[n][5],EAX);
4504     emit_writeword(EAX,(int)&pcaddr);
4505   }
4506   else
4507   {
4508     // Return address depends on which way the branch goes
4509     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4510     {
4511       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4512       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4513       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4514       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4515       if(rs1[i]==0)
4516       {
4517         s1l=s2l;s1h=s2h;
4518         s2l=s2h=-1;
4519       }
4520       else if(rs2[i]==0)
4521       {
4522         s2l=s2h=-1;
4523       }
4524       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4525         s1h=s2h=-1;
4526       }
4527       assert(s1l>=0);
4528       #ifdef DESTRUCTIVE_WRITEBACK
4529       if(rs1[i]) {
4530         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4531           emit_loadreg(rs1[i],s1l);
4532       } 
4533       else {
4534         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4535           emit_loadreg(rs2[i],s1l);
4536       }
4537       if(s2l>=0)
4538         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4539           emit_loadreg(rs2[i],s2l);
4540       #endif
4541       int hr=0;
4542       int addr,alt,ntaddr;
4543       while(hr<HOST_REGS)
4544       {
4545         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4546            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4547            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4548         {
4549           addr=hr++;break;
4550         }
4551         hr++;
4552       }
4553       while(hr<HOST_REGS)
4554       {
4555         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4556            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4557            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4558         {
4559           alt=hr++;break;
4560         }
4561         hr++;
4562       }
4563       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4564       {
4565         while(hr<HOST_REGS)
4566         {
4567           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4568              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4569              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4570           {
4571             ntaddr=hr;break;
4572           }
4573           hr++;
4574         }
4575         assert(hr<HOST_REGS);
4576       }
4577       if((opcode[i]&0x2f)==4) // BEQ
4578       {
4579         #ifdef HAVE_CMOV_IMM
4580         if(s1h<0) {
4581           if(s2l>=0) emit_cmp(s1l,s2l);
4582           else emit_test(s1l,s1l);
4583           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4584         }
4585         else
4586         #endif
4587         {
4588           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4589           if(s1h>=0) {
4590             if(s2h>=0) emit_cmp(s1h,s2h);
4591             else emit_test(s1h,s1h);
4592             emit_cmovne_reg(alt,addr);
4593           }
4594           if(s2l>=0) emit_cmp(s1l,s2l);
4595           else emit_test(s1l,s1l);
4596           emit_cmovne_reg(alt,addr);
4597         }
4598       }
4599       if((opcode[i]&0x2f)==5) // BNE
4600       {
4601         #ifdef HAVE_CMOV_IMM
4602         if(s1h<0) {
4603           if(s2l>=0) emit_cmp(s1l,s2l);
4604           else emit_test(s1l,s1l);
4605           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4606         }
4607         else
4608         #endif
4609         {
4610           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4611           if(s1h>=0) {
4612             if(s2h>=0) emit_cmp(s1h,s2h);
4613             else emit_test(s1h,s1h);
4614             emit_cmovne_reg(alt,addr);
4615           }
4616           if(s2l>=0) emit_cmp(s1l,s2l);
4617           else emit_test(s1l,s1l);
4618           emit_cmovne_reg(alt,addr);
4619         }
4620       }
4621       if((opcode[i]&0x2f)==6) // BLEZ
4622       {
4623         //emit_movimm(ba[i],alt);
4624         //emit_movimm(start+i*4+8,addr);
4625         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4626         emit_cmpimm(s1l,1);
4627         if(s1h>=0) emit_mov(addr,ntaddr);
4628         emit_cmovl_reg(alt,addr);
4629         if(s1h>=0) {
4630           emit_test(s1h,s1h);
4631           emit_cmovne_reg(ntaddr,addr);
4632           emit_cmovs_reg(alt,addr);
4633         }
4634       }
4635       if((opcode[i]&0x2f)==7) // BGTZ
4636       {
4637         //emit_movimm(ba[i],addr);
4638         //emit_movimm(start+i*4+8,ntaddr);
4639         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4640         emit_cmpimm(s1l,1);
4641         if(s1h>=0) emit_mov(addr,alt);
4642         emit_cmovl_reg(ntaddr,addr);
4643         if(s1h>=0) {
4644           emit_test(s1h,s1h);
4645           emit_cmovne_reg(alt,addr);
4646           emit_cmovs_reg(ntaddr,addr);
4647         }
4648       }
4649       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4650       {
4651         //emit_movimm(ba[i],alt);
4652         //emit_movimm(start+i*4+8,addr);
4653         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4654         if(s1h>=0) emit_test(s1h,s1h);
4655         else emit_test(s1l,s1l);
4656         emit_cmovs_reg(alt,addr);
4657       }
4658       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4659       {
4660         //emit_movimm(ba[i],addr);
4661         //emit_movimm(start+i*4+8,alt);
4662         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4663         if(s1h>=0) emit_test(s1h,s1h);
4664         else emit_test(s1l,s1l);
4665         emit_cmovs_reg(alt,addr);
4666       }
4667       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4668         if(source[i]&0x10000) // BC1T
4669         {
4670           //emit_movimm(ba[i],alt);
4671           //emit_movimm(start+i*4+8,addr);
4672           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4673           emit_testimm(s1l,0x800000);
4674           emit_cmovne_reg(alt,addr);
4675         }
4676         else // BC1F
4677         {
4678           //emit_movimm(ba[i],addr);
4679           //emit_movimm(start+i*4+8,alt);
4680           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4681           emit_testimm(s1l,0x800000);
4682           emit_cmovne_reg(alt,addr);
4683         }
4684       }
4685       emit_writeword(addr,(int)&pcaddr);
4686     }
4687     else
4688     if(itype[i]==RJUMP)
4689     {
4690       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4691       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4692         r=get_reg(branch_regs[i].regmap,RTEMP);
4693       }
4694       emit_writeword(r,(int)&pcaddr);
4695     }
4696     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4697   }
4698   // Update cycle count
4699   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4700   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4701   emit_call((int)cc_interrupt);
4702   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4703   if(stubs[n][6]==TAKEN) {
4704     if(internal_branch(branch_regs[i].is32,ba[i]))
4705       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4706     else if(itype[i]==RJUMP) {
4707       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4708         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4709       else
4710         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4711     }
4712   }else if(stubs[n][6]==NOTTAKEN) {
4713     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4714     else load_all_regs(branch_regs[i].regmap);
4715   }else if(stubs[n][6]==NULLDS) {
4716     // Delay slot instruction is nullified ("likely" branch)
4717     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4718     else load_all_regs(regs[i].regmap);
4719   }else{
4720     load_all_regs(branch_regs[i].regmap);
4721   }
4722   emit_jmp(stubs[n][2]); // return address
4723   
4724   /* This works but uses a lot of memory...
4725   emit_readword((int)&last_count,ECX);
4726   emit_add(HOST_CCREG,ECX,EAX);
4727   emit_writeword(EAX,(int)&Count);
4728   emit_call((int)gen_interupt);
4729   emit_readword((int)&Count,HOST_CCREG);
4730   emit_readword((int)&next_interupt,EAX);
4731   emit_readword((int)&pending_exception,EBX);
4732   emit_writeword(EAX,(int)&last_count);
4733   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4734   emit_test(EBX,EBX);
4735   int jne_instr=(int)out;
4736   emit_jne(0);
4737   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4738   load_all_regs(branch_regs[i].regmap);
4739   emit_jmp(stubs[n][2]); // return address
4740   set_jump_target(jne_instr,(int)out);
4741   emit_readword((int)&pcaddr,EAX);
4742   // Call get_addr_ht instead of doing the hash table here.
4743   // This code is executed infrequently and takes up a lot of space
4744   // so smaller is better.
4745   emit_storereg(CCREG,HOST_CCREG);
4746   emit_pushreg(EAX);
4747   emit_call((int)get_addr_ht);
4748   emit_loadreg(CCREG,HOST_CCREG);
4749   emit_addimm(ESP,4,ESP);
4750   emit_jmpreg(EAX);*/
4751 }
4752
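// Record a jump so it can be patched to its target once the target block
// address is known.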
4753 void add_to_linker(int addr,int target,int ext)
4754 {
4755   link_addr[linkcount][0]=addr;
4756   link_addr[linkcount][1]=target;
4757   link_addr[linkcount][2]=ext;  
4758   linkcount++;
4759 }
4760
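// Assemble an unconditional jump (J/JAL): assemble the delay slot first,
// set up the link register for JAL, update the cycle count, then emit the
// jump (or assemble the target's delay slot for a branch into a delay slot).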
4761 void ujump_assemble(int i,struct regstat *i_regs)
4762 {
4763   signed char *i_regmap=i_regs->regmap;
4764   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4765   address_generation(i+1,i_regs,regs[i].regmap_entry);
4766   #ifdef REG_PREFETCH
4767   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4768   if(rt1[i]==31&&temp>=0) 
4769   {
4770     int return_address=start+i*4+8;
4771     if(get_reg(branch_regs[i].regmap,31)>0) 
4772     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4773   }
4774   #endif
4775   ds_assemble(i+1,i_regs);
4776   uint64_t bc_unneeded=branch_regs[i].u;
4777   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4778   bc_unneeded|=1|(1LL<<rt1[i]);
4779   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4780   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4781                 bc_unneeded,bc_unneeded_upper);
4782   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4783   if(rt1[i]==31) {
4784     int rt;
4785     unsigned int return_address;
4786     assert(rt1[i+1]!=31);
4787     assert(rt2[i+1]!=31);
4788     rt=get_reg(branch_regs[i].regmap,31);
4789     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4790     //assert(rt>=0);
4791     return_address=start+i*4+8;
4792     if(rt>=0) {
4793       #ifdef USE_MINI_HT
4794       if(internal_branch(branch_regs[i].is32,return_address)) {
4795         int temp=rt+1;
4796         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4797            branch_regs[i].regmap[temp]>=0)
4798         {
4799           temp=get_reg(branch_regs[i].regmap,-1);
4800         }
4801         #ifdef HOST_TEMPREG
4802         if(temp<0) temp=HOST_TEMPREG;
4803         #endif
4804         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4805         else emit_movimm(return_address,rt);
4806       }
4807       else
4808       #endif
4809       {
4810         #ifdef REG_PREFETCH
4811         if(temp>=0) 
4812         {
4813           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4814         }
4815         #endif
4816         emit_movimm(return_address,rt); // PC into link register
4817         #ifdef IMM_PREFETCH
4818         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4819         #endif
4820       }
4821     }
4822   }
4823   int cc,adj;
4824   cc=get_reg(branch_regs[i].regmap,CCREG);
4825   assert(cc==HOST_CCREG);
4826   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4827   #ifdef REG_PREFETCH
4828   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4829   #endif
4830   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4831   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4832   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4833   if(internal_branch(branch_regs[i].is32,ba[i]))
4834     assem_debug("branch: internal\n");
4835   else
4836     assem_debug("branch: external\n");
4837   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4838     ds_assemble_entry(i);
4839   }
4840   else {
4841     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4842     emit_jmp(0);
4843   }
4844 }
4845
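// Assemble a register jump (JR/JALR): the target comes from a register, so
// after the delay slot all dirty registers are written back and control
// passes through jump_vaddr (or the mini hash table) to find the target.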
4846 void rjump_assemble(int i,struct regstat *i_regs)
4847 {
4848   signed char *i_regmap=i_regs->regmap;
4849   int temp;
4850   int rs,cc,adj;
4851   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4852   assert(rs>=0);
4853   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4854     // Delay slot abuse, make a copy of the branch address register
4855     temp=get_reg(branch_regs[i].regmap,RTEMP);
4856     assert(temp>=0);
4857     assert(regs[i].regmap[temp]==RTEMP);
4858     emit_mov(rs,temp);
4859     rs=temp;
4860   }
4861   address_generation(i+1,i_regs,regs[i].regmap_entry);
4862   #ifdef REG_PREFETCH
4863   if(rt1[i]==31) 
4864   {
4865     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4866       int return_address=start+i*4+8;
4867       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4868     }
4869   }
4870   #endif
4871   #ifdef USE_MINI_HT
4872   if(rs1[i]==31) {
4873     int rh=get_reg(regs[i].regmap,RHASH);
4874     if(rh>=0) do_preload_rhash(rh);
4875   }
4876   #endif
4877   ds_assemble(i+1,i_regs);
4878   uint64_t bc_unneeded=branch_regs[i].u;
4879   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4880   bc_unneeded|=1|(1LL<<rt1[i]);
4881   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4882   bc_unneeded&=~(1LL<<rs1[i]);
4883   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4884                 bc_unneeded,bc_unneeded_upper);
4885   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4886   if(rt1[i]==31) {
4887     int rt,return_address;
4888     assert(rt1[i+1]!=31);
4889     assert(rt2[i+1]!=31);
4890     rt=get_reg(branch_regs[i].regmap,31);
4891     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4892     assert(rt>=0);
4893     return_address=start+i*4+8;
4894     #ifdef REG_PREFETCH
4895     if(temp>=0) 
4896     {
4897       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4898     }
4899     #endif
4900     emit_movimm(return_address,rt); // PC into link register
4901     #ifdef IMM_PREFETCH
4902     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4903     #endif
4904   }
4905   cc=get_reg(branch_regs[i].regmap,CCREG);
4906   assert(cc==HOST_CCREG);
4907   #ifdef USE_MINI_HT
4908   int rh=get_reg(branch_regs[i].regmap,RHASH);
4909   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4910   if(rs1[i]==31) {
4911     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4912     do_preload_rhtbl(ht);
4913     do_rhash(rs,rh);
4914   }
4915   #endif
4916   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4917   #ifdef DESTRUCTIVE_WRITEBACK
4918   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4919     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4920       emit_loadreg(rs1[i],rs);
4921     }
4922   }
4923   #endif
4924   #ifdef REG_PREFETCH
4925   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4926   #endif
4927   #ifdef USE_MINI_HT
4928   if(rs1[i]==31) {
4929     do_miniht_load(ht,rh);
4930   }
4931   #endif
4932   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4933   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4934   //assert(adj==0);
4935   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
4936   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4937   emit_jns(0);
4938   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4939   #ifdef USE_MINI_HT
4940   if(rs1[i]==31) {
4941     do_miniht_jump(rs,rh,ht);
4942   }
4943   else
4944   #endif
4945   {
4946     //if(rs!=EAX) emit_mov(rs,EAX);
4947     //emit_jmp((int)jump_vaddr_eax);
4948     emit_jmp(jump_vaddr_reg[rs]);
4949   }
4950   /* Check hash table
4951   temp=!rs;
4952   emit_mov(rs,temp);
4953   emit_shrimm(rs,16,rs);
4954   emit_xor(temp,rs,rs);
4955   emit_movzwl_reg(rs,rs);
4956   emit_shlimm(rs,4,rs);
4957   emit_cmpmem_indexed((int)hash_table,rs,temp);
4958   emit_jne((int)out+14);
4959   emit_readword_indexed((int)hash_table+4,rs,rs);
4960   emit_jmpreg(rs);
4961   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4962   emit_addimm_no_flags(8,rs);
4963   emit_jeq((int)out-17);
4964   // No hit on hash table, call compiler
4965   emit_pushreg(temp);
4966 //DEBUG >
4967 #ifdef DEBUG_CYCLE_COUNT
4968   emit_readword((int)&last_count,ECX);
4969   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4970   emit_readword((int)&next_interupt,ECX);
4971   emit_writeword(HOST_CCREG,(int)&Count);
4972   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4973   emit_writeword(ECX,(int)&last_count);
4974 #endif
4975 //DEBUG <
4976   emit_storereg(CCREG,HOST_CCREG);
4977   emit_call((int)get_addr);
4978   emit_loadreg(CCREG,HOST_CCREG);
4979   emit_addimm(ESP,4,ESP);
4980   emit_jmpreg(EAX);*/
4981   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4982   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4983   #endif
4984 }
4985
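// Assemble a conditional branch comparing two registers or a register against
// zero (BEQ/BNE/BLEZ/BGTZ and their "likely" forms).  The delay slot is
// normally emitted first ("out of order") so the branch can come last; if the
// delay slot writes one of the compared registers, or the branch is a likely
// one, the condition is tested first and the delay slot is assembled on the
// taken (and, for non-likely branches, the not-taken) path instead.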
4986 void cjump_assemble(int i,struct regstat *i_regs)
4987 {
4988   signed char *i_regmap=i_regs->regmap;
4989   int cc;
4990   int match;
4991   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4992   assem_debug("match=%d\n",match);
4993   int s1h,s1l,s2h,s2l;
4994   int prev_cop1_usable=cop1_usable;
4995   int unconditional=0,nop=0;
4996   int only32=0;
4997   int ooo=1;
4998   int invert=0;
4999   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5000   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5001   if(likely[i]) ooo=0;
5002   if(!match) invert=1;
5003   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5004   if(i>(ba[i]-start)>>2) invert=1;
5005   #endif
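  // If the register mapping at the target doesn't match the current one
  // (match==0), invert the condition: the emitted conditional jump then covers
  // the not-taken case, and the taken case falls through into the
  // writeback/reload sequence below before jumping to the target.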
5006     
5007   if(ooo)
5008     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5009        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5010   {
5011     // Write-after-read dependency prevents out of order execution
5012     // First test branch condition, then execute delay slot, then branch
5013     ooo=0;
5014   }
5015
5016   if(ooo) {
5017     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5018     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5019     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5020     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5021   }
5022   else {
5023     s1l=get_reg(i_regmap,rs1[i]);
5024     s1h=get_reg(i_regmap,rs1[i]|64);
5025     s2l=get_reg(i_regmap,rs2[i]);
5026     s2h=get_reg(i_regmap,rs2[i]|64);
5027   }
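  // Classify the comparison: with both sources r0 the branch is statically
  // unconditional (BEQ/BLEZ) or never taken (BNE/BGTZ); with one source r0
  // only a single register needs testing.  only32 is set when the live
  // sources are known 32-bit, so the high-word compare (s1h/s2h) is skipped.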
5028   if(rs1[i]==0&&rs2[i]==0)
5029   {
5030     if(opcode[i]&1) nop=1;
5031     else unconditional=1;
5032     //assert(opcode[i]!=5);
5033     //assert(opcode[i]!=7);
5034     //assert(opcode[i]!=0x15);
5035     //assert(opcode[i]!=0x17);
5036   }
5037   else if(rs1[i]==0)
5038   {
5039     s1l=s2l;s1h=s2h;
5040     s2l=s2h=-1;
5041     only32=(regs[i].was32>>rs2[i])&1;
5042   }
5043   else if(rs2[i]==0)
5044   {
5045     s2l=s2h=-1;
5046     only32=(regs[i].was32>>rs1[i])&1;
5047   }
5048   else {
5049     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5050   }
5051
5052   if(ooo) {
5053     // Out of order execution (delay slot first)
5054     //printf("OOOE\n");
5055     address_generation(i+1,i_regs,regs[i].regmap_entry);
5056     ds_assemble(i+1,i_regs);
5057     int adj;
5058     uint64_t bc_unneeded=branch_regs[i].u;
5059     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5060     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5061     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5062     bc_unneeded|=1;
5063     bc_unneeded_upper|=1;
5064     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5065                   bc_unneeded,bc_unneeded_upper);
5066     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5067     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5068     cc=get_reg(branch_regs[i].regmap,CCREG);
5069     assert(cc==HOST_CCREG);
5070     if(unconditional) 
5071       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5072     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5073     //assem_debug("cycle count (adj)\n");
5074     if(unconditional) {
5075       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5076       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5077         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5078         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5079         if(internal)
5080           assem_debug("branch: internal\n");
5081         else
5082           assem_debug("branch: external\n");
5083         if(internal&&is_ds[(ba[i]-start)>>2]) {
5084           ds_assemble_entry(i);
5085         }
5086         else {
5087           add_to_linker((int)out,ba[i],internal);
5088           emit_jmp(0);
5089         }
5090         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5091         if(((u_int)out)&7) emit_addnop(0);
5092         #endif
5093       }
5094     }
5095     else if(nop) {
5096       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5097       int jaddr=(int)out;
5098       emit_jns(0);
5099       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5100     }
5101     else {
5102       int taken=0,nottaken=0,nottaken1=0;
5103       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5104       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5105       if(!only32)
5106       {
5107         assert(s1h>=0);
5108         if(opcode[i]==4) // BEQ
5109         {
5110           if(s2h>=0) emit_cmp(s1h,s2h);
5111           else emit_test(s1h,s1h);
5112           nottaken1=(int)out;
5113           emit_jne(1);
5114         }
5115         if(opcode[i]==5) // BNE
5116         {
5117           if(s2h>=0) emit_cmp(s1h,s2h);
5118           else emit_test(s1h,s1h);
5119           if(invert) taken=(int)out;
5120           else add_to_linker((int)out,ba[i],internal);
5121           emit_jne(0);
5122         }
5123         if(opcode[i]==6) // BLEZ
5124         {
5125           emit_test(s1h,s1h);
5126           if(invert) taken=(int)out;
5127           else add_to_linker((int)out,ba[i],internal);
5128           emit_js(0);
5129           nottaken1=(int)out;
5130           emit_jne(1);
5131         }
5132         if(opcode[i]==7) // BGTZ
5133         {
5134           emit_test(s1h,s1h);
5135           nottaken1=(int)out;
5136           emit_js(1);
5137           if(invert) taken=(int)out;
5138           else add_to_linker((int)out,ba[i],internal);
5139           emit_jne(0);
5140         }
5141       } // if(!only32)
5142           
5143       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5144       assert(s1l>=0);
5145       if(opcode[i]==4) // BEQ
5146       {
5147         if(s2l>=0) emit_cmp(s1l,s2l);
5148         else emit_test(s1l,s1l);
5149         if(invert){
5150           nottaken=(int)out;
5151           emit_jne(1);
5152         }else{
5153           add_to_linker((int)out,ba[i],internal);
5154           emit_jeq(0);
5155         }
5156       }
5157       if(opcode[i]==5) // BNE
5158       {
5159         if(s2l>=0) emit_cmp(s1l,s2l);
5160         else emit_test(s1l,s1l);
5161         if(invert){
5162           nottaken=(int)out;
5163           emit_jeq(1);
5164         }else{
5165           add_to_linker((int)out,ba[i],internal);
5166           emit_jne(0);
5167         }
5168       }
5169       if(opcode[i]==6) // BLEZ
5170       {
5171         emit_cmpimm(s1l,1);
5172         if(invert){
5173           nottaken=(int)out;
5174           emit_jge(1);
5175         }else{
5176           add_to_linker((int)out,ba[i],internal);
5177           emit_jl(0);
5178         }
5179       }
5180       if(opcode[i]==7) // BGTZ
5181       {
5182         emit_cmpimm(s1l,1);
5183         if(invert){
5184           nottaken=(int)out;
5185           emit_jl(1);
5186         }else{
5187           add_to_linker((int)out,ba[i],internal);
5188           emit_jge(0);
5189         }
5190       }
5191       if(invert) {
5192         if(taken) set_jump_target(taken,(int)out);
5193         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5194         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5195           if(adj) {
5196             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5197             add_to_linker((int)out,ba[i],internal);
5198           }else{
5199             emit_addnop(13);
5200             add_to_linker((int)out,ba[i],internal*2);
5201           }
5202           emit_jmp(0);
5203         }else
5204         #endif
5205         {
5206           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5207           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5208           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5209           if(internal)
5210             assem_debug("branch: internal\n");
5211           else
5212             assem_debug("branch: external\n");
5213           if(internal&&is_ds[(ba[i]-start)>>2]) {
5214             ds_assemble_entry(i);
5215           }
5216           else {
5217             add_to_linker((int)out,ba[i],internal);
5218             emit_jmp(0);
5219           }
5220         }
5221         set_jump_target(nottaken,(int)out);
5222       }
5223
5224       if(nottaken1) set_jump_target(nottaken1,(int)out);
5225       if(adj) {
5226         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5227       }
5228     } // (!unconditional)
5229   } // if(ooo)
5230   else
5231   {
5232     // In-order execution (branch first)
5233     //if(likely[i]) printf("IOL\n");
5234     //else
5235     //printf("IOE\n");
5236     int taken=0,nottaken=0,nottaken1=0;
5237     if(!unconditional&&!nop) {
5238       if(!only32)
5239       {
5240         assert(s1h>=0);
5241         if((opcode[i]&0x2f)==4) // BEQ
5242         {
5243           if(s2h>=0) emit_cmp(s1h,s2h);
5244           else emit_test(s1h,s1h);
5245           nottaken1=(int)out;
5246           emit_jne(2);
5247         }
5248         if((opcode[i]&0x2f)==5) // BNE
5249         {
5250           if(s2h>=0) emit_cmp(s1h,s2h);
5251           else emit_test(s1h,s1h);
5252           taken=(int)out;
5253           emit_jne(1);
5254         }
5255         if((opcode[i]&0x2f)==6) // BLEZ
5256         {
5257           emit_test(s1h,s1h);
5258           taken=(int)out;
5259           emit_js(1);
5260           nottaken1=(int)out;
5261           emit_jne(2);
5262         }
5263         if((opcode[i]&0x2f)==7) // BGTZ
5264         {
5265           emit_test(s1h,s1h);
5266           nottaken1=(int)out;
5267           emit_js(2);
5268           taken=(int)out;
5269           emit_jne(1);
5270         }
5271       } // if(!only32)
5272           
5273       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5274       assert(s1l>=0);
5275       if((opcode[i]&0x2f)==4) // BEQ
5276       {
5277         if(s2l>=0) emit_cmp(s1l,s2l);
5278         else emit_test(s1l,s1l);
5279         nottaken=(int)out;
5280         emit_jne(2);
5281       }
5282       if((opcode[i]&0x2f)==5) // BNE
5283       {
5284         if(s2l>=0) emit_cmp(s1l,s2l);
5285         else emit_test(s1l,s1l);
5286         nottaken=(int)out;
5287         emit_jeq(2);
5288       }
5289       if((opcode[i]&0x2f)==6) // BLEZ
5290       {
5291         emit_cmpimm(s1l,1);
5292         nottaken=(int)out;
5293         emit_jge(2);
5294       }
5295       if((opcode[i]&0x2f)==7) // BGTZ
5296       {
5297         emit_cmpimm(s1l,1);
5298         nottaken=(int)out;
5299         emit_jl(2);
5300       }
5301     } // if(!unconditional)
5302     int adj;
5303     uint64_t ds_unneeded=branch_regs[i].u;
5304     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5305     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5306     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5307     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5308     ds_unneeded|=1;
5309     ds_unneeded_upper|=1;
5310     // branch taken
5311     if(!nop) {
5312       if(taken) set_jump_target(taken,(int)out);
5313       assem_debug("1:\n");
5314       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5315                     ds_unneeded,ds_unneeded_upper);
5316       // load regs
5317       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5318       address_generation(i+1,&branch_regs[i],0);
5319       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5320       ds_assemble(i+1,&branch_regs[i]);
5321       cc=get_reg(branch_regs[i].regmap,CCREG);
5322       if(cc==-1) {
5323         emit_loadreg(CCREG,cc=HOST_CCREG);
5324         // CHECK: Is the following instruction (fall thru) allocated ok?
5325       }
5326       assert(cc==HOST_CCREG);
5327       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5328       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5329       assem_debug("cycle count (adj)\n");
5330       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5331       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5332       if(internal)
5333         assem_debug("branch: internal\n");
5334       else
5335         assem_debug("branch: external\n");
5336       if(internal&&is_ds[(ba[i]-start)>>2]) {
5337         ds_assemble_entry(i);
5338       }
5339       else {
5340         add_to_linker((int)out,ba[i],internal);
5341         emit_jmp(0);
5342       }
5343     }
5344     // branch not taken
5345     cop1_usable=prev_cop1_usable;
5346     if(!unconditional) {
5347       if(nottaken1) set_jump_target(nottaken1,(int)out);
5348       set_jump_target(nottaken,(int)out);
5349       assem_debug("2:\n");
5350       if(!likely[i]) {
5351         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5352                       ds_unneeded,ds_unneeded_upper);
5353         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5354         address_generation(i+1,&branch_regs[i],0);
5355         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5356         ds_assemble(i+1,&branch_regs[i]);
5357       }
5358       cc=get_reg(branch_regs[i].regmap,CCREG);
5359       if(cc==-1&&!likely[i]) {
5360         // Cycle count isn't in a register, temporarily load it then write it out
5361         emit_loadreg(CCREG,HOST_CCREG);
5362         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5363         int jaddr=(int)out;
5364         emit_jns(0);
5365         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5366         emit_storereg(CCREG,HOST_CCREG);
5367       }
5368       else{
5369         cc=get_reg(i_regmap,CCREG);
5370         assert(cc==HOST_CCREG);
5371         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5372         int jaddr=(int)out;
5373         emit_jns(0);
5374         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5375       }
5376     }
5377   }
5378 }
5379
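// Assemble a REGIMM branch (BLTZ/BGEZ and their likely/link variants), which
// tests the sign of a single register.  The structure mirrors cjump_assemble:
// delay slot first when possible, otherwise condition first.  The link forms
// also write the return address to r31; only their rs1==r0 case is handled
// here (see the FIXME/assert below).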
5380 void sjump_assemble(int i,struct regstat *i_regs)
5381 {
5382   signed char *i_regmap=i_regs->regmap;
5383   int cc;
5384   int match;
5385   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5386   assem_debug("smatch=%d\n",match);
5387   int s1h,s1l;
5388   int prev_cop1_usable=cop1_usable;
5389   int unconditional=0,nevertaken=0;
5390   int only32=0;
5391   int ooo=1;
5392   int invert=0;
5393   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5394   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5395   if(likely[i]) ooo=0;
5396   if(!match) invert=1;
5397   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5398   if(i>(ba[i]-start)>>2) invert=1;
5399   #endif
5400
5401   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5402   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5403
5404   if(ooo)
5405     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5406   {
5407     // Write-after-read dependency prevents out of order execution
5408     // First test branch condition, then execute delay slot, then branch
5409     ooo=0;
5410   }
5411   // TODO: Conditional branches w/link must execute in-order so that
5412   // condition test and write to r31 occur before cycle count test
5413
5414   if(ooo) {
5415     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5416     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5417   }
5418   else {
5419     s1l=get_reg(i_regmap,rs1[i]);
5420     s1h=get_reg(i_regmap,rs1[i]|64);
5421   }
5422   if(rs1[i]==0)
5423   {
5424     if(opcode2[i]&1) unconditional=1;
5425     else nevertaken=1;
5426     // The BLTZ/BLTZL forms are never taken (r0 is never less than zero);
5427     //assert(opcode2[i]!=0);
5428     //assert(opcode2[i]!=2);
5429     //assert(opcode2[i]!=0x10);
5430     //assert(opcode2[i]!=0x12);
5431   }
5432   else {
5433     only32=(regs[i].was32>>rs1[i])&1;
5434   }
5435
5436   if(ooo) {
5437     // Out of order execution (delay slot first)
5438     //printf("OOOE\n");
5439     address_generation(i+1,i_regs,regs[i].regmap_entry);
5440     ds_assemble(i+1,i_regs);
5441     int adj;
5442     uint64_t bc_unneeded=branch_regs[i].u;
5443     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5444     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5445     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5446     bc_unneeded|=1;
5447     bc_unneeded_upper|=1;
5448     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5449                   bc_unneeded,bc_unneeded_upper);
5450     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5451     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5452     if(rt1[i]==31) {
5453       int rt,return_address;
5454       assert(rt1[i+1]!=31);
5455       assert(rt2[i+1]!=31);
5456       rt=get_reg(branch_regs[i].regmap,31);
5457       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5458       if(rt>=0) {
5459         // Save the PC even if the branch is not taken
5460         return_address=start+i*4+8;
5461         emit_movimm(return_address,rt); // PC into link register
5462         #ifdef IMM_PREFETCH
5463         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5464         #endif
5465       }
5466     }
5467     cc=get_reg(branch_regs[i].regmap,CCREG);
5468     assert(cc==HOST_CCREG);
5469     if(unconditional) 
5470       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5471     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5472     assem_debug("cycle count (adj)\n");
5473     if(unconditional) {
5474       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5475       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5476         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5477         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5478         if(internal)
5479           assem_debug("branch: internal\n");
5480         else
5481           assem_debug("branch: external\n");
5482         if(internal&&is_ds[(ba[i]-start)>>2]) {
5483           ds_assemble_entry(i);
5484         }
5485         else {
5486           add_to_linker((int)out,ba[i],internal);
5487           emit_jmp(0);
5488         }
5489         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5490         if(((u_int)out)&7) emit_addnop(0);
5491         #endif
5492       }
5493     }
5494     else if(nevertaken) {
5495       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5496       int jaddr=(int)out;
5497       emit_jns(0);
5498       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5499     }
5500     else {
5501       int nottaken=0;
5502       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5503       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5504       if(!only32)
5505       {
5506         assert(s1h>=0);
5507         if(opcode2[i]==0) // BLTZ
5508         {
5509           emit_test(s1h,s1h);
5510           if(invert){
5511             nottaken=(int)out;
5512             emit_jns(1);
5513           }else{
5514             add_to_linker((int)out,ba[i],internal);
5515             emit_js(0);
5516           }
5517         }
5518         if(opcode2[i]==1) // BGEZ
5519         {
5520           emit_test(s1h,s1h);
5521           if(invert){
5522             nottaken=(int)out;
5523             emit_js(1);
5524           }else{
5525             add_to_linker((int)out,ba[i],internal);
5526             emit_jns(0);
5527           }
5528         }
5529       } // if(!only32)
5530       else
5531       {
5532         assert(s1l>=0);
5533         if(opcode2[i]==0) // BLTZ
5534         {
5535           emit_test(s1l,s1l);
5536           if(invert){
5537             nottaken=(int)out;
5538             emit_jns(1);
5539           }else{
5540             add_to_linker((int)out,ba[i],internal);
5541             emit_js(0);
5542           }
5543         }
5544         if(opcode2[i]==1) // BGEZ
5545         {
5546           emit_test(s1l,s1l);
5547           if(invert){
5548             nottaken=(int)out;
5549             emit_js(1);
5550           }else{
5551             add_to_linker((int)out,ba[i],internal);
5552             emit_jns(0);
5553           }
5554         }
5555       } // if(!only32)
5556           
5557       if(invert) {
5558         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5559         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5560           if(adj) {
5561             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5562             add_to_linker((int)out,ba[i],internal);
5563           }else{
5564             emit_addnop(13);
5565             add_to_linker((int)out,ba[i],internal*2);
5566           }
5567           emit_jmp(0);
5568         }else
5569         #endif
5570         {
5571           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5572           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5573           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5574           if(internal)
5575             assem_debug("branch: internal\n");
5576           else
5577             assem_debug("branch: external\n");
5578           if(internal&&is_ds[(ba[i]-start)>>2]) {
5579             ds_assemble_entry(i);
5580           }
5581           else {
5582             add_to_linker((int)out,ba[i],internal);
5583             emit_jmp(0);
5584           }
5585         }
5586         set_jump_target(nottaken,(int)out);
5587       }
5588
5589       if(adj) {
5590         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5591       }
5592     } // (!unconditional)
5593   } // if(ooo)
5594   else
5595   {
5596     // In-order execution (branch first)
5597     //printf("IOE\n");
5598     int nottaken=0;
5599     if(!unconditional) {
5600       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5601       if(!only32)
5602       {
5603         assert(s1h>=0);
5604         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5605         {
5606           emit_test(s1h,s1h);
5607           nottaken=(int)out;
5608           emit_jns(1);
5609         }
5610         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5611         {
5612           emit_test(s1h,s1h);
5613           nottaken=(int)out;
5614           emit_js(1);
5615         }
5616       } // if(!only32)
5617       else
5618       {
5619         assert(s1l>=0);
5620         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5621         {
5622           emit_test(s1l,s1l);
5623           nottaken=(int)out;
5624           emit_jns(1);
5625         }
5626         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5627         {
5628           emit_test(s1l,s1l);
5629           nottaken=(int)out;
5630           emit_js(1);
5631         }
5632       }
5633     } // if(!unconditional)
5634     int adj;
5635     uint64_t ds_unneeded=branch_regs[i].u;
5636     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5637     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5638     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5639     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5640     ds_unneeded|=1;
5641     ds_unneeded_upper|=1;
5642     // branch taken
5643     if(!nevertaken) {
5644       //assem_debug("1:\n");
5645       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5646                     ds_unneeded,ds_unneeded_upper);
5647       // load regs
5648       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5649       address_generation(i+1,&branch_regs[i],0);
5650       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5651       ds_assemble(i+1,&branch_regs[i]);
5652       cc=get_reg(branch_regs[i].regmap,CCREG);
5653       if(cc==-1) {
5654         emit_loadreg(CCREG,cc=HOST_CCREG);
5655         // CHECK: Is the following instruction (fall thru) allocated ok?
5656       }
5657       assert(cc==HOST_CCREG);
5658       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5659       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5660       assem_debug("cycle count (adj)\n");
5661       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5662       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5663       if(internal)
5664         assem_debug("branch: internal\n");
5665       else
5666         assem_debug("branch: external\n");
5667       if(internal&&is_ds[(ba[i]-start)>>2]) {
5668         ds_assemble_entry(i);
5669       }
5670       else {
5671         add_to_linker((int)out,ba[i],internal);
5672         emit_jmp(0);
5673       }
5674     }
5675     // branch not taken
5676     cop1_usable=prev_cop1_usable;
5677     if(!unconditional) {
5678       set_jump_target(nottaken,(int)out);
5679       assem_debug("1:\n");
5680       if(!likely[i]) {
5681         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5682                       ds_unneeded,ds_unneeded_upper);
5683         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5684         address_generation(i+1,&branch_regs[i],0);
5685         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5686         ds_assemble(i+1,&branch_regs[i]);
5687       }
5688       cc=get_reg(branch_regs[i].regmap,CCREG);
5689       if(cc==-1&&!likely[i]) {
5690         // Cycle count isn't in a register, temporarily load it then write it out
5691         emit_loadreg(CCREG,HOST_CCREG);
5692         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5693         int jaddr=(int)out;
5694         emit_jns(0);
5695         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5696         emit_storereg(CCREG,HOST_CCREG);
5697       }
5698       else{
5699         cc=get_reg(i_regmap,CCREG);
5700         assert(cc==HOST_CCREG);
5701         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5702         int jaddr=(int)out;
5703         emit_jns(0);
5704         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5705       }
5706     }
5707   }
5708 }
5709
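// Assemble a COP1 branch (BC1T/BC1F and the likely forms).  The FP condition
// flag (bit 23 of the FPU status word held in FSREG) is tested directly; if
// the coprocessor hasn't been marked usable yet, a test of the CU1 bit in
// CSREG is emitted first, with an FP_STUB taking the unusable case.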
5710 void fjump_assemble(int i,struct regstat *i_regs)
5711 {
5712   signed char *i_regmap=i_regs->regmap;
5713   int cc;
5714   int match;
5715   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5716   assem_debug("fmatch=%d\n",match);
5717   int fs,cs;
5718   int eaddr;
5719   int ooo=1;
5720   int invert=0;
5721   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5722   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5723   if(likely[i]) ooo=0;
5724   if(!match) invert=1;
5725   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5726   if(i>(ba[i]-start)>>2) invert=1;
5727   #endif
5728
5729   if(ooo)
5730     if(itype[i+1]==FCOMP)
5731   {
5732     // Write-after-read dependency prevents out of order execution
5733     // First test branch condition, then execute delay slot, then branch
5734     ooo=0;
5735   }
5736
5737   if(ooo) {
5738     fs=get_reg(branch_regs[i].regmap,FSREG);
5739     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5740   }
5741   else {
5742     fs=get_reg(i_regmap,FSREG);
5743   }
5744
5745   // Check cop1 unusable
5746   if(!cop1_usable) {
5747     cs=get_reg(i_regmap,CSREG);
5748     assert(cs>=0);
5749     emit_testimm(cs,0x20000000);
5750     eaddr=(int)out;
5751     emit_jeq(0);
5752     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5753     cop1_usable=1;
5754   }
5755
5756   if(ooo) {
5757     // Out of order execution (delay slot first)
5758     //printf("OOOE\n");
5759     ds_assemble(i+1,i_regs);
5760     int adj;
5761     uint64_t bc_unneeded=branch_regs[i].u;
5762     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5763     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5764     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5765     bc_unneeded|=1;
5766     bc_unneeded_upper|=1;
5767     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5768                   bc_unneeded,bc_unneeded_upper);
5769     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5770     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5771     cc=get_reg(branch_regs[i].regmap,CCREG);
5772     assert(cc==HOST_CCREG);
5773     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5774     assem_debug("cycle count (adj)\n");
5775     if(1) {
5776       int nottaken=0;
5777       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5778       if(1) {
5779         assert(fs>=0);
5780         emit_testimm(fs,0x800000);
5781         if(source[i]&0x10000) // BC1T
5782         {
5783           if(invert){
5784             nottaken=(int)out;
5785             emit_jeq(1);
5786           }else{
5787             add_to_linker((int)out,ba[i],internal);
5788             emit_jne(0);
5789           }
5790         }
5791         else // BC1F
5792         {
5793           if(invert){
5794             nottaken=(int)out;
5795             emit_jne(1);
5796           }else{
5797             add_to_linker((int)out,ba[i],internal);
5798             emit_jeq(0);
5799           }
5800         }
5801       } // if(!only32)
5802           
5803       if(invert) {
5804         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5805         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5806         else if(match) emit_addnop(13);
5807         #endif
5808         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5809         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5810         if(internal)
5811           assem_debug("branch: internal\n");
5812         else
5813           assem_debug("branch: external\n");
5814         if(internal&&is_ds[(ba[i]-start)>>2]) {
5815           ds_assemble_entry(i);
5816         }
5817         else {
5818           add_to_linker((int)out,ba[i],internal);
5819           emit_jmp(0);
5820         }
5821         set_jump_target(nottaken,(int)out);
5822       }
5823
5824       if(adj) {
5825         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5826       }
5827     } // (!unconditional)
5828   } // if(ooo)
5829   else
5830   {
5831     // In-order execution (branch first)
5832     //printf("IOE\n");
5833     int nottaken=0;
5834     if(1) {
5835       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5836       if(1) {
5837         assert(fs>=0);
5838         emit_testimm(fs,0x800000);
5839         if(source[i]&0x10000) // BC1T
5840         {
5841           nottaken=(int)out;
5842           emit_jeq(1);
5843         }
5844         else // BC1F
5845         {
5846           nottaken=(int)out;
5847           emit_jne(1);
5848         }
5849       }
5850     } // if(!unconditional)
5851     int adj;
5852     uint64_t ds_unneeded=branch_regs[i].u;
5853     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5854     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5855     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5856     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5857     ds_unneeded|=1;
5858     ds_unneeded_upper|=1;
5859     // branch taken
5860     //assem_debug("1:\n");
5861     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5862                   ds_unneeded,ds_unneeded_upper);
5863     // load regs
5864     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5865     address_generation(i+1,&branch_regs[i],0);
5866     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5867     ds_assemble(i+1,&branch_regs[i]);
5868     cc=get_reg(branch_regs[i].regmap,CCREG);
5869     if(cc==-1) {
5870       emit_loadreg(CCREG,cc=HOST_CCREG);
5871       // CHECK: Is the following instruction (fall thru) allocated ok?
5872     }
5873     assert(cc==HOST_CCREG);
5874     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5875     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5876     assem_debug("cycle count (adj)\n");
5877     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5878     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5879     if(internal)
5880       assem_debug("branch: internal\n");
5881     else
5882       assem_debug("branch: external\n");
5883     if(internal&&is_ds[(ba[i]-start)>>2]) {
5884       ds_assemble_entry(i);
5885     }
5886     else {
5887       add_to_linker((int)out,ba[i],internal);
5888       emit_jmp(0);
5889     }
5890
5891     // branch not taken
5892     if(1) { // <- FIXME (don't need this)
5893       set_jump_target(nottaken,(int)out);
5894       assem_debug("1:\n");
5895       if(!likely[i]) {
5896         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5897                       ds_unneeded,ds_unneeded_upper);
5898         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5899         address_generation(i+1,&branch_regs[i],0);
5900         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5901         ds_assemble(i+1,&branch_regs[i]);
5902       }
5903       cc=get_reg(branch_regs[i].regmap,CCREG);
5904       if(cc==-1&&!likely[i]) {
5905         // Cycle count isn't in a register, temporarily load it then write it out
5906         emit_loadreg(CCREG,HOST_CCREG);
5907         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5908         int jaddr=(int)out;
5909         emit_jns(0);
5910         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5911         emit_storereg(CCREG,HOST_CCREG);
5912       }
5913       else{
5914         cc=get_reg(i_regmap,CCREG);
5915         assert(cc==HOST_CCREG);
5916         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5917         int jaddr=(int)out;
5918         emit_jns(0);
5919         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5920       }
5921     }
5922   }
5923 }
5924
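// Assemble a branch whose delay slot lies on the next virtual page.  Rather
// than emitting a normal conditional jump, the taken/not-taken target address
// is selected into a register with conditional moves and saved in HOST_BTREG,
// then control transfers to the delay-slot entry of the following page
// (assembled by pagespan_ds below), which runs the delay slot and dispatches
// on the saved branch target.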
5925 static void pagespan_assemble(int i,struct regstat *i_regs)
5926 {
5927   int s1l=get_reg(i_regs->regmap,rs1[i]);
5928   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5929   int s2l=get_reg(i_regs->regmap,rs2[i]);
5930   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5931   void *nt_branch=NULL;
5932   int taken=0;
5933   int nottaken=0;
5934   int unconditional=0;
5935   if(rs1[i]==0)
5936   {
5937     s1l=s2l;s1h=s2h;
5938     s2l=s2h=-1;
5939   }
5940   else if(rs2[i]==0)
5941   {
5942     s2l=s2h=-1;
5943   }
5944   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5945     s1h=s2h=-1;
5946   }
5947   int hr=0;
5948   int addr,alt,ntaddr;
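  // Scavenge host registers that don't hold the branch sources, CCREG or
  // BTREG: addr receives the selected target, alt the alternative
  // (fall-through) address, and ntaddr is an extra temporary for BLEZ/BGTZ.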
5949   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5950   else {
5951     while(hr<HOST_REGS)
5952     {
5953       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5954          (i_regs->regmap[hr]&63)!=rs1[i] &&
5955          (i_regs->regmap[hr]&63)!=rs2[i] )
5956       {
5957         addr=hr++;break;
5958       }
5959       hr++;
5960     }
5961   }
5962   while(hr<HOST_REGS)
5963   {
5964     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5965        (i_regs->regmap[hr]&63)!=rs1[i] &&
5966        (i_regs->regmap[hr]&63)!=rs2[i] )
5967     {
5968       alt=hr++;break;
5969     }
5970     hr++;
5971   }
5972   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5973   {
5974     while(hr<HOST_REGS)
5975     {
5976       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5977          (i_regs->regmap[hr]&63)!=rs1[i] &&
5978          (i_regs->regmap[hr]&63)!=rs2[i] )
5979       {
5980         ntaddr=hr;break;
5981       }
5982       hr++;
5983     }
5984   }
5985   assert(hr<HOST_REGS);
5986   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5987     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5988   }
5989   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5990   if(opcode[i]==2) // J
5991   {
5992     unconditional=1;
5993   }
5994   if(opcode[i]==3) // JAL
5995   {
5996     // TODO: mini_ht
5997     int rt=get_reg(i_regs->regmap,31);
5998     emit_movimm(start+i*4+8,rt);
5999     unconditional=1;
6000   }
6001   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6002   {
6003     emit_mov(s1l,addr);
6004     if(opcode2[i]==9) // JALR
6005     {
6006       int rt=get_reg(i_regs->regmap,31);
6007       emit_movimm(start+i*4+8,rt);
6008     }
6009   }
6010   if((opcode[i]&0x3f)==4) // BEQ
6011   {
6012     if(rs1[i]==rs2[i])
6013     {
6014       unconditional=1;
6015     }
6016     else
6017     #ifdef HAVE_CMOV_IMM
6018     if(s1h<0) {
6019       if(s2l>=0) emit_cmp(s1l,s2l);
6020       else emit_test(s1l,s1l);
6021       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6022     }
6023     else
6024     #endif
6025     {
6026       assert(s1l>=0);
6027       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6028       if(s1h>=0) {
6029         if(s2h>=0) emit_cmp(s1h,s2h);
6030         else emit_test(s1h,s1h);
6031         emit_cmovne_reg(alt,addr);
6032       }
6033       if(s2l>=0) emit_cmp(s1l,s2l);
6034       else emit_test(s1l,s1l);
6035       emit_cmovne_reg(alt,addr);
6036     }
6037   }
6038   if((opcode[i]&0x3f)==5) // BNE
6039   {
6040     #ifdef HAVE_CMOV_IMM
6041     if(s1h<0) {
6042       if(s2l>=0) emit_cmp(s1l,s2l);
6043       else emit_test(s1l,s1l);
6044       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6045     }
6046     else
6047     #endif
6048     {
6049       assert(s1l>=0);
6050       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6051       if(s1h>=0) {
6052         if(s2h>=0) emit_cmp(s1h,s2h);
6053         else emit_test(s1h,s1h);
6054         emit_cmovne_reg(alt,addr);
6055       }
6056       if(s2l>=0) emit_cmp(s1l,s2l);
6057       else emit_test(s1l,s1l);
6058       emit_cmovne_reg(alt,addr);
6059     }
6060   }
6061   if((opcode[i]&0x3f)==0x14) // BEQL
6062   {
6063     if(s1h>=0) {
6064       if(s2h>=0) emit_cmp(s1h,s2h);
6065       else emit_test(s1h,s1h);
6066       nottaken=(int)out;
6067       emit_jne(0);
6068     }
6069     if(s2l>=0) emit_cmp(s1l,s2l);
6070     else emit_test(s1l,s1l);
6071     if(nottaken) set_jump_target(nottaken,(int)out);
6072     nottaken=(int)out;
6073     emit_jne(0);
6074   }
6075   if((opcode[i]&0x3f)==0x15) // BNEL
6076   {
6077     if(s1h>=0) {
6078       if(s2h>=0) emit_cmp(s1h,s2h);
6079       else emit_test(s1h,s1h);
6080       taken=(int)out;
6081       emit_jne(0);
6082     }
6083     if(s2l>=0) emit_cmp(s1l,s2l);
6084     else emit_test(s1l,s1l);
6085     nottaken=(int)out;
6086     emit_jeq(0);
6087     if(taken) set_jump_target(taken,(int)out);
6088   }
6089   if((opcode[i]&0x3f)==6) // BLEZ
6090   {
6091     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6092     emit_cmpimm(s1l,1);
6093     if(s1h>=0) emit_mov(addr,ntaddr);
6094     emit_cmovl_reg(alt,addr);
6095     if(s1h>=0) {
6096       emit_test(s1h,s1h);
6097       emit_cmovne_reg(ntaddr,addr);
6098       emit_cmovs_reg(alt,addr);
6099     }
6100   }
6101   if((opcode[i]&0x3f)==7) // BGTZ
6102   {
6103     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6104     emit_cmpimm(s1l,1);
6105     if(s1h>=0) emit_mov(addr,alt);
6106     emit_cmovl_reg(ntaddr,addr);
6107     if(s1h>=0) {
6108       emit_test(s1h,s1h);
6109       emit_cmovne_reg(alt,addr);
6110       emit_cmovs_reg(ntaddr,addr);
6111     }
6112   }
6113   if((opcode[i]&0x3f)==0x16) // BLEZL
6114   {
6115     assert((opcode[i]&0x3f)!=0x16);
6116   }
6117   if((opcode[i]&0x3f)==0x17) // BGTZL
6118   {
6119     assert((opcode[i]&0x3f)!=0x17);
6120   }
6121   assert(opcode[i]!=1); // BLTZ/BGEZ
6122
6123   //FIXME: Check CSREG
6124   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6125     if((source[i]&0x30000)==0) // BC1F
6126     {
6127       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6128       emit_testimm(s1l,0x800000);
6129       emit_cmovne_reg(alt,addr);
6130     }
6131     if((source[i]&0x30000)==0x10000) // BC1T
6132     {
6133       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6134       emit_testimm(s1l,0x800000);
6135       emit_cmovne_reg(alt,addr);
6136     }
6137     if((source[i]&0x30000)==0x20000) // BC1FL
6138     {
6139       emit_testimm(s1l,0x800000);
6140       nottaken=(int)out;
6141       emit_jne(0);
6142     }
6143     if((source[i]&0x30000)==0x30000) // BC1TL
6144     {
6145       emit_testimm(s1l,0x800000);
6146       nottaken=(int)out;
6147       emit_jeq(0);
6148     }
6149   }
6150
6151   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6152   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6153   if(likely[i]||unconditional)
6154   {
6155     emit_movimm(ba[i],HOST_BTREG);
6156   }
6157   else if(addr!=HOST_BTREG)
6158   {
6159     emit_mov(addr,HOST_BTREG);
6160   }
6161   void *branch_addr=out;
6162   emit_jmp(0);
6163   int target_addr=start+i*4+5;
6164   void *stub=out;
6165   void *compiled_target_addr=check_addr(target_addr);
6166   emit_extjump_ds((int)branch_addr,target_addr);
6167   if(compiled_target_addr) {
6168     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6169     add_link(target_addr,stub);
6170   }
6171   else set_jump_target((int)branch_addr,(int)stub);
6172   if(likely[i]) {
6173     // Not-taken path
6174     set_jump_target((int)nottaken,(int)out);
6175     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6176     void *branch_addr=out;
6177     emit_jmp(0);
6178     int target_addr=start+i*4+8;
6179     void *stub=out;
6180     void *compiled_target_addr=check_addr(target_addr);
6181     emit_extjump_ds((int)branch_addr,target_addr);
6182     if(compiled_target_addr) {
6183       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6184       add_link(target_addr,stub);
6185     }
6186     else set_jump_target((int)branch_addr,(int)stub);
6187   }
6188 }
6189
6190 // Assemble the delay slot for the above
6191 static void pagespan_ds()
6192 {
6193   assem_debug("initial delay slot:\n");
6194   u_int vaddr=start+1;
6195   u_int page=(0x80000000^vaddr)>>12;
6196   u_int vpage=page;
6197   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[page^0x80000]^0x80000000)>>12;
6198   if(page>2048) page=2048+(page&2047);
6199   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
6200   if(vpage>2048) vpage=2048+(vpage&2047);
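  // Map the delay-slot address into the jump_in/jump_dirty tables: page
  // indexes outside the directly mapped (kseg0/kseg1) range appear to go
  // through the TLB read LUT, and anything past the first 2048 pages is
  // folded into a shared overflow area of the tables.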
6201   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6202   do_dirty_stub_ds();
6203   ll_add(jump_in+page,vaddr,(void *)out);
6204   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6205   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6206     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6207   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6208     emit_writeword(HOST_BTREG,(int)&branch_target);
6209   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6210   address_generation(0,&regs[0],regs[0].regmap_entry);
6211   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6212     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6213   cop1_usable=0;
6214   is_delayslot=0;
6215   switch(itype[0]) {
6216     case ALU:
6217       alu_assemble(0,&regs[0]);break;
6218     case IMM16:
6219       imm16_assemble(0,&regs[0]);break;
6220     case SHIFT:
6221       shift_assemble(0,&regs[0]);break;
6222     case SHIFTIMM:
6223       shiftimm_assemble(0,&regs[0]);break;
6224     case LOAD:
6225       load_assemble(0,&regs[0]);break;
6226     case LOADLR:
6227       loadlr_assemble(0,&regs[0]);break;
6228     case STORE:
6229       store_assemble(0,&regs[0]);break;
6230     case STORELR:
6231       storelr_assemble(0,&regs[0]);break;
6232     case COP0:
6233       cop0_assemble(0,&regs[0]);break;
6234     case COP1:
6235       cop1_assemble(0,&regs[0]);break;
6236     case C1LS:
6237       c1ls_assemble(0,&regs[0]);break;
6238     case FCONV:
6239       fconv_assemble(0,&regs[0]);break;
6240     case FLOAT:
6241       float_assemble(0,&regs[0]);break;
6242     case FCOMP:
6243       fcomp_assemble(0,&regs[0]);break;
6244     case MULTDIV:
6245       multdiv_assemble(0,&regs[0]);break;
6246     case MOV:
6247       mov_assemble(0,&regs[0]);break;
6248     case SYSCALL:
6249     case SPAN:
6250     case UJUMP:
6251     case RJUMP:
6252     case CJUMP:
6253     case SJUMP:
6254     case FJUMP:
6255       printf("Jump in the delay slot.  This is probably a bug.\n");
6256   }
6257   int btaddr=get_reg(regs[0].regmap,BTREG);
6258   if(btaddr<0) {
6259     btaddr=get_reg(regs[0].regmap,-1);
6260     emit_readword((int)&branch_target,btaddr);
6261   }
6262   assert(btaddr!=HOST_CCREG);
6263   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6264 #ifdef HOST_IMM8
6265   emit_movimm(start+4,HOST_TEMPREG);
6266   emit_cmp(btaddr,HOST_TEMPREG);
6267 #else
6268   emit_cmpimm(btaddr,start+4);
6269 #endif
6270   int branch=(int)out;
6271   emit_jeq(0);
6272   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6273   emit_jmp(jump_vaddr_reg[btaddr]);
6274   set_jump_target(branch,(int)out);
6275   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6276   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6277 }
6278
6279 // Basic liveness analysis for MIPS registers
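// Walks each range backwards computing, per instruction, a bitmask of
// registers whose current value will never be read again: u covers the low
// words, uu the upper halves of 64-bit values.  A set bit means "unneeded",
// so writes set bits and reads clear them; branches additionally merge in the
// liveness of the branch target and of the delay slot.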
6280 void unneeded_registers(int istart,int iend,int r)
6281 {
6282   int i;
6283   uint64_t u,uu,b,bu;
6284   uint64_t temp_u,temp_uu;
6285   uint64_t tdep;
6286   if(iend==slen-1) {
6287     u=1;uu=1;
6288   }else{
6289     u=unneeded_reg[iend+1];
6290     uu=unneeded_reg_upper[iend+1];
6291     u=1;uu=1;
6292   }
6293   for (i=iend;i>=istart;i--)
6294   {
6295     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6296     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6297     {
6298       // If subroutine call, flag return address as a possible branch target
6299       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6300       
6301       if(ba[i]<start || ba[i]>=(start+slen*4))
6302       {
6303         // Branch out of this block, flush all regs
6304         u=1;
6305         uu=1;
6306         /* Hexagon hack 
6307         if(itype[i]==UJUMP&&rt1[i]==31)
6308         {
6309           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6310         }
6311         if(itype[i]==RJUMP&&rs1[i]==31)
6312         {
6313           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6314         }
6315         if(start>0x80000400&&start<0x80800000) {
6316           if(itype[i]==UJUMP&&rt1[i]==31)
6317           {
6318             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6319             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6320           }
6321           if(itype[i]==RJUMP&&rs1[i]==31)
6322           {
6323             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6324             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6325           }
6326         }*/
6327         branch_unneeded_reg[i]=u;
6328         branch_unneeded_reg_upper[i]=uu;
6329         // Merge in delay slot
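        // tdep is 1 when the upper half of the delay slot's destination is
        // still needed, in which case the registers it derives from
        // (dep1/dep2) must keep their upper halves live as well.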
6330         tdep=(~uu>>rt1[i+1])&1;
6331         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6332         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6333         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6334         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6335         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6336         u|=1;uu|=1;
6337         // If branch is "likely" (and conditional)
6338         // then we skip the delay slot on the fall-thru path
6339         if(likely[i]) {
6340           if(i<slen-1) {
6341             u&=unneeded_reg[i+2];
6342             uu&=unneeded_reg_upper[i+2];
6343           }
6344           else
6345           {
6346             u=1;
6347             uu=1;
6348           }
6349         }
6350       }
6351       else
6352       {
6353         // Internal branch, flag target
6354         bt[(ba[i]-start)>>2]=1;
6355         if(ba[i]<=start+i*4) {
6356           // Backward branch
6357           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6358           {
6359             // Unconditional branch
6360             temp_u=1;temp_uu=1;
6361           } else {
6362             // Conditional branch (not taken case)
6363             temp_u=unneeded_reg[i+2];
6364             temp_uu=unneeded_reg_upper[i+2];
6365           }
6366           // Merge in delay slot
6367           tdep=(~temp_uu>>rt1[i+1])&1;
6368           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6369           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6370           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6371           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6372           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6373           temp_u|=1;temp_uu|=1;
6374           // If branch is "likely" (and conditional)
6375           // then we skip the delay slot on the fall-thru path
6376           if(likely[i]) {
6377             if(i<slen-1) {
6378               temp_u&=unneeded_reg[i+2];
6379               temp_uu&=unneeded_reg_upper[i+2];
6380             }
6381             else
6382             {
6383               temp_u=1;
6384               temp_uu=1;
6385             }
6386           }
6387           tdep=(~temp_uu>>rt1[i])&1;
6388           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6389           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6390           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6391           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6392           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6393           temp_u|=1;temp_uu|=1;
6394           unneeded_reg[i]=temp_u;
6395           unneeded_reg_upper[i]=temp_uu;
6396           // Only go three levels deep.  This recursion can take an
6397           // excessive amount of time if there are a lot of nested loops.
6398           if(r<2) {
6399             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6400           }else{
6401             unneeded_reg[(ba[i]-start)>>2]=1;
6402             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6403           }
6404         } /*else*/ if(1) {
6405           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6406           {
6407             // Unconditional branch
6408             u=unneeded_reg[(ba[i]-start)>>2];
6409             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6410             branch_unneeded_reg[i]=u;
6411             branch_unneeded_reg_upper[i]=uu;
6412         //u=1;
6413         //uu=1;
6414         //branch_unneeded_reg[i]=u;
6415         //branch_unneeded_reg_upper[i]=uu;
6416             // Merge in delay slot
6417             tdep=(~uu>>rt1[i+1])&1;
6418             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6419             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6420             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6421             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6422             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6423             u|=1;uu|=1;
6424           } else {
6425             // Conditional branch
6426             b=unneeded_reg[(ba[i]-start)>>2];
6427             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6428             branch_unneeded_reg[i]=b;
6429             branch_unneeded_reg_upper[i]=bu;
6430         //b=1;
6431         //bu=1;
6432         //branch_unneeded_reg[i]=b;
6433         //branch_unneeded_reg_upper[i]=bu;
6434             // Branch delay slot
6435             tdep=(~uu>>rt1[i+1])&1;
6436             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6437             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6438             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6439             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6440             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6441             b|=1;bu|=1;
6442             // If branch is "likely" then we skip the
6443             // delay slot on the fall-thru path
6444             if(likely[i]) {
6445               u=b;
6446               uu=bu;
6447               if(i<slen-1) {
6448                 u&=unneeded_reg[i+2];
6449                 uu&=unneeded_reg_upper[i+2];
6450         //u=1;
6451         //uu=1;
6452               }
6453             } else {
6454               u&=b;
6455               uu&=bu;
6456         //u=1;
6457         //uu=1;
6458             }
6459             if(i<slen-1) {
6460               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6461               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6462         //branch_unneeded_reg[i]=1;
6463         //branch_unneeded_reg_upper[i]=1;
6464             } else {
6465               branch_unneeded_reg[i]=1;
6466               branch_unneeded_reg_upper[i]=1;
6467             }
6468           }
6469         }
6470       }
6471     }
6472     else if(itype[i]==SYSCALL)
6473     {
6474       // SYSCALL instruction (software interrupt)
6475       u=1;
6476       uu=1;
6477     }
6478     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6479     {
6480       // ERET instruction (return from interrupt)
6481       u=1;
6482       uu=1;
6483     }
6484     //u=uu=1; // DEBUG
6485     tdep=(~uu>>rt1[i])&1;
6486     // Written registers are unneeded
6487     u|=1LL<<rt1[i];
6488     u|=1LL<<rt2[i];
6489     uu|=1LL<<rt1[i];
6490     uu|=1LL<<rt2[i];
6491     // Accessed registers are needed
6492     u&=~(1LL<<rs1[i]);
6493     u&=~(1LL<<rs2[i]);
6494     uu&=~(1LL<<us1[i]);
6495     uu&=~(1LL<<us2[i]);
6496     // Source-target dependencies
6497     uu&=~(tdep<<dep1[i]);
6498     uu&=~(tdep<<dep2[i]);
6499     // R0 is always unneeded
6500     u|=1;uu|=1;
6501     // Save it
6502     unneeded_reg[i]=u;
6503     unneeded_reg_upper[i]=uu;
6504     /*
6505     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6506     printf("U:");
6507     int r;
6508     for(r=1;r<=CCREG;r++) {
6509       if((unneeded_reg[i]>>r)&1) {
6510         if(r==HIREG) printf(" HI");
6511         else if(r==LOREG) printf(" LO");
6512         else printf(" r%d",r);
6513       }
6514     }
6515     printf(" UU:");
6516     for(r=1;r<=CCREG;r++) {
6517       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6518         if(r==HIREG) printf(" HI");
6519         else if(r==LOREG) printf(" LO");
6520         else printf(" r%d",r);
6521       }
6522     }
6523     printf("\n");*/
6524   }
6525 }
6526
6527 // Identify registers which are likely to contain 32-bit values
6528 // This is used to predict whether any branches will jump to a
6529 // location with 64-bit values in registers.
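// is32 is a provisional bitmask: bit r set means register r is expected to
// hold a (sign-extended) 32-bit value at this point.  This is only a
// prediction used to guide allocation; the is32/was32 values actually used
// for code generation are computed separately.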
6530 static void provisional_32bit()
6531 {
6532   int i,j;
6533   uint64_t is32=1;
6534   uint64_t lastbranch=1;
6535   
6536   for(i=0;i<slen;i++)
6537   {
6538     if(i>0) {
6539       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6540         if(i>1) is32=lastbranch;
6541         else is32=1;
6542       }
6543     }
6544     if(i>1)
6545     {
6546       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6547         if(likely[i-2]) {
6548           if(i>2) is32=lastbranch;
6549           else is32=1;
6550         }
6551       }
6552       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6553       {
6554         if(rs1[i-2]==0||rs2[i-2]==0)
6555         {
6556           if(rs1[i-2]) {
6557             is32|=1LL<<rs1[i-2];
6558           }
6559           if(rs2[i-2]) {
6560             is32|=1LL<<rs2[i-2];
6561           }
6562         }
6563       }
6564     }
6565     // If something jumps here with 64-bit values
6566     // then promote those registers to 64 bits
6567     if(bt[i])
6568     {
6569       uint64_t temp_is32=is32;
6570       for(j=i-1;j>=0;j--)
6571       {
6572         if(ba[j]==start+i*4) 
6573           //temp_is32&=branch_regs[j].is32;
6574           temp_is32&=p32[j];
6575       }
6576       for(j=i;j<slen;j++)
6577       {
6578         if(ba[j]==start+i*4) 
6579           temp_is32=1;
6580       }
6581       is32=temp_is32;
6582     }
6583     int type=itype[i];
6584     int op=opcode[i];
6585     int op2=opcode2[i];
6586     int rt=rt1[i];
6587     int s1=rs1[i];
6588     int s2=rs2[i];
6589     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6590       // Branches don't write registers, consider the delay slot instead.
6591       type=itype[i+1];
6592       op=opcode[i+1];
6593       op2=opcode2[i+1];
6594       rt=rt1[i+1];
6595       s1=rs1[i+1];
6596       s2=rs2[i+1];
6597       lastbranch=is32;
6598     }
6599     switch(type) {
6600       case LOAD:
6601         if(op==0x27||op==0x37|| // LWU/LD
6602            op==0x1A||op==0x1B) // LDL/LDR
6603           is32&=~(1LL<<rt);
6604         else
6605           is32|=1LL<<rt;
6606         break;
6607       case STORE:
6608       case STORELR:
6609         break;
6610       case LOADLR:
6611         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6612         if(op==0x22) is32|=1LL<<rt; // LWL
6613         break;
6614       case IMM16:
6615         if (op==0x08||op==0x09|| // ADDI/ADDIU
6616             op==0x0a||op==0x0b|| // SLTI/SLTIU
6617             op==0x0c|| // ANDI
6618             op==0x0f)  // LUI
6619         {
6620           is32|=1LL<<rt;
6621         }
6622         if(op==0x18||op==0x19) { // DADDI/DADDIU
6623           is32&=~(1LL<<rt);
6624           //if(imm[i]==0)
6625           //  is32|=((is32>>s1)&1LL)<<rt;
6626         }
6627         if(op==0x0d||op==0x0e) { // ORI/XORI
6628           uint64_t sr=((is32>>s1)&1LL);
6629           is32&=~(1LL<<rt);
6630           is32|=sr<<rt;
6631         }
6632         break;
6633       case UJUMP:
6634         break;
6635       case RJUMP:
6636         break;
6637       case CJUMP:
6638         break;
6639       case SJUMP:
6640         break;
6641       case FJUMP:
6642         break;
6643       case ALU:
6644         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6645           is32|=1LL<<rt;
6646         }
6647         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6648           is32|=1LL<<rt;
6649         }
6650         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6651           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6652           is32&=~(1LL<<rt);
6653           is32|=sr<<rt;
6654         }
6655         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6656           if(s1==0&&s2==0) {
6657             is32|=1LL<<rt;
6658           }
6659           else if(s2==0) {
6660             uint64_t sr=((is32>>s1)&1LL);
6661             is32&=~(1LL<<rt);
6662             is32|=sr<<rt;
6663           }
6664           else if(s1==0) {
6665             uint64_t sr=((is32>>s2)&1LL);
6666             is32&=~(1LL<<rt);
6667             is32|=sr<<rt;
6668           }
6669           else {
6670             is32&=~(1LL<<rt);
6671           }
6672         }
6673         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6674           if(s1==0&&s2==0) {
6675             is32|=1LL<<rt;
6676           }
6677           else if(s2==0) {
6678             uint64_t sr=((is32>>s1)&1LL);
6679             is32&=~(1LL<<rt);
6680             is32|=sr<<rt;
6681           }
6682           else {
6683             is32&=~(1LL<<rt);
6684           }
6685         }
6686         break;
6687       case MULTDIV:
6688         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6689           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6690         }
6691         else {
6692           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6693         }
6694         break;
6695       case MOV:
6696         {
6697           uint64_t sr=((is32>>s1)&1LL);
6698           is32&=~(1LL<<rt);
6699           is32|=sr<<rt;
6700         }
6701         break;
6702       case SHIFT:
6703         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6704         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6705         break;
6706       case SHIFTIMM:
6707         is32|=1LL<<rt;
6708         // DSLL/DSRL/DSRA/DSLL32/DSRL32 can produce a 64-bit result; DSRA32 always fits in 32 bits
6709         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6710         break;
6711       case COP0:
6712         if(op2==0) is32|=1LL<<rt; // MFC0
6713         break;
6714       case COP1:
6715         if(op2==0) is32|=1LL<<rt; // MFC1
6716         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6717         if(op2==2) is32|=1LL<<rt; // CFC1
6718         break;
6719       case C1LS:
6720         break;
6721       case FLOAT:
6722       case FCONV:
6723         break;
6724       case FCOMP:
6725         break;
6726       case SYSCALL:
6727         break;
6728       default:
6729         break;
6730     }
6731     is32|=1;
6732     p32[i]=is32;
6733
6734     if(i>0)
6735     {
6736       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6737       {
6738         if(rt1[i-1]==31) // JAL/JALR
6739         {
6740           // Subroutine call will return here, don't alloc any registers
6741           is32=1;
6742         }
6743         else if(i+1<slen)
6744         {
6745           // Internal branch will jump here, match registers to caller
6746           is32=0x3FFFFFFFFLL;
6747         }
6748       }
6749     }
6750   }
6751 }
6752
6753 // Identify registers which may be assumed to contain 32-bit values
6754 // and where optimizations will rely on this.
6755 // This is used to determine whether backward branches can safely
6756 // jump to a location with 64-bit values in registers.
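     // pr32[i] is the provisional counterpart of requires_32bit: a bitmap of
     // registers whose values must really be valid 32-bit values at instruction
     // i.  The scan runs backwards; the set is cleared at block exits, SYSCALL
     // and ERET, registers written by an instruction drop out of it, and 32-bit
     // source registers are added back in.  Illustrative check (not part of the
     // original code):
     //
     //   if((pr32[t]>>reg)&1) { /* a jump to t must provide reg as a 32-bit value */ }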
6757 static void provisional_r32()
6758 {
6759   u_int r32=0;
6760   int i;
6761   
6762   for (i=slen-1;i>=0;i--)
6763   {
6764     int hr;
6765     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6766     {
6767       if(ba[i]<start || ba[i]>=(start+slen*4))
6768       {
6769         // Branch out of this block, don't need anything
6770         r32=0;
6771       }
6772       else
6773       {
6774         // Internal branch
6775         // Need whatever matches the target
6776         // (and doesn't get overwritten by the delay slot instruction)
6777         r32=0;
6778         int t=(ba[i]-start)>>2;
6779         if(ba[i]>start+i*4) {
6780           // Forward branch
6781           //if(!(requires_32bit[t]&~regs[i].was32))
6782           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6783           if(!(pr32[t]&~regs[i].was32))
6784             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6785         }else{
6786           // Backward branch
6787           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6788             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6789         }
6790       }
6791       // Conditional branch may need registers for following instructions
6792       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6793       {
6794         if(i<slen-2) {
6795           //r32|=requires_32bit[i+2];
6796           r32|=pr32[i+2];
6797           r32&=regs[i].was32;
6798           // Mark this address as a branch target since it may be called
6799           // upon return from interrupt
6800           //bt[i+2]=1;
6801         }
6802       }
6803       // Merge in delay slot
6804       if(!likely[i]) {
6805         // These are overwritten unless the branch is "likely"
6806         // and the delay slot is nullified if not taken
6807         r32&=~(1LL<<rt1[i+1]);
6808         r32&=~(1LL<<rt2[i+1]);
6809       }
6810       // Assume these are needed (delay slot)
6811       if(us1[i+1]>0)
6812       {
6813         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6814       }
6815       if(us2[i+1]>0)
6816       {
6817         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6818       }
6819       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6820       {
6821         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6822       }
6823       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6824       {
6825         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6826       }
6827     }
6828     else if(itype[i]==SYSCALL)
6829     {
6830       // SYSCALL instruction (software interrupt)
6831       r32=0;
6832     }
6833     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6834     {
6835       // ERET instruction (return from interrupt)
6836       r32=0;
6837     }
6838     // Registers written here no longer need to hold 32-bit values; 32-bit sources do
6839     r32&=~(1LL<<rt1[i]);
6840     r32&=~(1LL<<rt2[i]);
6841     if(us1[i]>0)
6842     {
6843       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6844     }
6845     if(us2[i]>0)
6846     {
6847       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6848     }
6849     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6850     {
6851       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6852     }
6853     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6854     {
6855       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6856     }
6857     //requires_32bit[i]=r32;
6858     pr32[i]=r32;
6859     
6860     // Dirty registers which are 32-bit require 32-bit input,
6861     // as they will be written back as 32-bit values
6862     for(hr=0;hr<HOST_REGS;hr++)
6863     {
6864       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6865         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6866           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6867             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6868           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6869         }
6870       }
6871     }
6872   }
6873 }
6874
6875 // Write back dirty registers as soon as we will no longer modify them,
6876 // so that we don't end up with lots of writes at the branches.
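     // will_dirty[]/wont_dirty[] are per-host-register bitmasks (bit = host
     // register index) built by scanning the region backwards from iend.  They
     // are used to adjust regs[].dirty and branch_regs[].dirty so that a
     // register is written back once it will no longer be modified, instead of
     // all writebacks piling up at the branches.  When wr is zero only the masks
     // are computed (used when recursing into backward branch targets); when wr
     // is non-zero the dirty state is actually updated.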
6877 void clean_registers(int istart,int iend,int wr)
6878 {
6879   int i;
6880   int r;
6881   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6882   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6883   if(iend==slen-1) {
6884     will_dirty_i=will_dirty_next=0;
6885     wont_dirty_i=wont_dirty_next=0;
6886   }else{
6887     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6888     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6889   }
6890   for (i=iend;i>=istart;i--)
6891   {
6892     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6893     {
6894       if(ba[i]<start || ba[i]>=(start+slen*4))
6895       {
6896         // Branch out of this block, flush all regs
6897         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6898         {
6899           // Unconditional branch
6900           will_dirty_i=0;
6901           wont_dirty_i=0;
6902           // Merge in delay slot (will dirty)
6903           for(r=0;r<HOST_REGS;r++) {
6904             if(r!=EXCLUDE_REG) {
6905               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6906               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6907               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6908               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6909               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6910               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6911               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6912               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6913               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6914               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6915               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6916               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6917               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6918               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6919             }
6920           }
6921         }
6922         else
6923         {
6924           // Conditional branch
6925           will_dirty_i=0;
6926           wont_dirty_i=wont_dirty_next;
6927           // Merge in delay slot (will dirty)
6928           for(r=0;r<HOST_REGS;r++) {
6929             if(r!=EXCLUDE_REG) {
6930               if(!likely[i]) {
6931                 // Might not dirty if likely branch is not taken
6932                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6933                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6934                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6935                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6936                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6937                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6938                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6939                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6940                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6941                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6942                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6943                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6944                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6945                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6946               }
6947             }
6948           }
6949         }
6950         // Merge in delay slot (wont dirty)
6951         for(r=0;r<HOST_REGS;r++) {
6952           if(r!=EXCLUDE_REG) {
6953             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6954             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6955             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6956             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6957             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6958             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6959             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6960             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6961             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6962             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6963           }
6964         }
6965         if(wr) {
6966           #ifndef DESTRUCTIVE_WRITEBACK
6967           branch_regs[i].dirty&=wont_dirty_i;
6968           #endif
6969           branch_regs[i].dirty|=will_dirty_i;
6970         }
6971       }
6972       else
6973       {
6974         // Internal branch
6975         if(ba[i]<=start+i*4) {
6976           // Backward branch
6977           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6978           {
6979             // Unconditional branch
6980             temp_will_dirty=0;
6981             temp_wont_dirty=0;
6982             // Merge in delay slot (will dirty)
6983             for(r=0;r<HOST_REGS;r++) {
6984               if(r!=EXCLUDE_REG) {
6985                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6986                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6987                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6988                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6989                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6990                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6991                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6992                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6993                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6994                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6995                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6996                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6997                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6998                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6999               }
7000             }
7001           } else {
7002             // Conditional branch (not taken case)
7003             temp_will_dirty=will_dirty_next;
7004             temp_wont_dirty=wont_dirty_next;
7005             // Merge in delay slot (will dirty)
7006             for(r=0;r<HOST_REGS;r++) {
7007               if(r!=EXCLUDE_REG) {
7008                 if(!likely[i]) {
7009                   // Will not dirty if likely branch is not taken
7010                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7011                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7012                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7013                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7014                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7015                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7016                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7017                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7018                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7019                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7020                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7021                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7022                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7023                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7024                 }
7025               }
7026             }
7027           }
7028           // Merge in delay slot (wont dirty)
7029           for(r=0;r<HOST_REGS;r++) {
7030             if(r!=EXCLUDE_REG) {
7031               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7032               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7033               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7034               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7035               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7036               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7037               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7038               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7039               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7040               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7041             }
7042           }
7043           // Deal with changed mappings
7044           if(i<iend) {
7045             for(r=0;r<HOST_REGS;r++) {
7046               if(r!=EXCLUDE_REG) {
7047                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7048                   temp_will_dirty&=~(1<<r);
7049                   temp_wont_dirty&=~(1<<r);
7050                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7051                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7052                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7053                   } else {
7054                     temp_will_dirty|=1<<r;
7055                     temp_wont_dirty|=1<<r;
7056                   }
7057                 }
7058               }
7059             }
7060           }
7061           if(wr) {
7062             will_dirty[i]=temp_will_dirty;
7063             wont_dirty[i]=temp_wont_dirty;
7064             clean_registers((ba[i]-start)>>2,i-1,0);
7065           }else{
7066             // Limit recursion.  It can take an excessive amount
7067             // of time if there are a lot of nested loops.
7068             will_dirty[(ba[i]-start)>>2]=0;
7069             wont_dirty[(ba[i]-start)>>2]=-1;
7070           }
7071         }
7072         /*else*/ if(1)
7073         {
7074           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7075           {
7076             // Unconditional branch
7077             will_dirty_i=0;
7078             wont_dirty_i=0;
7079           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7080             for(r=0;r<HOST_REGS;r++) {
7081               if(r!=EXCLUDE_REG) {
7082                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7083                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7084                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7085                 }
7086               }
7087             }
7088           //}
7089             // Merge in delay slot
7090             for(r=0;r<HOST_REGS;r++) {
7091               if(r!=EXCLUDE_REG) {
7092                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7093                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7094                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7095                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7096                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7097                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7098                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7099                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7100                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7101                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7102                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7103                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7104                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7105                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7106               }
7107             }
7108           } else {
7109             // Conditional branch
7110             will_dirty_i=will_dirty_next;
7111             wont_dirty_i=wont_dirty_next;
7112           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7113             for(r=0;r<HOST_REGS;r++) {
7114               if(r!=EXCLUDE_REG) {
7115                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7116                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7117                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7118                 }
7119                 else
7120                 {
7121                   will_dirty_i&=~(1<<r);
7122                 }
7123                 // Treat delay slot as part of branch too
7124                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7125                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7126                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7127                 }
7128                 else
7129                 {
7130                   will_dirty[i+1]&=~(1<<r);
7131                 }*/
7132               }
7133             }
7134           //}
7135             // Merge in delay slot
7136             for(r=0;r<HOST_REGS;r++) {
7137               if(r!=EXCLUDE_REG) {
7138                 if(!likely[i]) {
7139                   // Might not dirty if likely branch is not taken
7140                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7141                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7142                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7143                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7144                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7145                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7146                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7147                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7148                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7149                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7150                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7151                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7152                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7153                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7154                 }
7155               }
7156             }
7157           }
7158           // Merge in delay slot
7159           for(r=0;r<HOST_REGS;r++) {
7160             if(r!=EXCLUDE_REG) {
7161               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7162               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7163               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7164               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7165               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7166               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7167               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7168               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7169               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7170               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7171             }
7172           }
7173           if(wr) {
7174             #ifndef DESTRUCTIVE_WRITEBACK
7175             branch_regs[i].dirty&=wont_dirty_i;
7176             #endif
7177             branch_regs[i].dirty|=will_dirty_i;
7178           }
7179         }
7180       }
7181     }
7182     else if(itype[i]==SYSCALL)
7183     {
7184       // SYSCALL instruction (software interrupt)
7185       will_dirty_i=0;
7186       wont_dirty_i=0;
7187     }
7188     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7189     {
7190       // ERET instruction (return from interrupt)
7191       will_dirty_i=0;
7192       wont_dirty_i=0;
7193     }
7194     will_dirty_next=will_dirty_i;
7195     wont_dirty_next=wont_dirty_i;
7196     for(r=0;r<HOST_REGS;r++) {
7197       if(r!=EXCLUDE_REG) {
7198         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7199         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7200         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7201         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7202         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7203         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7204         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7205         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7206         if(i>istart) {
7207           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7208           {
7209             // Don't store a register immediately after writing it,
7210             // as doing so may prevent dual-issue.
7211             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7212             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7213           }
7214         }
7215       }
7216     }
7217     // Save it
7218     will_dirty[i]=will_dirty_i;
7219     wont_dirty[i]=wont_dirty_i;
7220     // Mark registers that won't be dirtied as not dirty
7221     if(wr) {
7222       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7223       for(r=0;r<HOST_REGS;r++) {
7224         if((will_dirty_i>>r)&1) {
7225           printf(" r%d",r);
7226         }
7227       }
7228       printf("\n");*/
7229
7230       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7231         regs[i].dirty|=will_dirty_i;
7232         #ifndef DESTRUCTIVE_WRITEBACK
7233         regs[i].dirty&=wont_dirty_i;
7234         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7235         {
7236           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7237             for(r=0;r<HOST_REGS;r++) {
7238               if(r!=EXCLUDE_REG) {
7239                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7240                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7241                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7242               }
7243             }
7244           }
7245         }
7246         else
7247         {
7248           if(i<iend) {
7249             for(r=0;r<HOST_REGS;r++) {
7250               if(r!=EXCLUDE_REG) {
7251                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7252                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7253                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7254               }
7255             }
7256           }
7257         }
7258         #endif
7259       //}
7260     }
7261     // Deal with changed mappings
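         // If host register r holds a different guest register before this
         // instruction (regmap_pre) than after it (regmap), the bits computed
         // above apply to the value's new location: get_reg() finds the host
         // register the guest register moved to and its will/wont bits are
         // copied over; if the old value is gone entirely, the bits are
         // reconstructed from unneeded_reg[].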
7262     temp_will_dirty=will_dirty_i;
7263     temp_wont_dirty=wont_dirty_i;
7264     for(r=0;r<HOST_REGS;r++) {
7265       if(r!=EXCLUDE_REG) {
7266         int nr;
7267         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7268           if(wr) {
7269             #ifndef DESTRUCTIVE_WRITEBACK
7270             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7271             #endif
7272             regs[i].wasdirty|=will_dirty_i&(1<<r);
7273           }
7274         }
7275         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7276           // Register moved to a different register
7277           will_dirty_i&=~(1<<r);
7278           wont_dirty_i&=~(1<<r);
7279           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7280           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7281           if(wr) {
7282             #ifndef DESTRUCTIVE_WRITEBACK
7283             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7284             #endif
7285             regs[i].wasdirty|=will_dirty_i&(1<<r);
7286           }
7287         }
7288         else {
7289           will_dirty_i&=~(1<<r);
7290           wont_dirty_i&=~(1<<r);
7291           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7292             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7293             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7294           } else {
7295             wont_dirty_i|=1<<r;
7296             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*//*assert(!((will_dirty>>r)&1));*/
7297           }
7298         }
7299       }
7300     }
7301   }
7302 }
7303
7304   /* disassembly */
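       // Prints one decoded instruction; a leading '*' marks instructions that
       // are branch targets (bt[i]).  Example output (illustrative):
       //   * 8010a4c0: ADDIU r2,r2,1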
7305 void disassemble_inst(int i)
7306 {
7307     if (bt[i]) printf("*"); else printf(" ");
7308     switch(itype[i]) {
7309       case UJUMP:
7310         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7311       case CJUMP:
7312         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7313       case SJUMP:
7314         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7315       case FJUMP:
7316         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7317       case RJUMP:
7318         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7319       case SPAN:
7320         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7321       case IMM16:
7322         if(opcode[i]==0xf) //LUI
7323           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7324         else
7325           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7326         break;
7327       case LOAD:
7328       case LOADLR:
7329         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7330         break;
7331       case STORE:
7332       case STORELR:
7333         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7334         break;
7335       case ALU:
7336       case SHIFT:
7337         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7338         break;
7339       case MULTDIV:
7340         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7341         break;
7342       case SHIFTIMM:
7343         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7344         break;
7345       case MOV:
7346         if((opcode2[i]&0x1d)==0x10)
7347           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7348         else if((opcode2[i]&0x1d)==0x11)
7349           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7350         else
7351           printf (" %x: %s\n",start+i*4,insn[i]);
7352         break;
7353       case COP0:
7354         if(opcode2[i]==0)
7355           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7356         else if(opcode2[i]==4)
7357           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7358         else printf (" %x: %s\n",start+i*4,insn[i]);
7359         break;
7360       case COP1:
7361         if(opcode2[i]<3)
7362           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7363         else if(opcode2[i]>3)
7364           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7365         else printf (" %x: %s\n",start+i*4,insn[i]);
7366         break;
7367       case C1LS:
7368         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7369         break;
7370       default:
7371         //printf (" %s %8x\n",insn[i],source[i]);
7372         printf (" %x: %s\n",start+i*4,insn[i]);
7373     }
7374 }
7375
7376 void new_dynarec_init()
7377 {
7378   printf("Init new dynarec\n");
7379   out=(u_char *)BASE_ADDR;
7380   if (mmap (out, 1<<TARGET_SIZE_2,
7381             PROT_READ | PROT_WRITE | PROT_EXEC,
7382             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7383             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7384   rdword=&readmem_dword;
7385   fake_pc.f.r.rs=&readmem_dword;
7386   fake_pc.f.r.rt=&readmem_dword;
7387   fake_pc.f.r.rd=&readmem_dword;
7388   int n;
7389   for(n=0x80000;n<0x80800;n++)
7390     invalid_code[n]=1;
7391   for(n=0;n<65536;n++)
7392     hash_table[n][0]=hash_table[n][2]=-1;
7393   memset(mini_ht,-1,sizeof(mini_ht));
7394   memset(restore_candidate,0,sizeof(restore_candidate));
7395   copy=shadow;
7396   expirep=16384; // Expiry pointer, +2 blocks
7397   pending_exception=0;
7398   literalcount=0;
7399 #ifdef HOST_IMM8
7400   // Copy this into local area so we don't have to put it in every literal pool
7401   invc_ptr=invalid_code;
7402 #endif
7403   stop_after_jal=0;
7404   // TLB
7405   using_tlb=0;
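       // memory_map[] has one entry per 4KB page (index = vaddr>>12); -1 marks
       // an unmapped page, otherwise the host address of a guest word is
       // vaddr + (memory_map[vaddr>>12]<<2), which is how TLB-mapped code is
       // located in new_recompile_block() below.  The readmem/writemem handler
       // tables are indexed by the top 16 address bits, i.e. one entry per 64KB.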
7406   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7407     memory_map[n]=-1;
7408   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7409     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7410   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7411     memory_map[n]=-1;
7412   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7413     writemem[n] = write_nomem_new;
7414     writememb[n] = write_nomemb_new;
7415     writememh[n] = write_nomemh_new;
7416     writememd[n] = write_nomemd_new;
7417     readmem[n] = read_nomem_new;
7418     readmemb[n] = read_nomemb_new;
7419     readmemh[n] = read_nomemh_new;
7420     readmemd[n] = read_nomemd_new;
7421   }
7422   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7423     writemem[n] = write_rdram_new;
7424     writememb[n] = write_rdramb_new;
7425     writememh[n] = write_rdramh_new;
7426     writememd[n] = write_rdramd_new;
7427   }
7428   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7429     writemem[n] = write_nomem_new;
7430     writememb[n] = write_nomemb_new;
7431     writememh[n] = write_nomemh_new;
7432     writememd[n] = write_nomemd_new;
7433     readmem[n] = read_nomem_new;
7434     readmemb[n] = read_nomemb_new;
7435     readmemh[n] = read_nomemh_new;
7436     readmemd[n] = read_nomemd_new;
7437   }
7438   tlb_hacks();
7439   arch_init();
7440 }
7441
7442 void new_dynarec_cleanup()
7443 {
7444   int n;
7445   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7446   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7447   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7448   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7449   #ifdef ROM_COPY
7450   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7451   #endif
7452 }
7453
7454 int new_recompile_block(int addr)
7455 {
7456 /*
7457   if(addr==0x800cd050) {
7458     int block;
7459     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7460     int n;
7461     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7462   }
7463 */
7464   //if(Count==365117028) tracedebug=1;
7465   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7466   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7467   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7468   //if(debug) 
7469   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7470   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7471   /*if(Count>=312978186) {
7472     rlist();
7473   }*/
7474   //rlist();
7475   start = (u_int)addr&~3;
7476   //assert(((u_int)addr&1)==0);
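       // Locate the guest code to recompile: it is read directly from SP_DMEM
       // (0xa4000000-0xa4000fff) or RDRAM (0x80000000-0x807fffff); TLB-mapped
       // addresses (0xc0000000 and up) go through memory_map[], and anything
       // else is rejected.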
7477   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7478     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7479     pagelimit = 0xa4001000;
7480   }
7481   else if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7482     source = (u_int *)((u_int)rdram+start-0x80000000);
7483     pagelimit = 0x80800000;
7484   }
7485   else if ((signed int)addr >= (signed int)0xC0000000) {
7486     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7487     //if(tlb_LUT_r[start>>12])
7488       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7489     if((signed int)memory_map[start>>12]>=0) {
7490       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7491       pagelimit=(start+4096)&0xFFFFF000;
7492       int map=memory_map[start>>12];
7493       int i;
7494       for(i=0;i<5;i++) {
7495         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7496         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7497       }
7498       assem_debug("pagelimit=%x\n",pagelimit);
7499       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7500     }
7501     else {
7502       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7503       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7504       return 1; // Caller will invoke exception handler
7505     }
7506     //printf("source= %x\n",(int)source);
7507   }
7508   else {
7509     printf("Compile at bogus memory address: %x \n", (int)addr);
7510     exit(1);
7511   }
7512
7513   /* Pass 1: disassemble */
7514   /* Pass 2: register dependencies, branch targets */
7515   /* Pass 3: register allocation */
7516   /* Pass 4: branch dependencies */
7517   /* Pass 5: pre-alloc */
7518   /* Pass 6: optimize clean/dirty state */
7519   /* Pass 7: flag 32-bit registers */
7520   /* Pass 8: assembly */
7521   /* Pass 9: linker */
7522   /* Pass 10: garbage collection / free memory */
7523
7524   int i,j;
7525   int done=0;
7526   unsigned int type,op,op2;
7527
7528   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7529   
7530   /* Pass 1 disassembly */
7531
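       // Each MIPS word is decoded from fixed fields: primary opcode in bits
       // 31:26, secondary opcode in bits 5:0 (SPECIAL), 20:16 (REGIMM) or 25:21
       // (COP0/COP1), register numbers in bits 25:21 and 20:16, and a 16-bit
       // immediate in the low half.  For example (illustrative), the word
       // 0x24420001 decodes as op=0x09 (ADDIU), rs=2, rt=2, imm=1,
       // i.e. "ADDIU r2,r2,1".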
7532   for(i=0;!done;i++) {
7533     bt[i]=0;likely[i]=0;op2=0;
7534     opcode[i]=op=source[i]>>26;
7535     switch(op)
7536     {
7537       case 0x00: strcpy(insn[i],"special"); type=NI;
7538         op2=source[i]&0x3f;
7539         switch(op2)
7540         {
7541           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7542           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7543           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7544           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7545           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7546           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7547           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7548           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7549           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7550           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7551           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7552           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7553           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7554           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7555           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7556           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7557           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7558           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7559           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7560           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7561           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7562           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7563           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7564           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7565           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7566           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7567           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7568           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7569           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7570           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7571           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7572           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7573           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7574           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7575           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7576           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7577           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7578           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7579           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7580           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7581           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7582           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7583           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7584           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7585           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7586           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7587           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7588           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7589           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7590           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7591           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7592           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7593         }
7594         break;
7595       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7596         op2=(source[i]>>16)&0x1f;
7597         switch(op2)
7598         {
7599           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7600           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7601           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7602           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7603           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7604           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7605           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7606           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7607           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7608           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7609           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7610           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7611           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7612           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7613         }
7614         break;
7615       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7616       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7617       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7618       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7619       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7620       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7621       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7622       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7623       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7624       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7625       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7626       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7627       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7628       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7629       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7630         op2=(source[i]>>21)&0x1f;
7631         switch(op2)
7632         {
7633           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7634           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7635           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7636           switch(source[i]&0x3f)
7637           {
7638             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7639             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7640             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7641             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7642             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7643           }
7644         }
7645         break;
7646       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7647         op2=(source[i]>>21)&0x1f;
7648         switch(op2)
7649         {
7650           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7651           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7652           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7653           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7654           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7655           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7656           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7657           switch((source[i]>>16)&0x3)
7658           {
7659             case 0x00: strcpy(insn[i],"BC1F"); break;
7660             case 0x01: strcpy(insn[i],"BC1T"); break;
7661             case 0x02: strcpy(insn[i],"BC1FL"); break;
7662             case 0x03: strcpy(insn[i],"BC1TL"); break;
7663           }
7664           break;
7665           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7666           switch(source[i]&0x3f)
7667           {
7668             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7669             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7670             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7671             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7672             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7673             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7674             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7675             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7676             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7677             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7678             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7679             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7680             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7681             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7682             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7683             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7684             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7685             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7686             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7687             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7688             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7689             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7690             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7691             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7692             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7693             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7694             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7695             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7696             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7697             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7698             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7699             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7700             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7701             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7702             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7703           }
7704           break;
7705           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7706           switch(source[i]&0x3f)
7707           {
7708             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7709             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7710             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7711             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7712             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7713             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7714             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7715             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7716             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7717             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7718             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7719             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7720             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7721             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7722             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7723             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7724             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7725             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7726             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7727             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7728             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7729             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7730             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7731             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7732             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7733             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7734             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7735             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7736             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7737             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7738             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7739             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7740             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7741             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7742             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7743           }
7744           break;
7745           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7746           switch(source[i]&0x3f)
7747           {
7748             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7749             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7750           }
7751           break;
7752           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7753           switch(source[i]&0x3f)
7754           {
7755             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7756             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7757           }
7758           break;
7759         }
7760         break;
7761       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7762       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7763       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7764       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7765       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7766       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7767       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7768       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7769       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7770       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7771       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7772       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7773       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7774       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7775       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7776       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7777       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7778       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7779       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7780       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7781       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7782       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7783       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7784       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7785       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7786       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7787       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7788       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7789       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7790       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7791       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7792       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7793       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7794       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7795       default: strcpy(insn[i],"???"); type=NI; break;
7796     }
7797     itype[i]=type;
7798     opcode2[i]=op2;
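    /* The switch above maps each primary opcode (and, where applicable, the
       secondary funct/fmt field) to a mnemonic in insn[] and to one of the
       itype categories (LOAD, STORE, ALU, CJUMP, ...) that the later passes
       dispatch on; opcode2[] keeps the secondary opcode.  Anything decoded
       as NI is treated as unsupported and can end the block scan early. */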
7799     /* Get registers/immediates */
7800     lt1[i]=0;
7801     us1[i]=0;
7802     us2[i]=0;
7803     dep1[i]=0;
7804     dep2[i]=0;
7805     switch(type) {
7806       case LOAD:
7807         rs1[i]=(source[i]>>21)&0x1f;
7808         rs2[i]=0;
7809         rt1[i]=(source[i]>>16)&0x1f;
7810         rt2[i]=0;
7811         imm[i]=(short)source[i];
7812         break;
7813       case STORE:
7814       case STORELR:
7815         rs1[i]=(source[i]>>21)&0x1f;
7816         rs2[i]=(source[i]>>16)&0x1f;
7817         rt1[i]=0;
7818         rt2[i]=0;
7819         imm[i]=(short)source[i];
7820         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7821         break;
7822       case LOADLR:
7823         // LWL/LWR only load part of the register,
7824         // therefore the target register must be treated as a source too
7825         rs1[i]=(source[i]>>21)&0x1f;
7826         rs2[i]=(source[i]>>16)&0x1f;
7827         rt1[i]=(source[i]>>16)&0x1f;
7828         rt2[i]=0;
7829         imm[i]=(short)source[i];
7830         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7831         if(op==0x26) dep1[i]=rt1[i]; // LWR
7832         break;
7833       case IMM16:
7834         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7835         else rs1[i]=(source[i]>>21)&0x1f;
7836         rs2[i]=0;
7837         rt1[i]=(source[i]>>16)&0x1f;
7838         rt2[i]=0;
7839         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7840           imm[i]=(unsigned short)source[i];
7841         }else{
7842           imm[i]=(short)source[i];
7843         }
7844         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7845         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7846         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7847         break;
7848       case UJUMP:
7849         rs1[i]=0;
7850         rs2[i]=0;
7851         rt1[i]=0;
7852         rt2[i]=0;
7853         // The JAL instruction writes to r31.
7854         if (op&1) {
7855           rt1[i]=31;
7856         }
7857         rs2[i]=CCREG;
7858         break;
7859       case RJUMP:
7860         rs1[i]=(source[i]>>21)&0x1f;
7861         rs2[i]=0;
7862         rt1[i]=0;
7863         rt2[i]=0;
7864         // The JALR instruction writes to r31.
7865         if (op2&1) {
7866           rt1[i]=31;   
7867         }
7868         rs2[i]=CCREG;
7869         break;
7870       case CJUMP:
7871         rs1[i]=(source[i]>>21)&0x1f;
7872         rs2[i]=(source[i]>>16)&0x1f;
7873         rt1[i]=0;
7874         rt2[i]=0;
7875         if(op&2) { // BGTZ/BLEZ
7876           rs2[i]=0;
7877         }
7878         us1[i]=rs1[i];
7879         us2[i]=rs2[i];
7880         likely[i]=op>>4;
7881         break;
7882       case SJUMP:
7883         rs1[i]=(source[i]>>21)&0x1f;
7884         rs2[i]=CCREG;
7885         rt1[i]=0;
7886         rt2[i]=0;
7887         us1[i]=rs1[i];
7888         if(op2&0x10) { // BxxAL
7889           rt1[i]=31;
7890           // NOTE: If the branch is not taken, r31 is still overwritten
7891         }
7892         likely[i]=(op2&2)>>1;
7893         break;
7894       case FJUMP:
7895         rs1[i]=FSREG;
7896         rs2[i]=CSREG;
7897         rt1[i]=0;
7898         rt2[i]=0;
7899         likely[i]=((source[i])>>17)&1;
7900         break;
7901       case ALU:
7902         rs1[i]=(source[i]>>21)&0x1f; // source
7903         rs2[i]=(source[i]>>16)&0x1f; // second source (e.g. the amount subtracted for SUB)
7904         rt1[i]=(source[i]>>11)&0x1f; // destination
7905         rt2[i]=0;
7906         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7907           us1[i]=rs1[i];us2[i]=rs2[i];
7908         }
7909         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7910           dep1[i]=rs1[i];dep2[i]=rs2[i];
7911         }
7912         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7913           dep1[i]=rs1[i];dep2[i]=rs2[i];
7914         }
7915         break;
7916       case MULTDIV:
7917         rs1[i]=(source[i]>>21)&0x1f; // source
7918         rs2[i]=(source[i]>>16)&0x1f; // divisor
7919         rt1[i]=HIREG;
7920         rt2[i]=LOREG;
7921         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7922           us1[i]=rs1[i];us2[i]=rs2[i];
7923         }
7924         break;
7925       case MOV:
7926         rs1[i]=0;
7927         rs2[i]=0;
7928         rt1[i]=0;
7929         rt2[i]=0;
7930         if(op2==0x10) rs1[i]=HIREG; // MFHI
7931         if(op2==0x11) rt1[i]=HIREG; // MTHI
7932         if(op2==0x12) rs1[i]=LOREG; // MFLO
7933         if(op2==0x13) rt1[i]=LOREG; // MTLO
7934         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7935         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7936         dep1[i]=rs1[i];
7937         break;
7938       case SHIFT:
7939         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7940         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7941         rt1[i]=(source[i]>>11)&0x1f; // destination
7942         rt2[i]=0;
7943         // DSLLV/DSRLV/DSRAV are 64-bit
7944         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7945         break;
7946       case SHIFTIMM:
7947         rs1[i]=(source[i]>>16)&0x1f;
7948         rs2[i]=0;
7949         rt1[i]=(source[i]>>11)&0x1f;
7950         rt2[i]=0;
7951         imm[i]=(source[i]>>6)&0x1f;
7952         // DSxx32 instructions
7953         if(op2>=0x3c) imm[i]|=0x20;
7954         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7955         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7956         break;
7957       case COP0:
7958         rs1[i]=0;
7959         rs2[i]=0;
7960         rt1[i]=0;
7961         rt2[i]=0;
7962         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7963         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7964         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7965         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7966         break;
7967       case COP1:
7968         rs1[i]=0;
7969         rs2[i]=0;
7970         rt1[i]=0;
7971         rt2[i]=0;
7972         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7973         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7974         if(op2==5) us1[i]=rs1[i]; // DMTC1
7975         rs2[i]=CSREG;
7976         break;
7977       case C1LS:
7978         rs1[i]=(source[i]>>21)&0x1F;
7979         rs2[i]=CSREG;
7980         rt1[i]=0;
7981         rt2[i]=0;
7982         imm[i]=(short)source[i];
7983         break;
7984       case FLOAT:
7985       case FCONV:
7986         rs1[i]=0;
7987         rs2[i]=CSREG;
7988         rt1[i]=0;
7989         rt2[i]=0;
7990         break;
7991       case FCOMP:
7992         rs1[i]=FSREG;
7993         rs2[i]=CSREG;
7994         rt1[i]=FSREG;
7995         rt2[i]=0;
7996         break;
7997       case SYSCALL:
7998         rs1[i]=CCREG;
7999         rs2[i]=0;
8000         rt1[i]=0;
8001         rt2[i]=0;
8002         break;
8003       default:
8004         rs1[i]=0;
8005         rs2[i]=0;
8006         rt1[i]=0;
8007         rt2[i]=0;
8008     }
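    /* For illustration only (not used by the code): the register/immediate
       fields extracted above sit at fixed bit positions -- rs(25:21),
       rt(20:16), rd(15:11), shamt(10:6), imm(15:0).  E.g. for the word
       0x8FA90010 (LW $t1,16($sp)):
         (0x8FA90010>>21)&0x1f = 29   -> rs  = $sp
         (0x8FA90010>>16)&0x1f = 9    -> rt  = $t1
         (short)0x8FA90010     = 16   -> imm (sign-extended)
       Roughly, us1/us2 mark sources whose upper 32 bits matter, and dep1/dep2
       mark operands whose upper half the result's upper half depends on. */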
8009     /* Calculate branch target addresses */
8010     if(type==UJUMP)
8011       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8012     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8013       ba[i]=start+i*4+8; // Ignore never taken branch
8014     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8015       ba[i]=start+i*4+8; // Ignore never taken branch
8016     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8017       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8018     else ba[i]=-1;
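    /* A standalone sketch of the same arithmetic, for reference only (pc is
       the address of the jump/branch itself, insn the raw 32-bit word):

         u_int jtype_target(u_int pc, u_int insn)
         {
           // Upper 4 bits come from the delay slot's address; the low 28 bits
           // are the 26-bit index field scaled by 4 -- same as (insn<<6)>>4.
           return ((pc+4)&0xF0000000) | ((insn&0x03FFFFFF)<<2);
         }
         u_int btype_target(u_int pc, u_int insn)
         {
           // Sign-extended 16-bit offset times 4, relative to the delay slot;
           // an offset of -1 therefore branches back to the branch itself.
           return pc+4+((int)(short)insn)*4;
         }

       ba[i] stays -1 for anything that is not a direct branch or jump. */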
8019     /* Is this the end of the block? */
8020     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8021       if(rt1[i-1]!=31) { // Not a subroutine call (JAL/JALR write r31 and the block continues past those)
8022         done=1;
8023         // Does the block continue due to a branch?
8024         for(j=i-1;j>=0;j--)
8025         {
8026           if(ba[j]==start+i*4+4) done=j=0;
8027           if(ba[j]==start+i*4+8) done=j=0;
8028         }
8029       }
8030       else {
8031         if(stop_after_jal) done=1;
8032         // Stop on BREAK
8033         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8034       }
8035       // Don't recompile stuff that's already compiled
8036       if(check_addr(start+i*4+4)) done=1;
8037       // Don't get too close to the limit
8038       if(i>MAXBLOCK/2) done=1;
8039     }
8040     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8041     assert(i<MAXBLOCK-1);
8042     if(start+i*4==pagelimit-4) done=1;
8043     assert(start+i*4<pagelimit);
8044     if (i==MAXBLOCK-1) done=1;
8045     // Stop if we're compiling junk
8046     if(itype[i]==NI&&opcode[i]==0x11) {
8047       done=stop_after_jal=1;
8048       printf("Disabled speculative precompilation\n");
8049     }
8050   }
8051   slen=i;
8052   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8053     if(start+i*4==pagelimit) {
8054       itype[i-1]=SPAN;
8055     }
8056   }
8057   assert(slen>0);
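  /* slen is the number of instructions decoded into this block.  If the scan
     ran into the page limit while the last instruction is a branch, that
     branch is retyped as SPAN so the branch/delay-slot pair straddling the
     page boundary gets special handling (pagespan_alloc below). */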
8058
8059   /* Pass 2 - Register dependencies and branch targets */
8060
8061   unneeded_registers(0,slen-1,0);
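  /* unneeded_registers() fills unneeded_reg[] / unneeded_reg_upper[] (and the
     branch_unneeded_reg variants) with per-instruction masks: bit r set means
     the value (or the upper 32 bits) of guest register r is dead at that
     point.  Bit 0 is always forced on below, since $zero never has to be
     preserved; the allocator consults these masks before keeping anything in
     a host register. */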
8062   
8063   /* Pass 3 - Register allocation */
8064
8065   struct regstat current; // Current register allocations/status
8066   current.is32=1;
8067   current.dirty=0;
8068   current.u=unneeded_reg[0];
8069   current.uu=unneeded_reg_upper[0];
8070   clear_all_regs(current.regmap);
8071   alloc_reg(&current,0,CCREG);
8072   dirty_reg(&current,CCREG);
8073   current.isconst=0;
8074   current.wasconst=0;
8075   int ds=0;
8076   int cc=0;
8077   int hr;
8078   
8079   provisional_32bit();
8080   
8081   if((u_int)addr&1) {
8082     // First instruction is delay slot
8083     cc=-1;
8084     bt[1]=1;
8085     ds=1;
8086     unneeded_reg[0]=1;
8087     unneeded_reg_upper[0]=1;
8088     current.regmap[HOST_BTREG]=BTREG;
8089   }
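  /* An odd start address appears to be used as a flag meaning "this block
     begins in a branch delay slot": instruction 0 is treated as the delay
     slot (ds=1), instruction 1 is marked as a branch target, and BTREG
     (presumably holding the branch target address) is pre-mapped to
     HOST_BTREG on entry. */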
8090   
8091   for(i=0;i<slen;i++)
8092   {
8093     if(bt[i])
8094     {
8095       int hr;
8096       for(hr=0;hr<HOST_REGS;hr++)
8097       {
8098         // Is this really necessary?
8099         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8100       }
8101       current.isconst=0;
8102     }
8103     if(i>1)
8104     {
8105       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8106       {
8107         if(rs1[i-2]==0||rs2[i-2]==0)
8108         {
8109           if(rs1[i-2]) {
8110             current.is32|=1LL<<rs1[i-2];
8111             int hr=get_reg(current.regmap,rs1[i-2]|64);
8112             if(hr>=0) current.regmap[hr]=-1;
8113           }
8114           if(rs2[i-2]) {
8115             current.is32|=1LL<<rs2[i-2];
8116             int hr=get_reg(current.regmap,rs2[i-2]|64);
8117             if(hr>=0) current.regmap[hr]=-1;
8118           }
8119         }
8120       }
8121     }
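    /* Reasoning behind the check above: if the instruction two slots back was
       a BNE/BNEL comparing a register against $zero and execution fell
       through to here, that register must have been zero, so it is safe to
       treat it as 32-bit and drop any mapping of its upper half. */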
8122     // If something jumps here with 64-bit values
8123     // then promote those registers to 64 bits
8124     if(bt[i])
8125     {
8126       uint64_t temp_is32=current.is32;
8127       for(j=i-1;j>=0;j--)
8128       {
8129         if(ba[j]==start+i*4) 
8130           temp_is32&=branch_regs[j].is32;
8131       }
8132       for(j=i;j<slen;j++)
8133       {
8134         if(ba[j]==start+i*4) 
8135           //temp_is32=1;
8136           temp_is32&=p32[j];
8137       }
8138       if(temp_is32!=current.is32) {
8139         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8140         #ifdef DESTRUCTIVE_WRITEBACK
8141         for(hr=0;hr<HOST_REGS;hr++)
8142         {
8143           int r=current.regmap[hr];
8144           if(r>0&&r<64)
8145           {
8146             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8147               temp_is32|=1LL<<r;
8148               //printf("restore %d\n",r);
8149             }
8150           }
8151         }
8152         #endif
8153         current.is32=temp_is32;
8154       }
8155     }
8156     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8157     regs[i].wasconst=current.isconst;
8158     regs[i].was32=current.is32;
8159     regs[i].wasdirty=current.dirty;
8160     #ifdef DESTRUCTIVE_WRITEBACK
8161     // To change a dirty register from 32 to 64 bits, we must write
8162     // it out during the previous cycle (for branches, 2 cycles)
8163     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8164     {
8165       uint64_t temp_is32=current.is32;
8166       for(j=i-1;j>=0;j--)
8167       {
8168         if(ba[j]==start+i*4+4) 
8169           temp_is32&=branch_regs[j].is32;
8170       }
8171       for(j=i;j<slen;j++)
8172       {
8173         if(ba[j]==start+i*4+4) 
8174           //temp_is32=1;
8175           temp_is32&=p32[j];
8176       }
8177       if(temp_is32!=current.is32) {
8178         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8179         for(hr=0;hr<HOST_REGS;hr++)
8180         {
8181           int r=current.regmap[hr];
8182           if(r>0)
8183           {
8184             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8185               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8186               {
8187                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8188                 {
8189                   //printf("dump %d/r%d\n",hr,r);
8190                   current.regmap[hr]=-1;
8191                   if(get_reg(current.regmap,r|64)>=0) 
8192                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8193                 }
8194               }
8195             }
8196           }
8197         }
8198       }
8199     }
8200     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8201     {
8202       uint64_t temp_is32=current.is32;
8203       for(j=i-1;j>=0;j--)
8204       {
8205         if(ba[j]==start+i*4+8) 
8206           temp_is32&=branch_regs[j].is32;
8207       }
8208       for(j=i;j<slen;j++)
8209       {
8210         if(ba[j]==start+i*4+8) 
8211           //temp_is32=1;
8212           temp_is32&=p32[j];
8213       }
8214       if(temp_is32!=current.is32) {
8215         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8216         for(hr=0;hr<HOST_REGS;hr++)
8217         {
8218           int r=current.regmap[hr];
8219           if(r>0)
8220           {
8221             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8222               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8223               {
8224                 //printf("dump %d/r%d\n",hr,r);
8225                 current.regmap[hr]=-1;
8226                 if(get_reg(current.regmap,r|64)>=0) 
8227                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8228               }
8229             }
8230           }
8231         }
8232       }
8233     }
8234     #endif
8235     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8236       if(i+1<slen) {
8237         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8238         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8239         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8240         current.u|=1;
8241         current.uu|=1;
8242       } else {
8243         current.u=1;
8244         current.uu=1;
8245       }
8246     } else {
8247       if(i+1<slen) {
8248         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8249         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8250         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8251         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8252         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8253         current.u|=1;
8254         current.uu|=1;
8255       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8256     }
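    /* In short, current.u / current.uu are the "unneeded from here on" masks.
       They start from the next instruction's unneeded_reg masks (or the
       branch variants), the bits for this instruction's own sources are then
       cleared because those values must still be available, and bit 0 is
       forced on since $zero never needs to be kept.  E.g. a mask of ...0110
       with rs1[i]=2 becomes ...0011. */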
8257     is_ds[i]=ds;
8258     if(ds) {
8259       ds=0; // Skip delay slot, already allocated as part of branch
8260       // ...but we need to alloc it in case something jumps here
8261       if(i+1<slen) {
8262         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8263         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8264       }else{
8265         current.u=branch_unneeded_reg[i-1];
8266         current.uu=branch_unneeded_reg_upper[i-1];
8267       }
8268       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8269       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8270       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8271       current.u|=1;
8272       current.uu|=1;
8273       struct regstat temp;
8274       memcpy(&temp,&current,sizeof(current));
8275       temp.wasdirty=temp.dirty;
8276       temp.was32=temp.is32;
8277       // TODO: Take into account unconditional branches, as below
8278       delayslot_alloc(&temp,i);
8279       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8280       regs[i].wasdirty=temp.wasdirty;
8281       regs[i].was32=temp.was32;
8282       regs[i].dirty=temp.dirty;
8283       regs[i].is32=temp.is32;
8284       regs[i].isconst=0;
8285       regs[i].wasconst=0;
8286       current.isconst=0;
8287       // Create entry (branch target) regmap
8288       for(hr=0;hr<HOST_REGS;hr++)
8289       {
8290         int r=temp.regmap[hr];
8291         if(r>=0) {
8292           if(r!=regmap_pre[i][hr]) {
8293             regs[i].regmap_entry[hr]=-1;
8294           }
8295           else
8296           {
8297             if(r<64){
8298               if((current.u>>r)&1) {
8299                 regs[i].regmap_entry[hr]=-1;
8300                 regs[i].regmap[hr]=-1;
8301                 //Don't clear regs in the delay slot as the branch might need them
8302                 //current.regmap[hr]=-1;
8303               }else
8304                 regs[i].regmap_entry[hr]=r;
8305             }
8306             else {
8307               if((current.uu>>(r&63))&1) {
8308                 regs[i].regmap_entry[hr]=-1;
8309                 regs[i].regmap[hr]=-1;
8310                 //Don't clear regs in the delay slot as the branch might need them
8311                 //current.regmap[hr]=-1;
8312               }else
8313                 regs[i].regmap_entry[hr]=r;
8314             }
8315           }
8316         } else {
8317           // First instruction expects CCREG to be allocated
8318           if(i==0&&hr==HOST_CCREG) 
8319             regs[i].regmap_entry[hr]=CCREG;
8320           else
8321             regs[i].regmap_entry[hr]=-1;
8322         }
8323       }
8324     }
8325     else { // Not delay slot
8326       switch(itype[i]) {
8327         case UJUMP:
8328           //current.isconst=0; // DEBUG
8329           //current.wasconst=0; // DEBUG
8330           //regs[i].wasconst=0; // DEBUG
8331           clear_const(&current,rt1[i]);
8332           alloc_cc(&current,i);
8333           dirty_reg(&current,CCREG);
8334           if (rt1[i]==31) {
8335             alloc_reg(&current,i,31);
8336             dirty_reg(&current,31);
8337             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8338             #ifdef REG_PREFETCH
8339             alloc_reg(&current,i,PTEMP);
8340             #endif
8341             //current.is32|=1LL<<rt1[i];
8342           }
8343           delayslot_alloc(&current,i+1);
8344           //current.isconst=0; // DEBUG
8345           ds=1;
8346           //printf("i=%d, isconst=%x\n",i,current.isconst);
8347           break;
8348         case RJUMP:
8349           //current.isconst=0;
8350           //current.wasconst=0;
8351           //regs[i].wasconst=0;
8352           clear_const(&current,rs1[i]);
8353           clear_const(&current,rt1[i]);
8354           alloc_cc(&current,i);
8355           dirty_reg(&current,CCREG);
8356           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8357             alloc_reg(&current,i,rs1[i]);
8358             if (rt1[i]==31) {
8359               alloc_reg(&current,i,31);
8360               dirty_reg(&current,31);
8361               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8362               #ifdef REG_PREFETCH
8363               alloc_reg(&current,i,PTEMP);
8364               #endif
8365             }
8366             #ifdef USE_MINI_HT
8367             if(rs1[i]==31) { // JALR
8368               alloc_reg(&current,i,RHASH);
8369               #ifndef HOST_IMM_ADDR32
8370               alloc_reg(&current,i,RHTBL);
8371               #endif
8372             }
8373             #endif
8374             delayslot_alloc(&current,i+1);
8375           } else {
8376             // The delay slot overwrites our source register,
8377             // allocate a temporary register to hold the old value.
8378             current.isconst=0;
8379             current.wasconst=0;
8380             regs[i].wasconst=0;
8381             delayslot_alloc(&current,i+1);
8382             current.isconst=0;
8383             alloc_reg(&current,i,RTEMP);
8384           }
8385           //current.isconst=0; // DEBUG
8386           ds=1;
8387           break;
8388         case CJUMP:
8389           //current.isconst=0;
8390           //current.wasconst=0;
8391           //regs[i].wasconst=0;
8392           clear_const(&current,rs1[i]);
8393           clear_const(&current,rs2[i]);
8394           if((opcode[i]&0x3E)==4) // BEQ/BNE
8395           {
8396             alloc_cc(&current,i);
8397             dirty_reg(&current,CCREG);
8398             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8399             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8400             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8401             {
8402               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8403               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8404             }
8405             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8406                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8407               // The delay slot overwrites one of our conditions.
8408               // Allocate the branch condition registers instead.
8409               // Note that such a sequence of instructions could
8410               // be considered a bug since the branch cannot be
8411               // re-executed if an exception occurs.
8412               current.isconst=0;
8413               current.wasconst=0;
8414               regs[i].wasconst=0;
8415               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8416               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8417               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8418               {
8419                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8420                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8421               }
8422             }
8423             else delayslot_alloc(&current,i+1);
8424           }
8425           else
8426           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8427           {
8428             alloc_cc(&current,i);
8429             dirty_reg(&current,CCREG);
8430             alloc_reg(&current,i,rs1[i]);
8431             if(!(current.is32>>rs1[i]&1))
8432             {
8433               alloc_reg64(&current,i,rs1[i]);
8434             }
8435             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8436               // The delay slot overwrites one of our conditions.
8437               // Allocate the branch condition registers instead.
8438               // Note that such a sequence of instructions could
8439               // be considered a bug since the branch cannot be
8440               // re-executed if an exception occurs.
8441               current.isconst=0;
8442               current.wasconst=0;
8443               regs[i].wasconst=0;
8444               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8445               if(!((current.is32>>rs1[i])&1))
8446               {
8447                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8448               }
8449             }
8450             else delayslot_alloc(&current,i+1);
8451           }
8452           else
8453           // Don't alloc the delay slot yet because we might not execute it
8454           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8455           {
8456             current.isconst=0;
8457             current.wasconst=0;
8458             regs[i].wasconst=0;
8459             alloc_cc(&current,i);
8460             dirty_reg(&current,CCREG);
8461             alloc_reg(&current,i,rs1[i]);
8462             alloc_reg(&current,i,rs2[i]);
8463             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8464             {
8465               alloc_reg64(&current,i,rs1[i]);
8466               alloc_reg64(&current,i,rs2[i]);
8467             }
8468           }
8469           else
8470           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8471           {
8472             current.isconst=0;
8473             current.wasconst=0;
8474             regs[i].wasconst=0;
8475             alloc_cc(&current,i);
8476             dirty_reg(&current,CCREG);
8477             alloc_reg(&current,i,rs1[i]);
8478             if(!(current.is32>>rs1[i]&1))
8479             {
8480               alloc_reg64(&current,i,rs1[i]);
8481             }
8482           }
8483           ds=1;
8484           //current.isconst=0;
8485           break;
8486         case SJUMP:
8487           //current.isconst=0;
8488           //current.wasconst=0;
8489           //regs[i].wasconst=0;
8490           clear_const(&current,rs1[i]);
8491           clear_const(&current,rt1[i]);
8492           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8493           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8494           {
8495             alloc_cc(&current,i);
8496             dirty_reg(&current,CCREG);
8497             alloc_reg(&current,i,rs1[i]);
8498             if(!(current.is32>>rs1[i]&1))
8499             {
8500               alloc_reg64(&current,i,rs1[i]);
8501             }
8502             if (rt1[i]==31) { // BLTZAL/BGEZAL
8503               alloc_reg(&current,i,31);
8504               dirty_reg(&current,31);
8505               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8506               //#ifdef REG_PREFETCH
8507               //alloc_reg(&current,i,PTEMP);
8508               //#endif
8509               //current.is32|=1LL<<rt1[i];
8510             }
8511             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8512               // The delay slot overwrites the branch condition.
8513               // Allocate the branch condition registers instead.
8514               // Note that such a sequence of instructions could
8515               // be considered a bug since the branch cannot be
8516               // re-executed if an exception occurs.
8517               current.isconst=0;
8518               current.wasconst=0;
8519               regs[i].wasconst=0;
8520               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8521               if(!((current.is32>>rs1[i])&1))
8522               {
8523                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8524               }
8525             }
8526             else delayslot_alloc(&current,i+1);
8527           }
8528           else
8529           // Don't alloc the delay slot yet because we might not execute it
8530           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8531           {
8532             current.isconst=0;
8533             current.wasconst=0;
8534             regs[i].wasconst=0;
8535             alloc_cc(&current,i);
8536             dirty_reg(&current,CCREG);
8537             alloc_reg(&current,i,rs1[i]);
8538             if(!(current.is32>>rs1[i]&1))
8539             {
8540               alloc_reg64(&current,i,rs1[i]);
8541             }
8542           }
8543           ds=1;
8544           //current.isconst=0;
8545           break;
8546         case FJUMP:
8547           current.isconst=0;
8548           current.wasconst=0;
8549           regs[i].wasconst=0;
8550           if(likely[i]==0) // BC1F/BC1T
8551           {
8552             // TODO: Theoretically we can run out of registers here on x86.
8553             // The delay slot can allocate up to six, and we need to check
8554             // CSREG before executing the delay slot.  Possibly we can drop
8555             // the cycle count and then reload it after checking that the
8556             // FPU is in a usable state, or don't do out-of-order execution.
8557             alloc_cc(&current,i);
8558             dirty_reg(&current,CCREG);
8559             alloc_reg(&current,i,FSREG);
8560             alloc_reg(&current,i,CSREG);
8561             if(itype[i+1]==FCOMP) {
8562               // The delay slot overwrites the branch condition.
8563               // Allocate the branch condition registers instead.
8564               // Note that such a sequence of instructions could
8565               // be considered a bug since the branch cannot be
8566               // re-executed if an exception occurs.
8567               alloc_cc(&current,i);
8568               dirty_reg(&current,CCREG);
8569               alloc_reg(&current,i,CSREG);
8570               alloc_reg(&current,i,FSREG);
8571             }
8572             else {
8573               delayslot_alloc(&current,i+1);
8574               alloc_reg(&current,i+1,CSREG);
8575             }
8576           }
8577           else
8578           // Don't alloc the delay slot yet because we might not execute it
8579           if(likely[i]) // BC1FL/BC1TL
8580           {
8581             alloc_cc(&current,i);
8582             dirty_reg(&current,CCREG);
8583             alloc_reg(&current,i,CSREG);
8584             alloc_reg(&current,i,FSREG);
8585           }
8586           ds=1;
8587           current.isconst=0;
8588           break;
8589         case IMM16:
8590           imm16_alloc(&current,i);
8591           break;
8592         case LOAD:
8593         case LOADLR:
8594           load_alloc(&current,i);
8595           break;
8596         case STORE:
8597         case STORELR:
8598           store_alloc(&current,i);
8599           break;
8600         case ALU:
8601           alu_alloc(&current,i);
8602           break;
8603         case SHIFT:
8604           shift_alloc(&current,i);
8605           break;
8606         case MULTDIV:
8607           multdiv_alloc(&current,i);
8608           break;
8609         case SHIFTIMM:
8610           shiftimm_alloc(&current,i);
8611           break;
8612         case MOV:
8613           mov_alloc(&current,i);
8614           break;
8615         case COP0:
8616           cop0_alloc(&current,i);
8617           break;
8618         case COP1:
8619           cop1_alloc(&current,i);
8620           break;
8621         case C1LS:
8622           c1ls_alloc(&current,i);
8623           break;
8624         case FCONV:
8625           fconv_alloc(&current,i);
8626           break;
8627         case FLOAT:
8628           float_alloc(&current,i);
8629           break;
8630         case FCOMP:
8631           fcomp_alloc(&current,i);
8632           break;
8633         case SYSCALL:
8634           syscall_alloc(&current,i);
8635           break;
8636         case SPAN:
8637           pagespan_alloc(&current,i);
8638           break;
8639       }
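      /* Each itype above has a dedicated allocator (load_alloc, store_alloc,
         alu_alloc, ...) that claims host registers in current for that
         instruction's operands and results.  The branch cases also allocate
         the delay slot up front, except for "likely" branches, whose delay
         slot only executes on the taken path and is handled later. */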
8640       
8641       // Drop the upper half of registers that have become 32-bit
8642       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8643       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8644         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8645         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8646         current.uu|=1;
8647       } else {
8648         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8649         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8650         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8651         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8652         current.uu|=1;
8653       }
8654
8655       // Create entry (branch target) regmap
8656       for(hr=0;hr<HOST_REGS;hr++)
8657       {
8658         int r,or,er;
8659         r=current.regmap[hr];
8660         if(r>=0) {
8661           if(r!=regmap_pre[i][hr]) {
8662             // TODO: delay slot (?)
8663             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8664             if(or<0||(r&63)>=TEMPREG){
8665               regs[i].regmap_entry[hr]=-1;
8666             }
8667             else
8668             {
8669               // Just move it to a different register
8670               regs[i].regmap_entry[hr]=r;
8671               // If it was dirty before, it's still dirty
8672               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8673             }
8674           }
8675           else
8676           {
8677             // Unneeded
8678             if(r==0){
8679               regs[i].regmap_entry[hr]=0;
8680             }
8681             else
8682             if(r<64){
8683               if((current.u>>r)&1) {
8684                 regs[i].regmap_entry[hr]=-1;
8685                 //regs[i].regmap[hr]=-1;
8686                 current.regmap[hr]=-1;
8687               }else
8688                 regs[i].regmap_entry[hr]=r;
8689             }
8690             else {
8691               if((current.uu>>(r&63))&1) {
8692                 regs[i].regmap_entry[hr]=-1;
8693                 //regs[i].regmap[hr]=-1;
8694                 current.regmap[hr]=-1;
8695               }else
8696                 regs[i].regmap_entry[hr]=r;
8697             }
8698           }
8699         } else {
8700           // Branches expect CCREG to be allocated at the target
8701           if(regmap_pre[i][hr]==CCREG) 
8702             regs[i].regmap_entry[hr]=CCREG;
8703           else
8704             regs[i].regmap_entry[hr]=-1;
8705         }
8706       }
8707       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8708     }
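    /* The "Branch post-alloc" step below fills in branch_regs[i-1]: the
       register state that applies on the taken path of the branch at i-1,
       now that its delay slot (the current instruction) has been processed.
       For likely branches the delay slot is allocated on the taken path only,
       while the not-taken path just keeps the cycle count register. */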
8709     /* Branch post-alloc */
8710     if(i>0)
8711     {
8712       current.was32=current.is32;
8713       current.wasdirty=current.dirty;
8714       switch(itype[i-1]) {
8715         case UJUMP:
8716           memcpy(&branch_regs[i-1],&current,sizeof(current));
8717           branch_regs[i-1].isconst=0;
8718           branch_regs[i-1].wasconst=0;
8719           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8720           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8721           alloc_cc(&branch_regs[i-1],i-1);
8722           dirty_reg(&branch_regs[i-1],CCREG);
8723           if(rt1[i-1]==31) { // JAL
8724             alloc_reg(&branch_regs[i-1],i-1,31);
8725             dirty_reg(&branch_regs[i-1],31);
8726             branch_regs[i-1].is32|=1LL<<31;
8727           }
8728           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8729           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8730           break;
8731         case RJUMP:
8732           memcpy(&branch_regs[i-1],&current,sizeof(current));
8733           branch_regs[i-1].isconst=0;
8734           branch_regs[i-1].wasconst=0;
8735           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8736           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8737           alloc_cc(&branch_regs[i-1],i-1);
8738           dirty_reg(&branch_regs[i-1],CCREG);
8739           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8740           if(rt1[i-1]==31) { // JALR
8741             alloc_reg(&branch_regs[i-1],i-1,31);
8742             dirty_reg(&branch_regs[i-1],31);
8743             branch_regs[i-1].is32|=1LL<<31;
8744           }
8745           #ifdef USE_MINI_HT
8746           if(rs1[i-1]==31) { // JALR
8747             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8748             #ifndef HOST_IMM_ADDR32
8749             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8750             #endif
8751           }
8752           #endif
8753           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8754           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8755           break;
8756         case CJUMP:
8757           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8758           {
8759             alloc_cc(&current,i-1);
8760             dirty_reg(&current,CCREG);
8761             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8762                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8763               // The delay slot overwrote one of our conditions
8764               // Delay slot goes after the test (in order)
8765               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8766               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8767               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8768               current.u|=1;
8769               current.uu|=1;
8770               delayslot_alloc(&current,i);
8771               current.isconst=0;
8772             }
8773             else
8774             {
8775               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8776               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8777               // Alloc the branch condition registers
8778               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8779               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8780               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8781               {
8782                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8783                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8784               }
8785             }
8786             memcpy(&branch_regs[i-1],&current,sizeof(current));
8787             branch_regs[i-1].isconst=0;
8788             branch_regs[i-1].wasconst=0;
8789             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8790             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8791           }
8792           else
8793           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8794           {
8795             alloc_cc(&current,i-1);
8796             dirty_reg(&current,CCREG);
8797             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8798               // The delay slot overwrote the branch condition
8799               // Delay slot goes after the test (in order)
8800               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8801               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8802               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8803               current.u|=1;
8804               current.uu|=1;
8805               delayslot_alloc(&current,i);
8806               current.isconst=0;
8807             }
8808             else
8809             {
8810               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8811               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8812               // Alloc the branch condition register
8813               alloc_reg(&current,i-1,rs1[i-1]);
8814               if(!(current.is32>>rs1[i-1]&1))
8815               {
8816                 alloc_reg64(&current,i-1,rs1[i-1]);
8817               }
8818             }
8819             memcpy(&branch_regs[i-1],&current,sizeof(current));
8820             branch_regs[i-1].isconst=0;
8821             branch_regs[i-1].wasconst=0;
8822             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8823             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8824           }
8825           else
8826           // Alloc the delay slot in case the branch is taken
8827           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8828           {
8829             memcpy(&branch_regs[i-1],&current,sizeof(current));
8830             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8831             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8832             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8833             alloc_cc(&branch_regs[i-1],i);
8834             dirty_reg(&branch_regs[i-1],CCREG);
8835             delayslot_alloc(&branch_regs[i-1],i);
8836             branch_regs[i-1].isconst=0;
8837             alloc_reg(&current,i,CCREG); // Not taken path
8838             dirty_reg(&current,CCREG);
8839             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8840           }
8841           else
8842           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8843           {
8844             memcpy(&branch_regs[i-1],&current,sizeof(current));
8845             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8846             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8847             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8848             alloc_cc(&branch_regs[i-1],i);
8849             dirty_reg(&branch_regs[i-1],CCREG);
8850             delayslot_alloc(&branch_regs[i-1],i);
8851             branch_regs[i-1].isconst=0;
8852             alloc_reg(&current,i,CCREG); // Not taken path
8853             dirty_reg(&current,CCREG);
8854             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8855           }
8856           break;
8857         case SJUMP:
8858           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8859           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8860           {
8861             alloc_cc(&current,i-1);
8862             dirty_reg(&current,CCREG);
8863             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8864               // The delay slot overwrote the branch condition
8865               // Delay slot goes after the test (in order)
8866               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8867               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8868               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8869               current.u|=1;
8870               current.uu|=1;
8871               delayslot_alloc(&current,i);
8872               current.isconst=0;
8873             }
8874             else
8875             {
8876               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8877               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8878               // Alloc the branch condition register
8879               alloc_reg(&current,i-1,rs1[i-1]);
8880               if(!(current.is32>>rs1[i-1]&1))
8881               {
8882                 alloc_reg64(&current,i-1,rs1[i-1]);
8883               }
8884             }
8885             memcpy(&branch_regs[i-1],&current,sizeof(current));
8886             branch_regs[i-1].isconst=0;
8887             branch_regs[i-1].wasconst=0;
8888             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8889             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8890           }
8891           else
8892           // Alloc the delay slot in case the branch is taken
8893           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8894           {
8895             memcpy(&branch_regs[i-1],&current,sizeof(current));
8896             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8897             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8898             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8899             alloc_cc(&branch_regs[i-1],i);
8900             dirty_reg(&branch_regs[i-1],CCREG);
8901             delayslot_alloc(&branch_regs[i-1],i);
8902             branch_regs[i-1].isconst=0;
8903             alloc_reg(&current,i,CCREG); // Not taken path
8904             dirty_reg(&current,CCREG);
8905             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8906           }
8907           // FIXME: BLTZAL/BGEZAL
8908           if(opcode2[i-1]&0x10) { // BxxZAL
8909             alloc_reg(&branch_regs[i-1],i-1,31);
8910             dirty_reg(&branch_regs[i-1],31);
8911             branch_regs[i-1].is32|=1LL<<31;
8912           }
8913           break;
8914         case FJUMP:
8915           if(likely[i-1]==0) // BC1F/BC1T
8916           {
8917             alloc_cc(&current,i-1);
8918             dirty_reg(&current,CCREG);
8919             if(itype[i]==FCOMP) {
8920               // The delay slot overwrote the branch condition
8921               // Delay slot goes after the test (in order)
8922               delayslot_alloc(&current,i);
8923               current.isconst=0;
8924             }
8925             else
8926             {
8927               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8928               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8929               // Alloc the branch condition register
8930               alloc_reg(&current,i-1,FSREG);
8931             }
8932             memcpy(&branch_regs[i-1],&current,sizeof(current));
8933             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8934           }
8935           else // BC1FL/BC1TL
8936           {
8937             // Alloc the delay slot in case the branch is taken
8938             memcpy(&branch_regs[i-1],&current,sizeof(current));
8939             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8940             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8941             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8942             alloc_cc(&branch_regs[i-1],i);
8943             dirty_reg(&branch_regs[i-1],CCREG);
8944             delayslot_alloc(&branch_regs[i-1],i);
8945             branch_regs[i-1].isconst=0;
8946             alloc_reg(&current,i,CCREG); // Not taken path
8947             dirty_reg(&current,CCREG);
8948             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8949           }
8950           break;
8951       }
8952
8953       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8954       {
8955         if(rt1[i-1]==31) // JAL/JALR
8956         {
8957           // Subroutine call will return here, don't alloc any registers
8958           current.is32=1;
8959           current.dirty=0;
8960           clear_all_regs(current.regmap);
8961           alloc_reg(&current,i,CCREG);
8962           dirty_reg(&current,CCREG);
8963         }
8964         else if(i+1<slen)
8965         {
8966           // Internal branch will jump here, match registers to caller
8967           current.is32=0x3FFFFFFFFLL;
8968           current.dirty=0;
8969           clear_all_regs(current.regmap);
8970           alloc_reg(&current,i,CCREG);
8971           dirty_reg(&current,CCREG);
8972           for(j=i-1;j>=0;j--)
8973           {
8974             if(ba[j]==start+i*4+4) {
8975               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8976               current.is32=branch_regs[j].is32;
8977               current.dirty=branch_regs[j].dirty;
8978               break;
8979             }
8980           }
8981           while(j>=0) {
8982             if(ba[j]==start+i*4+4) {
8983               for(hr=0;hr<HOST_REGS;hr++) {
8984                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8985                   current.regmap[hr]=-1;
8986                 }
8987                 current.is32&=branch_regs[j].is32;
8988                 current.dirty&=branch_regs[j].dirty;
8989               }
8990             }
8991             j--;
8992           }
8993         }
8994       }
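      /* Note on the merge loop above: when several earlier branches target
         the instruction following a jump, only host-register mappings that
         all of them agree on are kept; anything that differs is dropped and
         is32/dirty are intersected, so the entry state is valid whichever
         path was taken. */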
8995     }
8996
8997     // Count cycles in between branches
8998     ccadj[i]=cc;
8999     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL))
9000     {
9001       cc=0;
9002     }
9003     else
9004     {
9005       cc++;
9006     }
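    /* ccadj[i] is the number of instructions issued since the last branch,
       jump or syscall boundary: cc restarts at 0 right after such an
       instruction and increments otherwise, presumably so the cycle counter
       only needs to be adjusted at branches rather than per instruction. */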
9007
9008     flush_dirty_uppers(&current);
9009     if(!is_ds[i]) {
9010       regs[i].is32=current.is32;
9011       regs[i].dirty=current.dirty;
9012       regs[i].isconst=current.isconst;
9013       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9014     }
9015     for(hr=0;hr<HOST_REGS;hr++) {
9016       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9017         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9018           regs[i].wasconst&=~(1<<hr);
9019         }
9020       }
9021     }
9022     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9023   }
9024   
9025   /* Pass 4 - Cull unused host registers */
9026   
9027   uint64_t nr=0;
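  /* nr is a bitmask over host registers (bit hr set = the value currently in
     host register hr is still needed).  The loop below walks the block
     backwards, seeding nr from branch targets and following instructions,
     setting bits for source registers and clearing bits for overwritten
     ones; the result is stored in needed_reg[i] and anything not marked as
     needed is then deallocated. */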
9028   
9029   for (i=slen-1;i>=0;i--)
9030   {
9031     int hr;
9032     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9033     {
9034       if(ba[i]<start || ba[i]>=(start+slen*4))
9035       {
9036         // Branch out of this block, don't need anything
9037         nr=0;
9038       }
9039       else
9040       {
9041         // Internal branch
9042         // Need whatever matches the target
9043         nr=0;
9044         int t=(ba[i]-start)>>2;
9045         for(hr=0;hr<HOST_REGS;hr++)
9046         {
9047           if(regs[i].regmap_entry[hr]>=0) {
9048             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9049           }
9050         }
9051       }
9052       // Conditional branch may need registers for following instructions
9053       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9054       {
9055         if(i<slen-2) {
9056           nr|=needed_reg[i+2];
9057           for(hr=0;hr<HOST_REGS;hr++)
9058           {
9059             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9060             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9061           }
9062         }
9063       }
9064       // Don't need stuff which is overwritten (checked for every host register)
9065       for(hr=0;hr<HOST_REGS;hr++)
9066         if(regs[i].regmap[hr]!=regmap_pre[i][hr]||regs[i].regmap[hr]<0) nr&=~(1<<hr);
9067       // Merge in delay slot
9068       for(hr=0;hr<HOST_REGS;hr++)
9069       {
9070         if(!likely[i]) {
9071           // These are overwritten unless the branch is "likely"
9072           // and the delay slot is nullified if not taken
9073           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9074           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9075         }
9076         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9077         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9078         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9079         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9080         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9081         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9082         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9083         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9084         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9085           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9086           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9087         }
9088         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9089           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9090           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9091         }
9092         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9093           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9094           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9095         }
9096       }
9097     }
9098     else if(itype[i]==SYSCALL)
9099     {
9100       // SYSCALL instruction (software interrupt)
9101       nr=0;
9102     }
9103     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9104     {
9105       // ERET instruction (return from interrupt)
9106       nr=0;
9107     }
9108     else // Non-branch
9109     {
9110       if(i<slen-1) {
9111         for(hr=0;hr<HOST_REGS;hr++) {
9112           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9113           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9114           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9115           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9116         }
9117       }
9118     }
9119     for(hr=0;hr<HOST_REGS;hr++)
9120     {
9121       // Overwritten registers are not needed
9122       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9123       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9124       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9125       // Source registers are needed
9126       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9127       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9128       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9129       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9130       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9131       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9132       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9133       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9134       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9135         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9136         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9137       }
9138       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9139         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9140         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9141       }
9142       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9143         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9144         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9145       }
9146       // Don't store a register immediately after writing it,
9147       // as that may prevent dual-issue.
9148       // But do so if this is a branch target, otherwise we
9149       // might have to load the register before the branch.
9150       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9151         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9152            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9153           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9154           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9155         }
9156         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9157            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9158           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9159           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9160         }
9161       }
9162     }
9163     // Cycle count is needed at branches.  Assume it is needed at the target too.
9164     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9165       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9166       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9167     }
9168     // Save it
9169     needed_reg[i]=nr;
9170     
9171     // Deallocate unneeded registers
9172     for(hr=0;hr<HOST_REGS;hr++)
9173     {
9174       if(!((nr>>hr)&1)) {
9175         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9176         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9177            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9178            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9179         {
9180           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9181           {
9182             if(likely[i]) {
9183               regs[i].regmap[hr]=-1;
9184               regs[i].isconst&=~(1<<hr);
9185               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9186             }
9187           }
9188         }
9189         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9190         {
9191           int d1=0,d2=0,map=0,temp=0;
9192           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9193           {
9194             d1=dep1[i+1];
9195             d2=dep2[i+1];
9196           }
9197           if(using_tlb) {
9198             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9199                itype[i+1]==STORE || itype[i+1]==STORELR ||
9200                itype[i+1]==C1LS )
9201             map=TLREG;
9202           } else
9203           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9204             map=INVCP;
9205           }
9206           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9207              itype[i+1]==C1LS )
9208             temp=FTEMP;
9209           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9210              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9211              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9212              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9213              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9214              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9215              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9216              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9217              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9218              regs[i].regmap[hr]!=map )
9219           {
9220             regs[i].regmap[hr]=-1;
9221             regs[i].isconst&=~(1<<hr);
9222             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9223                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9224                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9225                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9226                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9227                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9228                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9229                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9230                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9231                branch_regs[i].regmap[hr]!=map)
9232             {
9233               branch_regs[i].regmap[hr]=-1;
9234               branch_regs[i].regmap_entry[hr]=-1;
9235               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9236               {
9237                 if(!likely[i]&&i<slen-2) {
9238                   regmap_pre[i+2][hr]=-1;
9239                 }
9240               }
9241             }
9242           }
9243         }
9244         else
9245         {
9246           // Non-branch
9247           if(i>0)
9248           {
9249             int d1=0,d2=0,map=-1,temp=-1;
9250             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9251             {
9252               d1=dep1[i];
9253               d2=dep2[i];
9254             }
9255             if(using_tlb) {
9256               if(itype[i]==LOAD || itype[i]==LOADLR ||
9257                  itype[i]==STORE || itype[i]==STORELR ||
9258                  itype[i]==C1LS )
9259               map=TLREG;
9260             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9261               map=INVCP;
9262             }
9263             if(itype[i]==LOADLR || itype[i]==STORELR ||
9264                itype[i]==C1LS )
9265               temp=FTEMP;
9266             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9267                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9268                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9269                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9270                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9271                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9272             {
9273               if(i<slen-1&&!is_ds[i]) {
9274                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9275                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9276                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9277                 {
9278                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9279                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9280                 }
9281                 regmap_pre[i+1][hr]=-1;
9282                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9283               }
9284               regs[i].regmap[hr]=-1;
9285               regs[i].isconst&=~(1<<hr);
9286             }
9287           }
9288         }
9289       }
9290     }
9291   }
9292   
9293   /* Pass 5 - Pre-allocate registers */
9294   
9295   // If a register is allocated during a loop, try to allocate it for the
9296   // entire loop, if possible.  This avoids loading/storing registers
9297 // inside the loop.
9298
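  // f_regmap records, for each host register, the MIPS register we would like
  // it to keep across a loop.  When a backward branch is found below, the code
  // walks from the branch target to the branch and, if a consistent mapping
  // exists, rewrites regmap_entry/regmap/regmap_pre over that range so the
  // value stays register-resident for the whole loop.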
9299   signed char f_regmap[HOST_REGS];
9300   clear_all_regs(f_regmap);
9301   for(i=0;i<slen-1;i++)
9302   {
9303     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9304     {
9305       if(ba[i]>=start && ba[i]<(start+i*4)) 
9306       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9307       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9308       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9309       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9310       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9311       {
9312         int t=(ba[i]-start)>>2;
9313         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9314         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9315         for(hr=0;hr<HOST_REGS;hr++)
9316         {
9317           if(regs[i].regmap[hr]>64) {
9318             if(!((regs[i].dirty>>hr)&1))
9319               f_regmap[hr]=regs[i].regmap[hr];
9320             else f_regmap[hr]=-1;
9321           }
9322           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9323           if(branch_regs[i].regmap[hr]>64) {
9324             if(!((branch_regs[i].dirty>>hr)&1))
9325               f_regmap[hr]=branch_regs[i].regmap[hr];
9326             else f_regmap[hr]=-1;
9327           }
9328           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9329           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9330           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9331           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9332           {
9333             // Test both in case the delay slot is out-of-order (ooo),
9334             // could be done better...
9335             if(count_free_regs(branch_regs[i].regmap)<2
9336              ||count_free_regs(regs[i].regmap)<2) 
9337               f_regmap[hr]=branch_regs[i].regmap[hr];
9338           }
9339           // Avoid dirty->clean transition
9340           // #ifdef DESTRUCTIVE_WRITEBACK here?
9341           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9342           if(f_regmap[hr]>0) {
9343             if(regs[t].regmap_entry[hr]<0) {
9344               int r=f_regmap[hr];
9345               for(j=t;j<=i;j++)
9346               {
9347                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9348                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9349                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9350                 if(r>63) {
9351                   // NB This can exclude the case where the upper-half
9352                   // register is lower numbered than the lower-half
9353                   // register.  Not sure if it's worth fixing...
9354                   if(get_reg(regs[j].regmap,r&63)<0) break;
9355                   if(regs[j].is32&(1LL<<(r&63))) break;
9356                 }
9357                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9358                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9359                   int k;
9360                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9361                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9362                     if(r>63) {
9363                       if(get_reg(regs[i].regmap,r&63)<0) break;
9364                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9365                     }
9366                     k=i;
9367                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9368                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9369                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9370                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9371                       ||itype[k-1]==FCOMP) {
9372                         if(count_free_regs(regs[k-1].regmap)<2) {
9373                           //printf("no free regs for store %x\n",start+(k-1)*4);
9374                           break;
9375                         }
9376                       }
9377                       else
9378                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9379                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9380                         //printf("no-match due to different register\n");
9381                         break;
9382                       }
9383                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9384                         //printf("no-match due to branch\n");
9385                         break;
9386                       }
9387                       // call/ret fast path assumes no registers allocated
9388                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9389                         break;
9390                       }
9391                       if(r>63) {
9392                         // NB This can exclude the case where the upper-half
9393                         // register is lower numbered than the lower-half
9394                         // register.  Not sure if it's worth fixing...
9395                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9396                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9397                       }
9398                       k--;
9399                     }
9400                     if(i<slen-1) {
9401                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9402                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9403                         //printf("bad match after branch\n");
9404                         break;
9405                       }
9406                     }
9407                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9408                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9409                       while(k<i) {
9410                         regs[k].regmap_entry[hr]=f_regmap[hr];
9411                         regs[k].regmap[hr]=f_regmap[hr];
9412                         regmap_pre[k+1][hr]=f_regmap[hr];
9413                         regs[k].wasdirty&=~(1<<hr);
9414                         regs[k].dirty&=~(1<<hr);
9415                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9416                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9417                         regs[k].wasconst&=~(1<<hr);
9418                         regs[k].isconst&=~(1<<hr);
9419                         k++;
9420                       }
9421                     }
9422                     else {
9423                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9424                       break;
9425                     }
9426                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9427                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9428                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9429                       regs[i].regmap_entry[hr]=f_regmap[hr];
9430                       regs[i].regmap[hr]=f_regmap[hr];
9431                       regs[i].wasdirty&=~(1<<hr);
9432                       regs[i].dirty&=~(1<<hr);
9433                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9434                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9435                       regs[i].wasconst&=~(1<<hr);
9436                       regs[i].isconst&=~(1<<hr);
9437                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9438                       branch_regs[i].wasdirty&=~(1<<hr);
9439                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9440                       branch_regs[i].regmap[hr]=f_regmap[hr];
9441                       branch_regs[i].dirty&=~(1<<hr);
9442                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9443                       branch_regs[i].wasconst&=~(1<<hr);
9444                       branch_regs[i].isconst&=~(1<<hr);
9445                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9446                         regmap_pre[i+2][hr]=f_regmap[hr];
9447                         regs[i+2].wasdirty&=~(1<<hr);
9448                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9449                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9450                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9451                       }
9452                     }
9453                   }
9454                   for(k=t;k<j;k++) {
9455                     regs[k].regmap_entry[hr]=f_regmap[hr];
9456                     regs[k].regmap[hr]=f_regmap[hr];
9457                     regmap_pre[k+1][hr]=f_regmap[hr];
9458                     regs[k+1].wasdirty&=~(1<<hr);
9459                     regs[k].dirty&=~(1<<hr);
9460                     regs[k].wasconst&=~(1<<hr);
9461                     regs[k].isconst&=~(1<<hr);
9462                   }
9463                   if(regs[j].regmap[hr]==f_regmap[hr])
9464                     regs[j].regmap_entry[hr]=f_regmap[hr];
9465                   break;
9466                 }
9467                 if(j==i) break;
9468                 if(regs[j].regmap[hr]>=0)
9469                   break;
9470                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9471                   //printf("no-match due to different register\n");
9472                   break;
9473                 }
9474                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9475                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9476                   break;
9477                 }
9478                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9479                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9480                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9481                   if(count_free_regs(regs[j].regmap)<2) {
9482                     //printf("No free regs for store %x\n",start+j*4);
9483                     break;
9484                   }
9485                 }
9486                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9487                 if(f_regmap[hr]>=64) {
9488                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9489                     break;
9490                   }
9491                   else
9492                   {
9493                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9494                       break;
9495                     }
9496                   }
9497                 }
9498               }
9499             }
9500           }
9501         }
9502       }
9503     }else{
9504       int count=0;
9505       for(hr=0;hr<HOST_REGS;hr++)
9506       {
9507         if(hr!=EXCLUDE_REG) {
9508           if(regs[i].regmap[hr]>64) {
9509             if(!((regs[i].dirty>>hr)&1))
9510               f_regmap[hr]=regs[i].regmap[hr];
9511           }
9512           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9513           else if(regs[i].regmap[hr]<0) count++;
9514         }
9515       }
9516       // Try to restore cycle count at branch targets
9517       if(bt[i]) {
9518         for(j=i;j<slen-1;j++) {
9519           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9520           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9521           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9522           ||itype[j]==FCOMP||itype[j]==FCONV) {
9523             if(count_free_regs(regs[j].regmap)<2) {
9524               //printf("no free regs for store %x\n",start+j*4);
9525               break;
9526             }
9527           }
9528           else
9529           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9530         }
9531         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9532           int k=i;
9533           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9534           while(k<j) {
9535             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9536             regs[k].regmap[HOST_CCREG]=CCREG;
9537             regmap_pre[k+1][HOST_CCREG]=CCREG;
9538             regs[k+1].wasdirty|=1<<HOST_CCREG;
9539             regs[k].dirty|=1<<HOST_CCREG;
9540             regs[k].wasconst&=~(1<<HOST_CCREG);
9541             regs[k].isconst&=~(1<<HOST_CCREG);
9542             k++;
9543           }
9544           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9545         }
9546         // Work backwards from the branch target
9547         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9548         {
9549           //printf("Extend backwards\n");
9550           int k;
9551           k=i;
9552           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9553             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9554             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9555             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9556               if(count_free_regs(regs[k-1].regmap)<2) {
9557                 //printf("no free regs for store %x\n",start+(k-1)*4);
9558                 break;
9559               }
9560             }
9561             else
9562             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9563             k--;
9564           }
9565           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9566             //printf("Extend CC, %x ->\n",start+k*4);
9567             while(k<=i) {
9568               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9569               regs[k].regmap[HOST_CCREG]=CCREG;
9570               regmap_pre[k+1][HOST_CCREG]=CCREG;
9571               regs[k+1].wasdirty|=1<<HOST_CCREG;
9572               regs[k].dirty|=1<<HOST_CCREG;
9573               regs[k].wasconst&=~(1<<HOST_CCREG);
9574               regs[k].isconst&=~(1<<HOST_CCREG);
9575               k++;
9576             }
9577           }
9578           else {
9579             //printf("Fail Extend CC, %x ->\n",start+k*4);
9580           }
9581         }
9582       }
9583       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9584          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9585          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9586          itype[i]!=FCONV&&itype[i]!=FCOMP)
9587       {
9588         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9589       }
9590     }
9591   }
9592   
9593   // This allocates registers (if possible) one instruction prior
9594   // to use, which can avoid a load-use penalty on certain CPUs.
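  // For example, if instruction i+1 reads a source that is mapped to a host
  // register which is still free at instruction i, the mapping is copied back
  // one slot (regmap, regmap_pre and regmap_entry are all updated) so the
  // value can be loaded a cycle earlier.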
9595   for(i=0;i<slen-1;i++)
9596   {
9597     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9598     {
9599       if(!bt[i+1])
9600       {
9601         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9602         {
9603           if(rs1[i+1]) {
9604             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9605             {
9606               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9607               {
9608                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9609                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9610                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9611                 regs[i].isconst&=~(1<<hr);
9612                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9613                 constmap[i][hr]=constmap[i+1][hr];
9614                 regs[i+1].wasdirty&=~(1<<hr);
9615                 regs[i].dirty&=~(1<<hr);
9616               }
9617             }
9618           }
9619           if(rs2[i+1]) {
9620             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9621             {
9622               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9623               {
9624                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9625                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9626                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9627                 regs[i].isconst&=~(1<<hr);
9628                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9629                 constmap[i][hr]=constmap[i+1][hr];
9630                 regs[i+1].wasdirty&=~(1<<hr);
9631                 regs[i].dirty&=~(1<<hr);
9632               }
9633             }
9634           }
9635           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9636             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9637             {
9638               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9639               {
9640                 regs[i].regmap[hr]=rs1[i+1];
9641                 regmap_pre[i+1][hr]=rs1[i+1];
9642                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9643                 regs[i].isconst&=~(1<<hr);
9644                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9645                 constmap[i][hr]=constmap[i+1][hr];
9646                 regs[i+1].wasdirty&=~(1<<hr);
9647                 regs[i].dirty&=~(1<<hr);
9648               }
9649             }
9650           }
9651           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9652             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9653             {
9654               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9655               {
9656                 regs[i].regmap[hr]=rs1[i+1];
9657                 regmap_pre[i+1][hr]=rs1[i+1];
9658                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9659                 regs[i].isconst&=~(1<<hr);
9660                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9661                 constmap[i][hr]=constmap[i+1][hr];
9662                 regs[i+1].wasdirty&=~(1<<hr);
9663                 regs[i].dirty&=~(1<<hr);
9664               }
9665             }
9666           }
9667           #ifndef HOST_IMM_ADDR32
9668           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9669             hr=get_reg(regs[i+1].regmap,TLREG);
9670             if(hr>=0) {
9671               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9672               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9673                 int nr;
9674                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9675                 {
9676                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9677                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9678                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9679                   regs[i].isconst&=~(1<<hr);
9680                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9681                   constmap[i][hr]=constmap[i+1][hr];
9682                   regs[i+1].wasdirty&=~(1<<hr);
9683                   regs[i].dirty&=~(1<<hr);
9684                 }
9685                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9686                 {
9687                   // move it to another register
9688                   regs[i+1].regmap[hr]=-1;
9689                   regmap_pre[i+2][hr]=-1;
9690                   regs[i+1].regmap[nr]=TLREG;
9691                   regmap_pre[i+2][nr]=TLREG;
9692                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9693                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9694                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9695                   regs[i].isconst&=~(1<<nr);
9696                   regs[i+1].isconst&=~(1<<nr);
9697                   regs[i].dirty&=~(1<<nr);
9698                   regs[i+1].wasdirty&=~(1<<nr);
9699                   regs[i+1].dirty&=~(1<<nr);
9700                   regs[i+2].wasdirty&=~(1<<nr);
9701                 }
9702               }
9703             }
9704           }
9705           #endif
9706           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9707             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9708               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9709               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9710               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9711               assert(hr>=0);
9712               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9713               {
9714                 regs[i].regmap[hr]=rs1[i+1];
9715                 regmap_pre[i+1][hr]=rs1[i+1];
9716                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9717                 regs[i].isconst&=~(1<<hr);
9718                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9719                 constmap[i][hr]=constmap[i+1][hr];
9720                 regs[i+1].wasdirty&=~(1<<hr);
9721                 regs[i].dirty&=~(1<<hr);
9722               }
9723             }
9724           }
9725           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9726             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9727               int nr;
9728               hr=get_reg(regs[i+1].regmap,FTEMP);
9729               assert(hr>=0);
9730               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9731               {
9732                 regs[i].regmap[hr]=rs1[i+1];
9733                 regmap_pre[i+1][hr]=rs1[i+1];
9734                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9735                 regs[i].isconst&=~(1<<hr);
9736                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9737                 constmap[i][hr]=constmap[i+1][hr];
9738                 regs[i+1].wasdirty&=~(1<<hr);
9739                 regs[i].dirty&=~(1<<hr);
9740               }
9741               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9742               {
9743                 // move it to another register
9744                 regs[i+1].regmap[hr]=-1;
9745                 regmap_pre[i+2][hr]=-1;
9746                 regs[i+1].regmap[nr]=FTEMP;
9747                 regmap_pre[i+2][nr]=FTEMP;
9748                 regs[i].regmap[nr]=rs1[i+1];
9749                 regmap_pre[i+1][nr]=rs1[i+1];
9750                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9751                 regs[i].isconst&=~(1<<nr);
9752                 regs[i+1].isconst&=~(1<<nr);
9753                 regs[i].dirty&=~(1<<nr);
9754                 regs[i+1].wasdirty&=~(1<<nr);
9755                 regs[i+1].dirty&=~(1<<nr);
9756                 regs[i+2].wasdirty&=~(1<<nr);
9757               }
9758             }
9759           }
9760           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9761             if(itype[i+1]==LOAD) 
9762               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9763             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9764               hr=get_reg(regs[i+1].regmap,FTEMP);
9765             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9766               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9767               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9768             }
9769             if(hr>=0&&regs[i].regmap[hr]<0) {
9770               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9771               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9772                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9773                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9774                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9775                 regs[i].isconst&=~(1<<hr);
9776                 regs[i+1].wasdirty&=~(1<<hr);
9777                 regs[i].dirty&=~(1<<hr);
9778               }
9779             }
9780           }
9781         }
9782       }
9783     }
9784   }
9785   
9786   /* Pass 6 - Optimize clean/dirty state */
9787   clean_registers(0,slen-1,1);
9788   
9789   /* Pass 7 - Identify 32-bit registers */
9790   
9791   provisional_r32();
9792
9793   u_int r32=0;
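  // r32 is a bitmask over MIPS registers (bit n = register n): a set bit
  // means the register must still be in known-32-bit (sign-extended) form at
  // this point.  It is accumulated backwards from block exits and branch
  // targets, and the per-instruction result is saved in requires_32bit[i].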
9794   
9795   for (i=slen-1;i>=0;i--)
9796   {
9797     int hr;
9798     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9799     {
9800       if(ba[i]<start || ba[i]>=(start+slen*4))
9801       {
9802         // Branch out of this block, don't need anything
9803         r32=0;
9804       }
9805       else
9806       {
9807         // Internal branch
9808         // Need whatever matches the target
9809         // (and doesn't get overwritten by the delay slot instruction)
9810         r32=0;
9811         int t=(ba[i]-start)>>2;
9812         if(ba[i]>start+i*4) {
9813           // Forward branch
9814           if(!(requires_32bit[t]&~regs[i].was32))
9815             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9816         }else{
9817           // Backward branch
9818           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9819           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9820           if(!(pr32[t]&~regs[i].was32))
9821             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9822         }
9823       }
9824       // Conditional branch may need registers for following instructions
9825       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9826       {
9827         if(i<slen-2) {
9828           r32|=requires_32bit[i+2];
9829           r32&=regs[i].was32;
9830           // Mark this address as a branch target since it may be
9831           // entered on return from an interrupt
9832           bt[i+2]=1;
9833         }
9834       }
9835       // Merge in delay slot
9836       if(!likely[i]) {
9837         // These are overwritten unless the branch is "likely"
9838         // and the delay slot is nullified if not taken
9839         r32&=~(1LL<<rt1[i+1]);
9840         r32&=~(1LL<<rt2[i+1]);
9841       }
9842       // Assume these are needed (delay slot)
9843       if(us1[i+1]>0)
9844       {
9845         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9846       }
9847       if(us2[i+1]>0)
9848       {
9849         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9850       }
9851       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9852       {
9853         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9854       }
9855       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9856       {
9857         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9858       }
9859     }
9860     else if(itype[i]==SYSCALL)
9861     {
9862       // SYSCALL instruction (software interrupt)
9863       r32=0;
9864     }
9865     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9866     {
9867       // ERET instruction (return from interrupt)
9868       r32=0;
9869     }
9870     // Update 32-bit requirements for this instruction's own destinations and sources
9871     r32&=~(1LL<<rt1[i]);
9872     r32&=~(1LL<<rt2[i]);
9873     if(us1[i]>0)
9874     {
9875       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9876     }
9877     if(us2[i]>0)
9878     {
9879       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9880     }
9881     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9882     {
9883       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
9884     }
9885     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
9886     {
9887       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
9888     }
9889     requires_32bit[i]=r32;
9890     
9891     // Dirty registers which are 32-bit require 32-bit input,
9892     // as they will be written back as 32-bit values
9893     for(hr=0;hr<HOST_REGS;hr++)
9894     {
9895       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
9896         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
9897           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
9898           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
9899         }
9900       }
9901     }
9902     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
9903   }
9904
9905   if(itype[slen-1]==SPAN) {
9906     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9907   }
9908   
9909   /* Debug/disassembly */
9910   if((void*)assem_debug==(void*)printf) 
9911   for(i=0;i<slen;i++)
9912   {
9913     printf("U:");
9914     int r;
9915     for(r=1;r<=CCREG;r++) {
9916       if((unneeded_reg[i]>>r)&1) {
9917         if(r==HIREG) printf(" HI");
9918         else if(r==LOREG) printf(" LO");
9919         else printf(" r%d",r);
9920       }
9921     }
9922     printf(" UU:");
9923     for(r=1;r<=CCREG;r++) {
9924       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
9925         if(r==HIREG) printf(" HI");
9926         else if(r==LOREG) printf(" LO");
9927         else printf(" r%d",r);
9928       }
9929     }
9930     printf(" 32:");
9931     for(r=0;r<=CCREG;r++) {
9932       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9933       if((regs[i].was32>>r)&1) {
9934         if(r==CCREG) printf(" CC");
9935         else if(r==HIREG) printf(" HI");
9936         else if(r==LOREG) printf(" LO");
9937         else printf(" r%d",r);
9938       }
9939     }
9940     printf("\n");
9941     #if defined(__i386__) || defined(__x86_64__)
9942     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9943     #endif
9944     #ifdef __arm__
9945     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9946     #endif
9947     printf("needs: ");
9948     if(needed_reg[i]&1) printf("eax ");
9949     if((needed_reg[i]>>1)&1) printf("ecx ");
9950     if((needed_reg[i]>>2)&1) printf("edx ");
9951     if((needed_reg[i]>>3)&1) printf("ebx ");
9952     if((needed_reg[i]>>5)&1) printf("ebp ");
9953     if((needed_reg[i]>>6)&1) printf("esi ");
9954     if((needed_reg[i]>>7)&1) printf("edi ");
9955     printf("r:");
9956     for(r=0;r<=CCREG;r++) {
9957       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9958       if((requires_32bit[i]>>r)&1) {
9959         if(r==CCREG) printf(" CC");
9960         else if(r==HIREG) printf(" HI");
9961         else if(r==LOREG) printf(" LO");
9962         else printf(" r%d",r);
9963       }
9964     }
9965     printf("\n");
9966     /*printf("pr:");
9967     for(r=0;r<=CCREG;r++) {
9968       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9969       if((pr32[i]>>r)&1) {
9970         if(r==CCREG) printf(" CC");
9971         else if(r==HIREG) printf(" HI");
9972         else if(r==LOREG) printf(" LO");
9973         else printf(" r%d",r);
9974       }
9975     }
9976     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
9977     printf("\n");*/
9978     #if defined(__i386__) || defined(__x86_64__)
9979     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9980     printf("dirty: ");
9981     if(regs[i].wasdirty&1) printf("eax ");
9982     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9983     if((regs[i].wasdirty>>2)&1) printf("edx ");
9984     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9985     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9986     if((regs[i].wasdirty>>6)&1) printf("esi ");
9987     if((regs[i].wasdirty>>7)&1) printf("edi ");
9988     #endif
9989     #ifdef __arm__
9990     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9991     printf("dirty: ");
9992     if(regs[i].wasdirty&1) printf("r0 ");
9993     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9994     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9995     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9996     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9997     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9998     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9999     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10000     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10001     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10002     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10003     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10004     #endif
10005     printf("\n");
10006     disassemble_inst(i);
10007     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10008     #if defined(__i386__) || defined(__x86_64__)
10009     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10010     if(regs[i].dirty&1) printf("eax ");
10011     if((regs[i].dirty>>1)&1) printf("ecx ");
10012     if((regs[i].dirty>>2)&1) printf("edx ");
10013     if((regs[i].dirty>>3)&1) printf("ebx ");
10014     if((regs[i].dirty>>5)&1) printf("ebp ");
10015     if((regs[i].dirty>>6)&1) printf("esi ");
10016     if((regs[i].dirty>>7)&1) printf("edi ");
10017     #endif
10018     #ifdef __arm__
10019     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10020     if(regs[i].dirty&1) printf("r0 ");
10021     if((regs[i].dirty>>1)&1) printf("r1 ");
10022     if((regs[i].dirty>>2)&1) printf("r2 ");
10023     if((regs[i].dirty>>3)&1) printf("r3 ");
10024     if((regs[i].dirty>>4)&1) printf("r4 ");
10025     if((regs[i].dirty>>5)&1) printf("r5 ");
10026     if((regs[i].dirty>>6)&1) printf("r6 ");
10027     if((regs[i].dirty>>7)&1) printf("r7 ");
10028     if((regs[i].dirty>>8)&1) printf("r8 ");
10029     if((regs[i].dirty>>9)&1) printf("r9 ");
10030     if((regs[i].dirty>>10)&1) printf("r10 ");
10031     if((regs[i].dirty>>12)&1) printf("r12 ");
10032     #endif
10033     printf("\n");
10034     if(regs[i].isconst) {
10035       printf("constants: ");
10036       #if defined(__i386__) || defined(__x86_64__)
10037       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10038       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10039       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10040       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10041       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10042       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10043       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10044       #endif
10045       #ifdef __arm__
10046       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10047       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10048       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10049       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10050       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10051       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10052       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10053       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10054       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10055       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10056       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10057       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10058       #endif
10059       printf("\n");
10060     }
10061     printf(" 32:");
10062     for(r=0;r<=CCREG;r++) {
10063       if((regs[i].is32>>r)&1) {
10064         if(r==CCREG) printf(" CC");
10065         else if(r==HIREG) printf(" HI");
10066         else if(r==LOREG) printf(" LO");
10067         else printf(" r%d",r);
10068       }
10069     }
10070     printf("\n");
10071     /*printf(" p32:");
10072     for(r=0;r<=CCREG;r++) {
10073       if((p32[i]>>r)&1) {
10074         if(r==CCREG) printf(" CC");
10075         else if(r==HIREG) printf(" HI");
10076         else if(r==LOREG) printf(" LO");
10077         else printf(" r%d",r);
10078       }
10079     }
10080     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10081     else printf("\n");*/
10082     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10083       #if defined(__i386__) || defined(__x86_64__)
10084       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10085       if(branch_regs[i].dirty&1) printf("eax ");
10086       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10087       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10088       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10089       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10090       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10091       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10092       #endif
10093       #ifdef __arm__
10094       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10095       if(branch_regs[i].dirty&1) printf("r0 ");
10096       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10097       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10098       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10099       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10100       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10101       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10102       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10103       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10104       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10105       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10106       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10107       #endif
10108       printf(" 32:");
10109       for(r=0;r<=CCREG;r++) {
10110         if((branch_regs[i].is32>>r)&1) {
10111           if(r==CCREG) printf(" CC");
10112           else if(r==HIREG) printf(" HI");
10113           else if(r==LOREG) printf(" LO");
10114           else printf(" r%d",r);
10115         }
10116       }
10117       printf("\n");
10118     }
10119   }
10120
10121   /* Pass 8 - Assembly */
10122   linkcount=0;stubcount=0;
10123   ds=0;is_delayslot=0;
10124   cop1_usable=0;
10125   uint64_t is32_pre=0;
10126   u_int dirty_pre=0;
10127   u_int beginning=(u_int)out;
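  // An odd start address flags a block that begins in a branch delay slot
  // (the page-spanning case); pagespan_ds() emits the entry sequence for it.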
10128   if((u_int)addr&1) {
10129     ds=1;
10130     pagespan_ds();
10131   }
10132   for(i=0;i<slen;i++)
10133   {
10134     //if(ds) printf("ds: ");
10135     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10136     if(ds) {
10137       ds=0; // Skip delay slot
10138       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10139       instr_addr[i]=0;
10140     } else {
10141       #ifndef DESTRUCTIVE_WRITEBACK
10142       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10143       {
10144         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10145               unneeded_reg[i],unneeded_reg_upper[i]);
10146         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10147               unneeded_reg[i],unneeded_reg_upper[i]);
10148       }
10149       is32_pre=regs[i].is32;
10150       dirty_pre=regs[i].dirty;
10151       #endif
10152       // write back
10153       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10154       {
10155         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10156                       unneeded_reg[i],unneeded_reg_upper[i]);
10157         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10158       }
10159       // branch target entry point
10160       instr_addr[i]=(u_int)out;
10161       assem_debug("<->\n");
10162       // load regs
10163       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10164         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10165       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10166       address_generation(i,&regs[i],regs[i].regmap_entry);
10167       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10168       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10169       {
10170         // Load the delay slot registers if necessary
10171         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10172           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10173         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10174           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10175         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10176           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10177       }
10178       else if(i+1<slen)
10179       {
10180         // Preload registers for following instruction
10181         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10182           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10183             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10184         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10185           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10186             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10187       }
10188       // TODO: if(is_ooo(i)) address_generation(i+1);
10189       if(itype[i]==CJUMP||itype[i]==FJUMP)
10190         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10191       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10192         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
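      // Branch targets can be entered directly, so any COP1-usable check
      // emitted earlier in the block cannot be assumed to have run.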
10193       if(bt[i]) cop1_usable=0;
10194       // assemble
10195       switch(itype[i]) {
10196         case ALU:
10197           alu_assemble(i,&regs[i]);break;
10198         case IMM16:
10199           imm16_assemble(i,&regs[i]);break;
10200         case SHIFT:
10201           shift_assemble(i,&regs[i]);break;
10202         case SHIFTIMM:
10203           shiftimm_assemble(i,&regs[i]);break;
10204         case LOAD:
10205           load_assemble(i,&regs[i]);break;
10206         case LOADLR:
10207           loadlr_assemble(i,&regs[i]);break;
10208         case STORE:
10209           store_assemble(i,&regs[i]);break;
10210         case STORELR:
10211           storelr_assemble(i,&regs[i]);break;
10212         case COP0:
10213           cop0_assemble(i,&regs[i]);break;
10214         case COP1:
10215           cop1_assemble(i,&regs[i]);break;
10216         case C1LS:
10217           c1ls_assemble(i,&regs[i]);break;
10218         case FCONV:
10219           fconv_assemble(i,&regs[i]);break;
10220         case FLOAT:
10221           float_assemble(i,&regs[i]);break;
10222         case FCOMP:
10223           fcomp_assemble(i,&regs[i]);break;
10224         case MULTDIV:
10225           multdiv_assemble(i,&regs[i]);break;
10226         case MOV:
10227           mov_assemble(i,&regs[i]);break;
10228         case SYSCALL:
10229           syscall_assemble(i,&regs[i]);break;
10230         case UJUMP:
10231           ujump_assemble(i,&regs[i]);ds=1;break;
10232         case RJUMP:
10233           rjump_assemble(i,&regs[i]);ds=1;break;
10234         case CJUMP:
10235           cjump_assemble(i,&regs[i]);ds=1;break;
10236         case SJUMP:
10237           sjump_assemble(i,&regs[i]);ds=1;break;
10238         case FJUMP:
10239           fjump_assemble(i,&regs[i]);ds=1;break;
10240         case SPAN:
10241           pagespan_assemble(i,&regs[i]);break;
10242       }
10243       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10244         literal_pool(1024);
10245       else
10246         literal_pool_jumpover(256);
10247     }
10248   }
10249   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10250   // If the block did not end with an unconditional branch,
10251   // add a jump to the next instruction.
10252   if(i>1) {
10253     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10254       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10255       assert(i==slen);
10256       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10257         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10258         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10259           emit_loadreg(CCREG,HOST_CCREG);
10260         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10261       }
10262       else if(!likely[i-2])
10263       {
10264         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10265         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10266       }
10267       else
10268       {
10269         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10270         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10271       }
10272       add_to_linker((int)out,start+i*4,0);
10273       emit_jmp(0);
10274     }
10275   }
10276   else
10277   {
10278     assert(i>0);
10279     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10280     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10281     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10282       emit_loadreg(CCREG,HOST_CCREG);
10283     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10284     add_to_linker((int)out,start+i*4,0);
10285     emit_jmp(0);
10286   }
10287
10288   // TODO: delay slot stubs?
10289   // Stubs
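  // stubs[i][0] selects the handler: memory load/store stubs, the cycle
  // count stub (CC_STUB), invalid-code (INVCODE_STUB), FPU (FP_STUB) and
  // unaligned store (STORELR_STUB) stubs, all emitted out of line after
  // the block body.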
10290   for(i=0;i<stubcount;i++)
10291   {
10292     switch(stubs[i][0])
10293     {
10294       case LOADB_STUB:
10295       case LOADH_STUB:
10296       case LOADW_STUB:
10297       case LOADD_STUB:
10298       case LOADBU_STUB:
10299       case LOADHU_STUB:
10300         do_readstub(i);break;
10301       case STOREB_STUB:
10302       case STOREH_STUB:
10303       case STOREW_STUB:
10304       case STORED_STUB:
10305         do_writestub(i);break;
10306       case CC_STUB:
10307         do_ccstub(i);break;
10308       case INVCODE_STUB:
10309         do_invstub(i);break;
10310       case FP_STUB:
10311         do_cop1stub(i);break;
10312       case STORELR_STUB:
10313         do_unalignedwritestub(i);break;
10314     }
10315   }
10316
10317   /* Pass 9 - Linker */
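  // Each link_addr entry describes one branch to patch: [0] is the branch's
  // location in the output buffer, [1] the target virtual address, and [2]
  // is nonzero for branches internal to this block.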
10318   for(i=0;i<linkcount;i++)
10319   {
10320     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10321     literal_pool(64);
10322     if(!link_addr[i][2])
10323     {
10324       void *stub=out;
10325       void *addr=check_addr(link_addr[i][1]);
10326       emit_extjump(link_addr[i][0],link_addr[i][1]);
10327       if(addr) {
10328         set_jump_target(link_addr[i][0],(int)addr);
10329         add_link(link_addr[i][1],stub);
10330       }
10331       else set_jump_target(link_addr[i][0],(int)stub);
10332     }
10333     else
10334     {
10335       // Internal branch
10336       int target=(link_addr[i][1]-start)>>2;
10337       assert(target>=0&&target<slen);
10338       assert(instr_addr[target]);
10339       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10340       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10341       //#else
10342       set_jump_target(link_addr[i][0],instr_addr[target]);
10343       //#endif
10344     }
10345   }
10346   // External Branch Targets (jump_in)
10347   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
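  // For each branch target (and the block entry point), record the generated
  // code in jump_dirty, emit a dirty-check stub, and add the stub's entry
  // point to jump_in; entries with 32-bit requirements are added with
  // ll_add_32 so the register state restriction is kept.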
10348   for(i=0;i<slen;i++)
10349   {
10350     if(bt[i]||i==0)
10351     {
10352       if(instr_addr[i]) // TODO - delay slots (=null)
10353       {
10354         u_int vaddr=start+i*4;
10355         u_int page=(0x80000000^vaddr)>>12;
10356         u_int vpage=page;
10357         if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[page^0x80000]^0x80000000)>>12;
10358         if(page>2048) page=2048+(page&2047);
10359         if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
10360         if(vpage>2048) vpage=2048+(vpage&2047);
10361         literal_pool(256);
10362         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10363         if(!requires_32bit[i])
10364         {
10365           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10366           assem_debug("jump_in: %x\n",start+i*4);
10367           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10368           int entry_point=do_dirty_stub(i);
10369           ll_add(jump_in+page,vaddr,(void *)entry_point);
10370           // If there was an existing entry in the hash table,
10371           // replace it with the new address.
10372           // Don't add new entries.  We'll insert the
10373           // ones that actually get used in check_addr().
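          // Each hash bin holds two (vaddr, entry point) pairs:
          // {ht_bin[0],ht_bin[1]} and {ht_bin[2],ht_bin[3]}.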
          int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
          if(ht_bin[0]==vaddr) {
            ht_bin[1]=entry_point;
          }
          if(ht_bin[2]==vaddr) {
            ht_bin[3]=entry_point;
          }
        }
        else
        {
          u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
          assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
          assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
          //int entry_point=(int)out;
          ////assem_debug("entry_point: %x\n",entry_point);
          //load_regs_entry(i);
          //if(entry_point==(int)out)
          //  entry_point=instr_addr[i];
          //else
          //  emit_jmp(instr_addr[i]);
          //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
          ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
          int entry_point=do_dirty_stub(i);
          ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
        }
      }
    }
  }
  // Write out the literal pool if necessary
  literal_pool(0);
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Align code
  if(((u_int)out)&7) emit_addnop(13);
  #endif
  assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
  //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
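  // Keep a copy of the source MIPS code in the shadow buffer; the dirty-check
  // stubs compare the current source against this copy later to detect blocks
  // whose code has been overwritten.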
  memcpy(copy,source,slen*4);
  copy+=slen*4;

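  // The translated block was written through the data cache; on ARM the
  // instruction cache must be invalidated for this range before the new code
  // can safely be executed.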
  #ifdef __arm__
  __clear_cache((void *)beginning,out);
  #endif

  // If we're within 256K of the end of the buffer,
  // start over from the beginning. (Is 256K enough?)
  if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;

  // Trap writes to any of the pages we compiled
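  // memory_map holds a per-page offset (in words) that the generated loads and
  // stores add to the virtual address to reach host memory (see the 'j'
  // calculation below).  Setting bit 30 marks the page as write-protected, so
  // stores to it divert to the invalidation path instead of writing straight
  // through.  For TLB-mapped blocks (start>=0xC0000000) the backing physical
  // page is protected as well.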
  for(i=start>>12;i<=(start+slen*4)>>12;i++) {
    invalid_code[i]=0;
    memory_map[i]|=0x40000000;
    if((signed int)start>=(signed int)0xC0000000) {
      assert(using_tlb);
      j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
      invalid_code[j]=0;
      memory_map[j]|=0x40000000;
      //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
    }
  }

  /* Pass 10 - Free memory by expiring oldest blocks */

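  // The translation cache is treated as a ring.  expirep is a 16-bit phase
  // counter: bits 13-15 select one of 8 regions of the cache (shift =
  // TARGET_SIZE_2-3), bits 11-12 select which structure to clean on this pass
  // (jump_in/jump_dirty lists, jump_out pointers, the hash table, or jump_out
  // itself), and the low 11 bits walk the 2048 page lists.  'end' maps the
  // current output pointer into the same phase space, offset by a quarter of
  // the cycle, so the region well ahead of the write position is freed before
  // it gets overwritten.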
  int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
  while(expirep!=end)
  {
    int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
    int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
    inv_debug("EXP: Phase %d\n",expirep);
    switch((expirep>>11)&3)
    {
      case 0:
        // Clear jump_in and jump_dirty
        ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
        break;
      case 1:
        // Clear pointers
        ll_kill_pointers(jump_out[expirep&2047],base,shift);
        ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
        break;
      case 2:
        // Clear hash table
        for(i=0;i<32;i++) {
          int *ht_bin=hash_table[((expirep&2047)<<5)+i];
          if((ht_bin[3]>>shift)==(base>>shift) ||
             ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
            ht_bin[2]=ht_bin[3]=-1;
          }
          if((ht_bin[1]>>shift)==(base>>shift) ||
             ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
            ht_bin[0]=ht_bin[2];
            ht_bin[1]=ht_bin[3];
            ht_bin[2]=ht_bin[3]=-1;
          }
        }
        break;
      case 3:
        // Clear jump_out
        #ifdef __arm__
        if((expirep&2047)==0)
          __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
        #endif
        ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
        break;
    }
    expirep=(expirep+1)&65535;
  }
  return 0;
}