ebdab12755eabea26e0c8f959bfe6a0a0f520128
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "../recomp.h"
26 #include "../recomph.h" //include for function prototypes
27 #include "../macros.h"
28 #include "../r4300.h"
29 #include "../ops.h"
30 #include "../interupt.h"
31
32 #include "../../memory/memory.h"
33
34 #include <sys/mman.h>
35
36 #ifdef __i386__
37 #include "assem_x86.h"
38 #endif
39 #ifdef __x86_64__
40 #include "assem_x64.h"
41 #endif
42 #ifdef __arm__
43 #include "assem_arm.h"
44 #endif
45
46 #define MAXBLOCK 4096
47 #define MAX_OUTPUT_BLOCK_SIZE 262144
48 #define CLOCK_DIVIDER 2
49
// Register-allocation state tracked for one instruction.
// Field notes are derived from the helpers in this file (get_reg, dirty_reg,
// set_const, flush_dirty_uppers); fields those helpers never touch are
// marked as assumptions.
50 struct regstat
51 {
52   signed char regmap_entry[HOST_REGS]; // presumably the mapping required on entry -- TODO confirm
53   signed char regmap[HOST_REGS];       // guest register held by each host register; -1 = free, values >=64 are handled as the upper half of guest register (value&63)
54   uint64_t was32;                      // presumably is32 before the instruction -- TODO confirm
55   uint64_t is32;                       // one bit per guest register (tested as is32>>(reg&63))
56   uint64_t wasdirty;                   // presumably dirty before the instruction -- TODO confirm
57   uint64_t dirty;                      // one bit per host register (set as 1<<hr)
58   uint64_t u;                          // NOTE(review): looks like an unneeded-register mask -- confirm
59   uint64_t uu;                         // NOTE(review): looks like the upper-half counterpart of u -- confirm
60   u_int wasconst;                      // presumably isconst before the instruction -- TODO confirm
61   u_int isconst;                       // one bit per host register: constmap[hr] is valid
62   uint64_t constmap[HOST_REGS];        // known constant for each host register (upper halves store value>>32)
63 };
64
// Node of a singly linked list mapping a virtual address to compiled code.
// New nodes are pushed at the head of a list (see ll_add / ll_add_32).
65 struct ll_entry
66 {
67   u_int vaddr;            // virtual address the entry belongs to
68   u_int reg32;            // 0 for entries added with ll_add(); get_addr_32() accepts an entry only when (reg32&flags)==0
69   void *addr;             // address returned to callers as the compiled entry point
70   struct ll_entry *next;  // next node, or NULL at the end of the list
71 };
72
  // File-scope recompiler state.  The per-instruction arrays are indexed by
  // the instruction's position inside the block being compiled (0..slen-1).
  // Only facts visible in this part of the file are stated; everything else
  // is marked as an assumption.
73   u_int start;   // base address of the block: branch targets are converted to indices with (ba[i]-start)>>2
74   u_int *source; // instruction words of the block (decoded with source[i]>>16 etc.)
75   u_int pagelimit;
76   char insn[MAXBLOCK][10];
77   u_char itype[MAXBLOCK];   // instruction type, one of the NOP..NI codes
78   u_char opcode[MAXBLOCK];
79   u_char opcode2[MAXBLOCK];
80   u_char bt[MAXBLOCK];
81   u_char rs1[MAXBLOCK];     // source registers (0 = none, except that stores may use register 0)
82   u_char rs2[MAXBLOCK];
83   u_char rt1[MAXBLOCK];     // target registers (0 = none)
84   u_char rt2[MAXBLOCK];
85   u_char us1[MAXBLOCK];
86   u_char us2[MAXBLOCK];
87   u_char dep1[MAXBLOCK];
88   u_char dep2[MAXBLOCK];
89   u_char lt1[MAXBLOCK];
90   int imm[MAXBLOCK];
91   u_int ba[MAXBLOCK];       // branch target address
92   char likely[MAXBLOCK];
93   char is_ds[MAXBLOCK];
94   uint64_t unneeded_reg[MAXBLOCK];       // bit r set: guest register r is not needed at this instruction
95   uint64_t unneeded_reg_upper[MAXBLOCK]; // presumably the same for the upper 32 bits -- TODO confirm
96   uint64_t branch_unneeded_reg[MAXBLOCK];
97   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
98   uint64_t p32[MAXBLOCK];
99   uint64_t pr32[MAXBLOCK];
100   signed char regmap_pre[MAXBLOCK][HOST_REGS];
101   signed char regmap[MAXBLOCK][HOST_REGS];
102   signed char regmap_entry[MAXBLOCK][HOST_REGS];
103   uint64_t constmap[MAXBLOCK][HOST_REGS];
104   uint64_t known_value[HOST_REGS];
105   u_int known_reg;
106   struct regstat regs[MAXBLOCK];
107   struct regstat branch_regs[MAXBLOCK];
108   u_int needed_reg[MAXBLOCK];
109   uint64_t requires_32bit[MAXBLOCK];
110   u_int wont_dirty[MAXBLOCK];
111   u_int will_dirty[MAXBLOCK];
112   int ccadj[MAXBLOCK];
113   int slen;                 // number of instructions in the current block
114   u_int instr_addr[MAXBLOCK];
115   u_int link_addr[MAXBLOCK][3];
116   int linkcount;
117   u_int stubs[MAXBLOCK*3][8];
118   int stubcount;
119   u_int literals[1024][2];
120   int literalcount;
121   int is_delayslot;
122   int cop1_usable;
123   u_char *out;              // current position in the output buffer; expiry checks measure distances from it
124   struct ll_entry *jump_in[4096];    // indexed by get_page(): entries returned directly by get_addr()
125   struct ll_entry *jump_out[4096];
126   struct ll_entry *jump_dirty[4096]; // indexed by get_vpage(): entries returned only after verify_dirty() succeeds
  // Each bin holds two (vaddr, code address) pairs: [0],[1] and [2],[3].
  // A vaddr of -1 marks an empty pair.
127   u_int hash_table[65536][4]  __attribute__((aligned(16)));
128   char shadow[1048576]  __attribute__((aligned(16)));
129   void *copy;
130   int expirep;
131   u_int using_tlb;
132   u_int stop_after_jal;
133   extern u_char restore_candidate[512]; // bitmap, one bit per page (set as restore_candidate[page>>3]|=1<<(page&7))
134   extern int cycle_count;
135
136   /* registers that may be allocated */
137   /* 1-31 gpr */
  /* Values above 31 name non-GPR state that takes part in register allocation. */
138 #define HIREG 32 // hi
139 #define LOREG 33 // lo
140 #define FSREG 34 // FPU status (FCSR)
141 #define CSREG 35 // Coprocessor status
142 #define CCREG 36 // Cycle count
143 #define INVCP 37 // Pointer to invalid_code
  /* TEMPREG and FTEMP deliberately share number 38. */
144 #define TEMPREG 38
145 #define FTEMP 38 // FPU temporary register
146 #define PTEMP 39 // Prefetch temporary register
147 #define TLREG 40 // TLB mapping offset
148 #define RHASH 41 // Return address hash
149 #define RHTBL 42 // Return address hash table address
150 #define RTEMP 43 // JR/JALR address register
  /* MAXREG is the highest number tracked in the hsn[] tables (sized MAXREG+1);
     the temporaries numbered above it are not tracked there. */
151 #define MAXREG 43
152 #define AGEN1 44 // Address generation temporary register
153 #define AGEN2 45 // Address generation temporary register
154 #define MGEN1 46 // Maptable address generation temporary register
155 #define MGEN2 47 // Maptable address generation temporary register
156 #define BTREG 48 // Branch target temporary register
157
158   /* instruction types */
  /* Stored per instruction in itype[]. */
159 #define NOP 0     // No operation
160 #define LOAD 1    // Load
161 #define STORE 2   // Store
162 #define LOADLR 3  // Unaligned load
163 #define STORELR 4 // Unaligned store
164 #define MOV 5     // Move 
165 #define ALU 6     // Arithmetic/logic
166 #define MULTDIV 7 // Multiply/divide
167 #define SHIFT 8   // Shift by register
168 #define SHIFTIMM 9// Shift by immediate
169 #define IMM16 10  // 16-bit immediate
170 #define RJUMP 11  // Unconditional jump to register
171 #define UJUMP 12  // Unconditional jump
172 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
173 #define SJUMP 14  // Conditional branch (regimm format)
174 #define COP0 15   // Coprocessor 0
175 #define COP1 16   // Coprocessor 1
176 #define C1LS 17   // Coprocessor 1 load/store
177 #define FJUMP 18  // Conditional branch (floating point)
178 #define FLOAT 19  // Floating point unit
179 #define FCONV 20  // Convert integer to float
180 #define FCOMP 21  // Floating point compare (sets FSREG)
181 #define SYSCALL 22// SYSCALL
182 #define OTHER 23  // Other
183 #define SPAN 24   // Branch/delay slot spans 2 pages
184 #define NI 25     // Not implemented
185
186   /* stubs */
  /* NOTE(review): presumably the kind tag stored with each stubs[] entry -- confirm. */
187 #define CC_STUB 1
188 #define FP_STUB 2
189 #define LOADB_STUB 3
190 #define LOADH_STUB 4
191 #define LOADW_STUB 5
192 #define LOADD_STUB 6
193 #define LOADBU_STUB 7
194 #define LOADHU_STUB 8
195 #define STOREB_STUB 9
196 #define STOREH_STUB 10
197 #define STOREW_STUB 11
198 #define STORED_STUB 12
199 #define STORELR_STUB 13
200 #define INVCODE_STUB 14
201
202   /* branch codes */
203 #define TAKEN 1
204 #define NOTTAKEN 2
205 #define NULLDS 3
206
207 // asm linkage
// The prototypes with empty parentheses leave their arguments unspecified.
// NOTE(review): presumably entry points in the per-architecture assembly
// that are not called with the normal C convention -- confirm.
208 int new_recompile_block(int addr); // a return value of 0 is treated as success by get_addr()
209 void *get_addr_ht(u_int vaddr);
210 void invalidate_block(u_int block);
211 void invalidate_addr(u_int addr);
212 void remove_hash(int vaddr);
213 void jump_vaddr();
214 void dyna_linker();
215 void dyna_linker_ds();
216 void verify_code();
217 void verify_code_vm();
218 void verify_code_ds();
219 void cc_interrupt();
220 void fp_exception();
221 void fp_exception_ds();
222 void jump_syscall();
223 void jump_eret();
224
225 // TLB
226 void TLBWI_new();
227 void TLBWR_new();
228 void read_nomem_new();
229 void read_nomemb_new();
230 void read_nomemh_new();
231 void read_nomemd_new();
232 void write_nomem_new();
233 void write_nomemb_new();
234 void write_nomemh_new();
235 void write_nomemd_new();
236 void write_rdram_new();
237 void write_rdramb_new();
238 void write_rdramh_new();
239 void write_rdramd_new();
// One 32-bit entry per 4 KiB page; this file ORs 0x40000000 into entries.
// NOTE(review): presumably a write-protect flag for pages holding compiled code -- confirm.
240 extern u_int memory_map[1048576];
241
242 // Needed by assembler
243 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
244 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
245 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
246 void load_all_regs(signed char i_regmap[]);
247 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
248 void load_regs_entry(int t);
// NOTE(review): is32 is declared int here but uint64_t in the wb_* prototypes above -- confirm this is intended.
249 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
250
251 int tracedebug=0;
252
253 //#define DEBUG_CYCLE_COUNT 1
254
// Sink for disabled debug output: declared with an unspecified argument list
// so printf-style calls made through the macros below still compile.
255 void nullf() {}
// Swap the two pairs of definitions below to send the messages to printf.
256 //#define assem_debug printf
257 //#define inv_debug printf
258 #define assem_debug nullf
259 #define inv_debug nullf
260
261 static void tlb_hacks()
262 {
263 #ifndef DISABLE_TLB
264   // Goldeneye hack
265   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
266   {
267     u_int addr;
268     int n;
269     switch (ROM_HEADER->Country_code&0xFF) 
270     {
271       case 0x45: // U
272         addr=0x34b30;
273         break;                   
274       case 0x4A: // J 
275         addr=0x34b70;    
276         break;    
277       case 0x50: // E 
278         addr=0x329f0;
279         break;                        
280       default: 
281         // Unknown country code
282         addr=0;
283         break;
284     }
285     u_int rom_addr=(u_int)rom;
286     #ifdef ROM_COPY
287     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
288     // in the lower 4G of memory to use this hack.  Copy it if necessary.
289     if((void *)rom>(void *)0xffffffff) {
290       munmap(ROM_COPY, 67108864);
291       if(mmap(ROM_COPY, 12582912,
292               PROT_READ | PROT_WRITE,
293               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
294               -1, 0) <= 0) {printf("mmap() failed\n");}
295       memcpy(ROM_COPY,rom,12582912);
296       rom_addr=(u_int)ROM_COPY;
297     }
298     #endif
299     if(addr) {
300       for(n=0x7F000;n<0x80000;n++) {
301         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
302       }
303     }
304   }
305 #endif
306 }
307
308 static u_int get_page(u_int vaddr)
309 {
310   u_int page=(vaddr^0x80000000)>>12;
311 #ifndef DISABLE_TLB
312   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
313 #endif
314   if(page>2048) page=2048+(page&2047);
315   return page;
316 }
317
318 static u_int get_vpage(u_int vaddr)
319 {
320   u_int vpage=(vaddr^0x80000000)>>12;
321 #ifndef DISABLE_TLB
322   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
323 #endif
324   if(vpage>2048) vpage=2048+(vpage&2047);
325   return vpage;
326 }
327
328 // Get address from virtual address
329 // This is called from the recompiled JR/JALR instructions
//
// Lookup order:
//  1. jump_in[page]: an entry with matching vaddr and reg32==0 is pushed to
//     the front of its hash-table bin and returned.
//  2. jump_dirty[vpage]: a matching entry is returned only if it is not
//     about to expire and verify_dirty() accepts it; the page is then marked
//     valid again and recorded in restore_candidate.
//  3. Otherwise the block is recompiled and the lookup is retried.
// If recompilation fails, exception state is written to the COP0 registers
// and the code address for 0x80000000 is returned instead.
330 void *get_addr(u_int vaddr)
331 {
332   u_int page=get_page(vaddr);
333   u_int vpage=get_vpage(vaddr);
334   struct ll_entry *head;
335   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
336   head=jump_in[page];
337   while(head!=NULL) {
338     if(head->vaddr==vaddr&&head->reg32==0) {
339   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Demote the first pair of the bin to the second slot and store the
      // new pair first; the old second pair is dropped.
340       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
341       ht_bin[3]=ht_bin[1];
342       ht_bin[2]=ht_bin[0];
343       ht_bin[1]=(int)head->addr;
344       ht_bin[0]=vaddr;
345       return head->addr;
346     }
347     head=head->next;
348   }
349   head=jump_dirty[vpage];
350   while(head!=NULL) {
351     if(head->vaddr==vaddr&&head->reg32==0) {
352       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
353       // Don't restore blocks which are about to expire from the cache
      // (the distance from the output pointer is scaled to 32 bits by the
      // shift, so the comparison works on a fraction of the cache size)
354       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
355       if(verify_dirty(head->addr)) {
356         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
357         invalid_code[vaddr>>12]=0;
358         memory_map[vaddr>>12]|=0x40000000;
359         if(vpage<2048) {
360 #ifndef DISABLE_TLB
          // Also mark the page the TLB maps this address to
361           if(tlb_LUT_r[vaddr>>12]) {
362             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
363             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
364           }
365 #endif
366           restore_candidate[vpage>>3]|=1<<(vpage&7);
367         }
368         else restore_candidate[page>>3]|=1<<(page&7);
369         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
370         if(ht_bin[0]==vaddr) {
371           ht_bin[1]=(int)head->addr; // Replace existing entry
372         }
373         else
374         {
375           ht_bin[3]=ht_bin[1];
376           ht_bin[2]=ht_bin[0];
377           ht_bin[1]=(int)head->addr;
378           ht_bin[0]=vaddr;
379         }
380         return head->addr;
381       }
382     }
383     head=head->next;
384   }
385   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
386   int r=new_recompile_block(vaddr);
387   if(r==0) return get_addr(vaddr);
388   // Execute in unmapped page, generate pagefault exception
  // NOTE(review): bit 0 of vaddr is moved to bit 31 of Cause and rewinds EPC
  // by one instruction -- presumably it flags a delay slot; confirm.
389   Status|=2;
390   Cause=(vaddr<<31)|0x8;
391   EPC=(vaddr&1)?vaddr-5:vaddr;
392   BadVAddr=(vaddr&~1);
393   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
394   EntryHi=BadVAddr&0xFFFFE000;
395   return get_addr_ht(0x80000000);
396 }
397 // Look up address in hash table first
398 void *get_addr_ht(u_int vaddr)
399 {
400   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
401   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
402   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
403   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
404   return get_addr(vaddr);
405 }
406
// Variant of get_addr() taking a flags mask.
// The hash table is probed first.  List entries match when vaddr is equal
// and (reg32&flags)==0.  Only entries with reg32==0 are cached in the hash
// table, and only into an empty slot (marked -1); existing pairs are never
// evicted here.
// NOTE(review): presumably flags names the registers that currently hold
// non-32-bit values -- confirm against the callers.
407 void *get_addr_32(u_int vaddr,u_int flags)
408 {
409   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
410   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
411   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
412   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
413   u_int page=get_page(vaddr);
414   u_int vpage=get_vpage(vaddr);
415   struct ll_entry *head;
416   head=jump_in[page];
417   while(head!=NULL) {
418     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
419       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
420       if(head->reg32==0) {
421         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
422         if(ht_bin[0]==-1) {
423           ht_bin[1]=(int)head->addr;
424           ht_bin[0]=vaddr;
425         }else if(ht_bin[2]==-1) {
426           ht_bin[3]=(int)head->addr;
427           ht_bin[2]=vaddr;
428         }
429         //ht_bin[3]=ht_bin[1];
430         //ht_bin[2]=ht_bin[0];
431         //ht_bin[1]=(int)head->addr;
432         //ht_bin[0]=vaddr;
433       }
434       return head->addr;
435     }
436     head=head->next;
437   }
438   head=jump_dirty[vpage];
439   while(head!=NULL) {
440     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
441       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
442       // Don't restore blocks which are about to expire from the cache
443       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
444       if(verify_dirty(head->addr)) {
445         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        // Mark the page valid again and remember it as a restore candidate
446         invalid_code[vaddr>>12]=0;
447         memory_map[vaddr>>12]|=0x40000000;
448         if(vpage<2048) {
449 #ifndef DISABLE_TLB
450           if(tlb_LUT_r[vaddr>>12]) {
451             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
452             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
453           }
454 #endif
455           restore_candidate[vpage>>3]|=1<<(vpage&7);
456         }
457         else restore_candidate[page>>3]|=1<<(page&7);
458         if(head->reg32==0) {
459           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
460           if(ht_bin[0]==-1) {
461             ht_bin[1]=(int)head->addr;
462             ht_bin[0]=vaddr;
463           }else if(ht_bin[2]==-1) {
464             ht_bin[3]=(int)head->addr;
465             ht_bin[2]=vaddr;
466           }
467           //ht_bin[3]=ht_bin[1];
468           //ht_bin[2]=ht_bin[0];
469           //ht_bin[1]=(int)head->addr;
470           //ht_bin[0]=vaddr;
471         }
472         return head->addr;
473       }
474     }
475     head=head->next;
476   }
477   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
478   int r=new_recompile_block(vaddr);
  // NOTE(review): the retry goes through get_addr(), which only accepts
  // entries with reg32==0 and ignores flags -- confirm this is intended.
479   if(r==0) return get_addr(vaddr);
480   // Execute in unmapped page, generate pagefault exception
481   Status|=2;
482   Cause=(vaddr<<31)|0x8;
483   EPC=(vaddr&1)?vaddr-5:vaddr;
484   BadVAddr=(vaddr&~1);
485   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
486   EntryHi=BadVAddr&0xFFFFE000;
487   return get_addr_ht(0x80000000);
488 }
489
490 void clear_all_regs(signed char regmap[])
491 {
492   int hr;
493   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
494 }
495
496 signed char get_reg(signed char regmap[],int r)
497 {
498   int hr;
499   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
500   return -1;
501 }
502
503 // Find a register that is available for two consecutive cycles
504 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
505 {
506   int hr;
507   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
508   return -1;
509 }
510
511 int count_free_regs(signed char regmap[])
512 {
513   int count=0;
514   int hr;
515   for(hr=0;hr<HOST_REGS;hr++)
516   {
517     if(hr!=EXCLUDE_REG) {
518       if(regmap[hr]<0) count++;
519     }
520   }
521   return count;
522 }
523
524 void dirty_reg(struct regstat *cur,signed char reg)
525 {
526   int hr;
527   if(!reg) return;
528   for (hr=0;hr<HOST_REGS;hr++) {
529     if((cur->regmap[hr]&63)==reg) {
530       cur->dirty|=1<<hr;
531     }
532   }
533 }
534
535 // If we dirty the lower half of a 64 bit register which is now being
536 // sign-extended, we need to dump the upper half.
537 // Note: Do this only after completion of the instruction, because
538 // some instructions may need to read the full 64-bit value even if
539 // overwriting it (eg SLTI, DSRA32).
540 static void flush_dirty_uppers(struct regstat *cur)
541 {
542   int hr,reg;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->dirty>>hr)&1) {
545       reg=cur->regmap[hr];
546       if(reg>=64) 
547         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
548     }
549   }
550 }
551
552 void set_const(struct regstat *cur,signed char reg,uint64_t value)
553 {
554   int hr;
555   if(!reg) return;
556   for (hr=0;hr<HOST_REGS;hr++) {
557     if(cur->regmap[hr]==reg) {
558       cur->isconst|=1<<hr;
559       cur->constmap[hr]=value;
560     }
561     else if((cur->regmap[hr]^64)==reg) {
562       cur->isconst|=1<<hr;
563       cur->constmap[hr]=value>>32;
564     }
565   }
566 }
567
568 void clear_const(struct regstat *cur,signed char reg)
569 {
570   int hr;
571   if(!reg) return;
572   for (hr=0;hr<HOST_REGS;hr++) {
573     if((cur->regmap[hr]&63)==reg) {
574       cur->isconst&=~(1<<hr);
575     }
576   }
577 }
578
579 int is_const(struct regstat *cur,signed char reg)
580 {
581   int hr;
582   if(!reg) return 1;
583   for (hr=0;hr<HOST_REGS;hr++) {
584     if((cur->regmap[hr]&63)==reg) {
585       return (cur->isconst>>hr)&1;
586     }
587   }
588   return 0;
589 }
590 uint64_t get_const(struct regstat *cur,signed char reg)
591 {
592   int hr;
593   if(!reg) return 0;
594   for (hr=0;hr<HOST_REGS;hr++) {
595     if(cur->regmap[hr]==reg) {
596       return cur->constmap[hr];
597     }
598   }
599   printf("Unknown constant in r%d\n",reg);
600   exit(1);
601 }
602
603 // Least soon needed registers
604 // Look at the next ten instructions and see which registers
605 // will be used.  Try not to reallocate these.
//
// hsn[] is indexed by guest register number; on return hsn[r] holds the
// distance (in instructions from i) to the nearest use found, and entries
// that are not touched keep the value the caller initialised them with.
// preferred_reg is not used by the code in this function.
606 void lsn(u_char hsn[], int i, int *preferred_reg)
607 {
608   int j;
609   int b=-1;
  // Find the last instruction to look at: stop at the end of the block or
  // just after an unconditional jump (so its delay slot is included).
610   for(j=0;j<9;j++)
611   {
612     if(i+j>=slen) {
613       j=slen-i-1;
614       break;
615     }
616     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
617     {
618       // Don't go past an unconditional jump
619       j++;
620       break;
621     }
622   }
  // Walk backwards so that nearer uses overwrite farther ones.
623   for(;j>=0;j--)
624   {
625     if(rs1[i+j]) hsn[rs1[i+j]]=j;
626     if(rs2[i+j]) hsn[rs2[i+j]]=j;
627     if(rt1[i+j]) hsn[rt1[i+j]]=j;
628     if(rt2[i+j]) hsn[rt2[i+j]]=j;
629     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
630       // Stores can allocate zero
631       hsn[rs1[i+j]]=j;
632       hsn[rs2[i+j]]=j;
633     }
634     // On some architectures stores need invc_ptr
635     #if defined(HOST_IMM8)
636     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
637       hsn[INVCP]=j;
638     }
639     #endif
    // b ends up as the distance to the first branch in the window
640     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
641     {
642       hsn[CCREG]=j;
643       b=j;
644     }
645   }
646   if(b>=0)
647   {
    // Only branches whose target lies inside the current block are followed
648     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
649     {
650       // Follow first branch
651       int t=(ba[i+b]-start)>>2;
652       j=7-b;if(t+j>=slen) j=slen-t-1;
653       for(;j>=0;j--)
654       {
        // Sources read at the target count as needed j+b+2 instructions
        // from now, unless a nearer use is already recorded
655         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
656         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
657         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
658         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
659       }
660     }
661     // TODO: preferred register based on backward branch
662   }
663   // Delay slot should preferably not overwrite branch conditions or cycle count
664   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
665     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
666     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
667     hsn[CCREG]=1;
668     // ...or hash tables
669     hsn[RHASH]=1;
670     hsn[RHTBL]=1;
671   }
672   // Coprocessor load/store needs FTEMP, even if not declared
673   if(itype[i]==C1LS) {
674     hsn[FTEMP]=0;
675   }
676   // Load L/R also uses FTEMP as a temporary register
677   if(itype[i]==LOADLR) {
678     hsn[FTEMP]=0;
679   }
680   // Also 64-bit SDL/SDR
681   if(opcode[i]==0x2c||opcode[i]==0x2d) {
682     hsn[FTEMP]=0;
683   }
684   // Don't remove the TLB registers either
685   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
686     hsn[TLREG]=0;
687   }
688   // Don't remove the miniht registers
689   if(itype[i]==UJUMP||itype[i]==RJUMP)
690   {
691     hsn[RHASH]=0;
692     hsn[RHTBL]=0;
693   }
694 }
695
696 // We only want to allocate registers if we're going to use them again soon
//
// Returns 1 when guest register r is read again within the look-ahead
// window sooner than at least one entry of the hsn[] table is needed,
// otherwise 0.  Returns 0 immediately when the previous instruction is an
// unconditional jump that leaves the block.
697 int needed_again(int r, int i)
698 {
699   int j;
700   int b=-1;
701   int rn=10;
702   int hr;
703   u_char hsn[MAXREG+1];
704   int preferred_reg;
705   
  // 10 means "not needed within the window"
706   memset(hsn,10,sizeof(hsn));
707   lsn(hsn,i,&preferred_reg);
708   
709   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
710   {
711     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
712       return 0; // Don't need any registers if exiting the block
713   }
  // Determine how far ahead to look
714   for(j=0;j<9;j++)
715   {
716     if(i+j>=slen) {
717       j=slen-i-1;
718       break;
719     }
720     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
721     {
722       // Don't go past an unconditional jump
723       j++;
724       break;
725     }
    // Stop at SYSCALL, and at instruction words matching (word&0xfc00003f)==0x0d
726     if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d))
727     {
728       break;
729     }
730   }
  // Walk backwards: rn becomes the distance to the nearest read of r, and
  // is reset to 10 wherever r is flagged as unneeded.
731   for(;j>=1;j--)
732   {
733     if(rs1[i+j]==r) rn=j;
734     if(rs2[i+j]==r) rn=j;
735     if((unneeded_reg[i+j]>>r)&1) rn=10;
736     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
737     {
      // b is only consumed by the disabled code below
738       b=j;
739     }
740   }
741   /*
742   if(b>=0)
743   {
744     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
745     {
746       // Follow first branch
747       int o=rn;
748       int t=(ba[i+b]-start)>>2;
749       j=7-b;if(t+j>=slen) j=slen-t-1;
750       for(;j>=0;j--)
751       {
752         if(!((unneeded_reg[t+j]>>r)&1)) {
753           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
754           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
755         }
756         else rn=o;
757       }
758     }
759   }*/
  // NOTE(review): hsn[] is filled by lsn() using guest register numbers,
  // but is indexed by host register number here -- confirm this is intended.
760   for(hr=0;hr<HOST_REGS;hr++) {
761     if(hr!=EXCLUDE_REG) {
762       if(rn<hsn[hr]) return 1;
763     }
764   }
765   return 0;
766 }
767
768 // Try to match register allocations at the end of a loop with those
769 // at the beginning
770 int loop_reg(int i, int r, int hr)
771 {
772   int j,k;
773   for(j=0;j<9;j++)
774   {
775     if(i+j>=slen) {
776       j=slen-i-1;
777       break;
778     }
779     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
780     {
781       // Don't go past an unconditonal jump
782       j++;
783       break;
784     }
785   }
786   k=0;
787   if(i>0){
788     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
789       k--;
790   }
791   for(;k<j;k++)
792   {
793     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
794     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
795     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
796     {
797       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
798       {
799         int t=(ba[i+k]-start)>>2;
800         int reg=get_reg(regs[t].regmap_entry,r);
801         if(reg>=0) return reg;
802         //reg=get_reg(regs[t+1].regmap_entry,r);
803         //if(reg>=0) return reg;
804       }
805     }
806   }
807   return hr;
808 }
809
810
811 // Allocate every register, preserving source/target regs
812 void alloc_all(struct regstat *cur,int i)
813 {
814   int hr;
815   
816   for(hr=0;hr<HOST_REGS;hr++) {
817     if(hr!=EXCLUDE_REG) {
818       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
819          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
820       {
821         cur->regmap[hr]=-1;
822         cur->dirty&=~(1<<hr);
823       }
824       // Don't need zeros
825       if((cur->regmap[hr]&63)==0)
826       {
827         cur->regmap[hr]=-1;
828         cur->dirty&=~(1<<hr);
829       }
830     }
831   }
832 }
833
834
835 void div64(int64_t dividend,int64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842 void divu64(uint64_t dividend,uint64_t divisor)
843 {
844   lo=dividend/divisor;
845   hi=dividend%divisor;
846   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
847   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
848 }
849
850 void mult64(uint64_t m1,uint64_t m2)
851 {
852    unsigned long long int op1, op2, op3, op4;
853    unsigned long long int result1, result2, result3, result4;
854    unsigned long long int temp1, temp2, temp3, temp4;
855    int sign = 0;
856    
857    if (m1 < 0)
858      {
859     op2 = -m1;
860     sign = 1 - sign;
861      }
862    else op2 = m1;
863    if (m2 < 0)
864      {
865     op4 = -m2;
866     sign = 1 - sign;
867      }
868    else op4 = m2;
869    
870    op1 = op2 & 0xFFFFFFFF;
871    op2 = (op2 >> 32) & 0xFFFFFFFF;
872    op3 = op4 & 0xFFFFFFFF;
873    op4 = (op4 >> 32) & 0xFFFFFFFF;
874    
875    temp1 = op1 * op3;
876    temp2 = (temp1 >> 32) + op1 * op4;
877    temp3 = op2 * op3;
878    temp4 = (temp3 >> 32) + op2 * op4;
879    
880    result1 = temp1 & 0xFFFFFFFF;
881    result2 = temp2 + (temp3 & 0xFFFFFFFF);
882    result3 = (result2 >> 32) + temp4;
883    result4 = (result3 >> 32);
884    
885    lo = result1 | (result2 << 32);
886    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
887    if (sign)
888      {
889     hi = ~hi;
890     if (!lo) hi++;
891     else lo = ~lo + 1;
892      }
893 }
894
895 void multu64(uint64_t m1,uint64_t m2)
896 {
897    unsigned long long int op1, op2, op3, op4;
898    unsigned long long int result1, result2, result3, result4;
899    unsigned long long int temp1, temp2, temp3, temp4;
900    
901    op1 = m1 & 0xFFFFFFFF;
902    op2 = (m1 >> 32) & 0xFFFFFFFF;
903    op3 = m2 & 0xFFFFFFFF;
904    op4 = (m2 >> 32) & 0xFFFFFFFF;
905    
906    temp1 = op1 * op3;
907    temp2 = (temp1 >> 32) + op1 * op4;
908    temp3 = op2 * op3;
909    temp4 = (temp3 >> 32) + op2 * op4;
910    
911    result1 = temp1 & 0xFFFFFFFF;
912    result2 = temp2 + (temp3 & 0xFFFFFFFF);
913    result3 = (result2 >> 32) + temp4;
914    result4 = (result3 >> 32);
915    
916    lo = result1 | (result2 << 32);
917    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
918    
919   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
920   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
921 }
922
// Merge for an unaligned left load: the result keeps the low `bits` bits
// of original and takes everything above them from loaded shifted left by
// `bits`.  With bits==0 the loaded value replaces original entirely.
uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  if(bits==0) return loaded;
  const u_int drop=64-bits;
  const uint64_t kept=(original<<drop)>>drop; // low `bits` bits of original
  return kept|(loaded<<bits);
}
// Merge for an unaligned right load.  With s = bits^56, the result keeps
// the top s bits of original and takes the rest from loaded shifted right
// by s.  With bits==56 (s==0) the loaded value replaces original entirely.
uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
{
  const u_int shift=bits^56;
  if(shift==0) return loaded;
  const u_int keep=64-shift;
  const uint64_t kept=(original>>keep)<<keep; // top `shift` bits of original
  return kept|(loaded>>shift);
}
945
946 #ifdef __i386__
947 #include "assem_x86.c"
948 #endif
949 #ifdef __x86_64__
950 #include "assem_x64.c"
951 #endif
952 #ifdef __arm__
953 #include "assem_arm.c"
954 #endif
955
956 // Add virtual address mapping to linked list
957 void ll_add(struct ll_entry **head,int vaddr,void *addr)
958 {
959   struct ll_entry *new_entry;
960   new_entry=malloc(sizeof(struct ll_entry));
961   assert(new_entry!=NULL);
962   new_entry->vaddr=vaddr;
963   new_entry->reg32=0;
964   new_entry->addr=addr;
965   new_entry->next=*head;
966   *head=new_entry;
967 }
968
969 // Add virtual address mapping for 32-bit compiled block
970 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
971 {
972   struct ll_entry *new_entry;
973   new_entry=malloc(sizeof(struct ll_entry));
974   assert(new_entry!=NULL);
975   new_entry->vaddr=vaddr;
976   new_entry->reg32=reg32;
977   new_entry->addr=addr;
978   new_entry->next=*head;
979   *head=new_entry;
980 }
981
982 // Check if an address is already compiled
983 // but don't return addresses which are about to expire from the cache
//
// Returns the compiled code address for vaddr, or 0 when there is none.
// The hash table is consulted first; hits there must also pass isclean().
// Otherwise jump_in is searched for an entry with reg32==0, which is then
// cached in the hash table without evicting another address.
984 void *check_addr(u_int vaddr)
985 {
986   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
987   if(ht_bin[0]==vaddr) {
    // Expiry test: distance from the output pointer, scaled to 32 bits
988     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
989       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
990   }
991   if(ht_bin[2]==vaddr) {
992     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
993       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
994   }
995   u_int page=get_page(vaddr);
996   struct ll_entry *head;
997   head=jump_in[page];
998   while(head!=NULL) {
999     if(head->vaddr==vaddr&&head->reg32==0) {
      // NOTE(review): unlike the hash-table hits above, list entries are
      // returned without an isclean() check -- confirm this is intended.
1000       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1001         // Update existing entry with current address
1002         if(ht_bin[0]==vaddr) {
1003           ht_bin[1]=(int)head->addr;
1004           return head->addr;
1005         }
1006         if(ht_bin[2]==vaddr) {
1007           ht_bin[3]=(int)head->addr;
1008           return head->addr;
1009         }
1010         // Insert into hash table with low priority.
1011         // Don't evict existing entries, as they are probably
1012         // addresses that are being accessed frequently.
1013         if(ht_bin[0]==-1) {
1014           ht_bin[1]=(int)head->addr;
1015           ht_bin[0]=vaddr;
1016         }else if(ht_bin[2]==-1) {
1017           ht_bin[3]=(int)head->addr;
1018           ht_bin[2]=vaddr;
1019         }
1020         return head->addr;
1021       }
1022     }
1023     head=head->next;
1024   }
1025   return 0;
1026 }
1027
1028 void remove_hash(int vaddr)
1029 {
1030   //printf("remove hash: %x\n",vaddr);
1031   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1032   if(ht_bin[2]==vaddr) {
1033     ht_bin[2]=ht_bin[3]=-1;
1034   }
1035   if(ht_bin[0]==vaddr) {
1036     ht_bin[0]=ht_bin[2];
1037     ht_bin[1]=ht_bin[3];
1038     ht_bin[2]=ht_bin[3]=-1;
1039   }
1040 }
1041
1042 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1043 {
1044   struct ll_entry *next;
1045   while(*head) {
1046     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1047        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1048     {
1049       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1050       remove_hash((*head)->vaddr);
1051       next=(*head)->next;
1052       free(*head);
1053       *head=next;
1054     }
1055     else
1056     {
1057       head=&((*head)->next);
1058     }
1059   }
1060 }
1061
1062 // Remove all entries from linked list
1063 void ll_clear(struct ll_entry **head)
1064 {
1065   struct ll_entry *cur;
1066   struct ll_entry *next;
1067   if(cur=*head) {
1068     *head=0;
1069     while(cur) {
1070       next=cur->next;
1071       free(cur);
1072       cur=next;
1073     }
1074   }
1075 }
1076
1077 // Dereference the pointers and remove if it matches
1078 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1079 {
1080   while(head) {
1081     int ptr=get_pointer(head->addr);
1082     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1083     if(((ptr>>shift)==(addr>>shift)) ||
1084        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1085     {
1086       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1087       kill_pointer(head->addr);
1088     }
1089     head=head->next;
1090   }
1091 }
1092
1093 // This is called when we write to a compiled block (see do_invstub)
1094 int invalidate_page(u_int page)
1095 {
1096   int modified=0;
1097   struct ll_entry *head;
1098   struct ll_entry *next;
1099   head=jump_in[page];
1100   jump_in[page]=0;
1101   while(head!=NULL) {
1102     inv_debug("INVALIDATE: %x\n",head->vaddr);
1103     remove_hash(head->vaddr);
1104     next=head->next;
1105     free(head);
1106     head=next;
1107   }
1108   head=jump_out[page];
1109   jump_out[page]=0;
1110   while(head!=NULL) {
1111     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1112     kill_pointer(head->addr);
1113     modified=1;
1114     next=head->next;
1115     free(head);
1116     head=next;
1117   }
1118   return modified;
1119 }
1120 void invalidate_block(u_int block)
1121 {
1122   int modified;
1123   u_int page=get_page(block<<12);
1124   u_int vpage=get_vpage(block<<12);
1125   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1126   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1127   u_int first,last;
1128   first=last=page;
1129   struct ll_entry *head;
1130   head=jump_dirty[vpage];
1131   //printf("page=%d vpage=%d\n",page,vpage);
1132   while(head!=NULL) {
1133     u_int start,end;
1134     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1135       get_bounds((int)head->addr,&start,&end);
1136       //printf("start: %x end: %x\n",start,end);
1137       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1138         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1139           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1140           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1141         }
1142       }
1143       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1144         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1145           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1146           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1147         }
1148       }
1149     }
1150     head=head->next;
1151   }
1152   //printf("first=%d last=%d\n",first,last);
1153   modified=invalidate_page(page);
1154   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1155   assert(last<page+5);
1156   // Invalidate the adjacent pages if a block crosses a 4K boundary
1157   while(first<page) {
1158     invalidate_page(first);
1159     first++;
1160   }
1161   for(first=page+1;first<last;first++) {
1162     invalidate_page(first);
1163   }
1164   
1165   // Don't trap writes
1166   invalid_code[block]=1;
1167 #ifndef DISABLE_TLB
1168   // If there is a valid TLB entry for this page, remove write protect
1169   if(tlb_LUT_w[block]) {
1170     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1171     // CHECK: Is this right?
1172     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1173     u_int real_block=tlb_LUT_w[block]>>12;
1174     invalid_code[real_block]=1;
1175     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1176   }
1177   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1178 #endif
1179   #ifdef __arm__
1180   if(modified)
1181     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1182   #endif
1183   #ifdef USE_MINI_HT
1184   memset(mini_ht,-1,sizeof(mini_ht));
1185   #endif
1186 }
1187 void invalidate_addr(u_int addr)
1188 {
1189   invalidate_block(addr>>12);
1190 }
1191 void invalidate_all_pages()
1192 {
1193   u_int page,n;
1194   for(page=0;page<4096;page++)
1195     invalidate_page(page);
1196   for(page=0;page<1048576;page++)
1197     if(!invalid_code[page]) {
1198       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1199       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1200     }
1201   #ifdef __arm__
1202   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1203   #endif
1204   #ifdef USE_MINI_HT
1205   memset(mini_ht,-1,sizeof(mini_ht));
1206   #endif
1207   #ifndef DISABLE_TLB
1208   // TLB
1209   for(page=0;page<0x100000;page++) {
1210     if(tlb_LUT_r[page]) {
1211       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1212       if(!tlb_LUT_w[page]||!invalid_code[page])
1213         memory_map[page]|=0x40000000; // Write protect
1214     }
1215     else memory_map[page]=-1;
1216     if(page==0x80000) page=0xC0000;
1217   }
1218   tlb_hacks();
1219   #endif
1220 }
1221
1222 // Add an entry to jump_out after making a link
1223 void add_link(u_int vaddr,void *src)
1224 {
1225   u_int page=get_page(vaddr);
1226   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1227   ll_add(jump_out+page,vaddr,src);
1228   //int ptr=get_pointer(src);
1229   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1230 }
1231
1232 // If a code block was found to be unmodified (bit was set in
1233 // restore_candidate) and it remains unmodified (bit is clear
1234 // in invalid_code) then move the entries for that 4K page from
1235 // the dirty list to the clean list.
1236 void clean_blocks(u_int page)
1237 {
1238   struct ll_entry *head;
1239   inv_debug("INV: clean_blocks page=%d\n",page);
1240   head=jump_dirty[page];
1241   while(head!=NULL) {
1242     if(!invalid_code[head->vaddr>>12]) {
1243       // Don't restore blocks which are about to expire from the cache
1244       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1245         u_int start,end;
1246         if(verify_dirty((int)head->addr)) {
1247           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1248           u_int i;
1249           u_int inv=0;
1250           get_bounds((int)head->addr,&start,&end);
1251           if(start-(u_int)rdram<0x800000) {
1252             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1253               inv|=invalid_code[i];
1254             }
1255           }
1256           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1257             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1258             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1259             if(addr<start||addr>=end) inv=1;
1260           }
1261           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1262             inv=1;
1263           }
1264           if(!inv) {
1265             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1266             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1267               u_int ppage=page;
1268 #ifndef DISABLE_TLB
1269               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1270 #endif
1271               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1272               //printf("page=%x, addr=%x\n",page,head->vaddr);
1273               //assert(head->vaddr>>12==(page|0x80000));
1274               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1275               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1276               if(!head->reg32) {
1277                 if(ht_bin[0]==head->vaddr) {
1278                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1279                 }
1280                 if(ht_bin[2]==head->vaddr) {
1281                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1282                 }
1283               }
1284             }
1285           }
1286         }
1287       }
1288     }
1289     head=head->next;
1290   }
1291 }
1292
1293
1294 void mov_alloc(struct regstat *current,int i)
1295 {
1296   // Note: Don't need to actually alloc the source registers
1297   if((~current->is32>>rs1[i])&1) {
1298     //alloc_reg64(current,i,rs1[i]);
1299     alloc_reg64(current,i,rt1[i]);
1300     current->is32&=~(1LL<<rt1[i]);
1301   } else {
1302     //alloc_reg(current,i,rs1[i]);
1303     alloc_reg(current,i,rt1[i]);
1304     current->is32|=(1LL<<rt1[i]);
1305   }
1306   clear_const(current,rs1[i]);
1307   clear_const(current,rt1[i]);
1308   dirty_reg(current,rt1[i]);
1309 }
1310
1311 void shiftimm_alloc(struct regstat *current,int i)
1312 {
1313   clear_const(current,rs1[i]);
1314   clear_const(current,rt1[i]);
1315   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1316   {
1317     if(rt1[i]) {
1318       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1319       else lt1[i]=rs1[i];
1320       alloc_reg(current,i,rt1[i]);
1321       current->is32|=1LL<<rt1[i];
1322       dirty_reg(current,rt1[i]);
1323     }
1324   }
1325   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1326   {
1327     if(rt1[i]) {
1328       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1329       alloc_reg64(current,i,rt1[i]);
1330       current->is32&=~(1LL<<rt1[i]);
1331       dirty_reg(current,rt1[i]);
1332     }
1333   }
1334   if(opcode2[i]==0x3c) // DSLL32
1335   {
1336     if(rt1[i]) {
1337       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1338       alloc_reg64(current,i,rt1[i]);
1339       current->is32&=~(1LL<<rt1[i]);
1340       dirty_reg(current,rt1[i]);
1341     }
1342   }
1343   if(opcode2[i]==0x3e) // DSRL32
1344   {
1345     if(rt1[i]) {
1346       alloc_reg64(current,i,rs1[i]);
1347       if(imm[i]==32) {
1348         alloc_reg64(current,i,rt1[i]);
1349         current->is32&=~(1LL<<rt1[i]);
1350       } else {
1351         alloc_reg(current,i,rt1[i]);
1352         current->is32|=1LL<<rt1[i];
1353       }
1354       dirty_reg(current,rt1[i]);
1355     }
1356   }
1357   if(opcode2[i]==0x3f) // DSRA32
1358   {
1359     if(rt1[i]) {
1360       alloc_reg64(current,i,rs1[i]);
1361       alloc_reg(current,i,rt1[i]);
1362       current->is32|=1LL<<rt1[i];
1363       dirty_reg(current,rt1[i]);
1364     }
1365   }
1366 }
1367
1368 void shift_alloc(struct regstat *current,int i)
1369 {
1370   if(rt1[i]) {
1371     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1372     {
1373       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1374       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1375       alloc_reg(current,i,rt1[i]);
1376       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1377       current->is32|=1LL<<rt1[i];
1378     } else { // DSLLV/DSRLV/DSRAV
1379       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1380       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1381       alloc_reg64(current,i,rt1[i]);
1382       current->is32&=~(1LL<<rt1[i]);
1383       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1384         alloc_reg_temp(current,i,-1);
1385     }
1386     clear_const(current,rs1[i]);
1387     clear_const(current,rs2[i]);
1388     clear_const(current,rt1[i]);
1389     dirty_reg(current,rt1[i]);
1390   }
1391 }
1392
1393 void alu_alloc(struct regstat *current,int i)
1394 {
1395   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1396     if(rt1[i]) {
1397       if(rs1[i]&&rs2[i]) {
1398         alloc_reg(current,i,rs1[i]);
1399         alloc_reg(current,i,rs2[i]);
1400       }
1401       else {
1402         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1403         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1404       }
1405       alloc_reg(current,i,rt1[i]);
1406     }
1407     current->is32|=1LL<<rt1[i];
1408   }
1409   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1410     if(rt1[i]) {
1411       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1412       {
1413         alloc_reg64(current,i,rs1[i]);
1414         alloc_reg64(current,i,rs2[i]);
1415         alloc_reg(current,i,rt1[i]);
1416       } else {
1417         alloc_reg(current,i,rs1[i]);
1418         alloc_reg(current,i,rs2[i]);
1419         alloc_reg(current,i,rt1[i]);
1420       }
1421     }
1422     current->is32|=1LL<<rt1[i];
1423   }
1424   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1425     if(rt1[i]) {
1426       if(rs1[i]&&rs2[i]) {
1427         alloc_reg(current,i,rs1[i]);
1428         alloc_reg(current,i,rs2[i]);
1429       }
1430       else
1431       {
1432         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1433         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1434       }
1435       alloc_reg(current,i,rt1[i]);
1436       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1437       {
1438         if(!((current->uu>>rt1[i])&1)) {
1439           alloc_reg64(current,i,rt1[i]);
1440         }
1441         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1442           if(rs1[i]&&rs2[i]) {
1443             alloc_reg64(current,i,rs1[i]);
1444             alloc_reg64(current,i,rs2[i]);
1445           }
1446           else
1447           {
1448             // Is is really worth it to keep 64-bit values in registers?
1449             #ifdef NATIVE_64BIT
1450             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1451             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1452             #endif
1453           }
1454         }
1455         current->is32&=~(1LL<<rt1[i]);
1456       } else {
1457         current->is32|=1LL<<rt1[i];
1458       }
1459     }
1460   }
1461   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1462     if(rt1[i]) {
1463       if(rs1[i]&&rs2[i]) {
1464         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1465           alloc_reg64(current,i,rs1[i]);
1466           alloc_reg64(current,i,rs2[i]);
1467           alloc_reg64(current,i,rt1[i]);
1468         } else {
1469           alloc_reg(current,i,rs1[i]);
1470           alloc_reg(current,i,rs2[i]);
1471           alloc_reg(current,i,rt1[i]);
1472         }
1473       }
1474       else {
1475         alloc_reg(current,i,rt1[i]);
1476         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1477           // DADD used as move, or zeroing
1478           // If we have a 64-bit source, then make the target 64 bits too
1479           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1480             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1481             alloc_reg64(current,i,rt1[i]);
1482           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1483             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1484             alloc_reg64(current,i,rt1[i]);
1485           }
1486           if(opcode2[i]>=0x2e&&rs2[i]) {
1487             // DSUB used as negation - 64-bit result
1488             // If we have a 32-bit register, extend it to 64 bits
1489             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1490             alloc_reg64(current,i,rt1[i]);
1491           }
1492         }
1493       }
1494       if(rs1[i]&&rs2[i]) {
1495         current->is32&=~(1LL<<rt1[i]);
1496       } else if(rs1[i]) {
1497         current->is32&=~(1LL<<rt1[i]);
1498         if((current->is32>>rs1[i])&1)
1499           current->is32|=1LL<<rt1[i];
1500       } else if(rs2[i]) {
1501         current->is32&=~(1LL<<rt1[i]);
1502         if((current->is32>>rs2[i])&1)
1503           current->is32|=1LL<<rt1[i];
1504       } else {
1505         current->is32|=1LL<<rt1[i];
1506       }
1507     }
1508   }
1509   clear_const(current,rs1[i]);
1510   clear_const(current,rs2[i]);
1511   clear_const(current,rt1[i]);
1512   dirty_reg(current,rt1[i]);
1513 }
1514
1515 void imm16_alloc(struct regstat *current,int i)
1516 {
1517   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1518   else lt1[i]=rs1[i];
1519   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1520   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1521     current->is32&=~(1LL<<rt1[i]);
1522     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1523       // TODO: Could preserve the 32-bit flag if the immediate is zero
1524       alloc_reg64(current,i,rt1[i]);
1525       alloc_reg64(current,i,rs1[i]);
1526     }
1527     clear_const(current,rs1[i]);
1528     clear_const(current,rt1[i]);
1529   }
1530   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1531     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1532     current->is32|=1LL<<rt1[i];
1533     clear_const(current,rs1[i]);
1534     clear_const(current,rt1[i]);
1535   }
1536   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1537     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1538       if(rs1[i]!=rt1[i]) {
1539         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1540         alloc_reg64(current,i,rt1[i]);
1541         current->is32&=~(1LL<<rt1[i]);
1542       }
1543     }
1544     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1545     if(is_const(current,rs1[i])) {
1546       int v=get_const(current,rs1[i]);
1547       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1548       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1549       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1550     }
1551     else clear_const(current,rt1[i]);
1552   }
1553   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1554     if(is_const(current,rs1[i])) {
1555       int v=get_const(current,rs1[i]);
1556       set_const(current,rt1[i],v+imm[i]);
1557     }
1558     else clear_const(current,rt1[i]);
1559     current->is32|=1LL<<rt1[i];
1560   }
1561   else {
1562     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1563     current->is32|=1LL<<rt1[i];
1564   }
1565   dirty_reg(current,rt1[i]);
1566 }
1567
1568 void load_alloc(struct regstat *current,int i)
1569 {
1570   clear_const(current,rt1[i]);
1571   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1572   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1573   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1574   if(rt1[i]) {
1575     alloc_reg(current,i,rt1[i]);
1576     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1577     {
1578       current->is32&=~(1LL<<rt1[i]);
1579       alloc_reg64(current,i,rt1[i]);
1580     }
1581     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1582     {
1583       current->is32&=~(1LL<<rt1[i]);
1584       alloc_reg64(current,i,rt1[i]);
1585       alloc_all(current,i);
1586       alloc_reg64(current,i,FTEMP);
1587     }
1588     else current->is32|=1LL<<rt1[i];
1589     dirty_reg(current,rt1[i]);
1590     // If using TLB, need a register for pointer to the mapping table
1591     if(using_tlb) alloc_reg(current,i,TLREG);
1592     // LWL/LWR need a temporary register for the old value
1593     if(opcode[i]==0x22||opcode[i]==0x26)
1594     {
1595       alloc_reg(current,i,FTEMP);
1596       alloc_reg_temp(current,i,-1);
1597     }
1598   }
1599   else
1600   {
1601     // Load to r0 (dummy load)
1602     // but we still need a register to calculate the address
1603     alloc_reg_temp(current,i,-1);
1604   }
1605 }
1606
1607 void store_alloc(struct regstat *current,int i)
1608 {
1609   clear_const(current,rs2[i]);
1610   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1611   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1612   alloc_reg(current,i,rs2[i]);
1613   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1614     alloc_reg64(current,i,rs2[i]);
1615     if(rs2[i]) alloc_reg(current,i,FTEMP);
1616   }
1617   // If using TLB, need a register for pointer to the mapping table
1618   if(using_tlb) alloc_reg(current,i,TLREG);
1619   #if defined(HOST_IMM8)
1620   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1621   else alloc_reg(current,i,INVCP);
1622   #endif
1623   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1624     alloc_reg(current,i,FTEMP);
1625   }
1626   // We need a temporary register for address generation
1627   alloc_reg_temp(current,i,-1);
1628 }
1629
1630 void c1ls_alloc(struct regstat *current,int i)
1631 {
1632   //clear_const(current,rs1[i]); // FIXME
1633   clear_const(current,rt1[i]);
1634   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1635   alloc_reg(current,i,CSREG); // Status
1636   alloc_reg(current,i,FTEMP);
1637   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1638     alloc_reg64(current,i,FTEMP);
1639   }
1640   // If using TLB, need a register for pointer to the mapping table
1641   if(using_tlb) alloc_reg(current,i,TLREG);
1642   #if defined(HOST_IMM8)
1643   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1644   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1645     alloc_reg(current,i,INVCP);
1646   #endif
1647   // We need a temporary register for address generation
1648   alloc_reg_temp(current,i,-1);
1649 }
1650
1651 #ifndef multdiv_alloc
1652 void multdiv_alloc(struct regstat *current,int i)
1653 {
1654   //  case 0x18: MULT
1655   //  case 0x19: MULTU
1656   //  case 0x1A: DIV
1657   //  case 0x1B: DIVU
1658   //  case 0x1C: DMULT
1659   //  case 0x1D: DMULTU
1660   //  case 0x1E: DDIV
1661   //  case 0x1F: DDIVU
1662   clear_const(current,rs1[i]);
1663   clear_const(current,rs2[i]);
1664   if(rs1[i]&&rs2[i])
1665   {
1666     if((opcode2[i]&4)==0) // 32-bit
1667     {
1668       current->u&=~(1LL<<HIREG);
1669       current->u&=~(1LL<<LOREG);
1670       alloc_reg(current,i,HIREG);
1671       alloc_reg(current,i,LOREG);
1672       alloc_reg(current,i,rs1[i]);
1673       alloc_reg(current,i,rs2[i]);
1674       current->is32|=1LL<<HIREG;
1675       current->is32|=1LL<<LOREG;
1676       dirty_reg(current,HIREG);
1677       dirty_reg(current,LOREG);
1678     }
1679     else // 64-bit
1680     {
1681       current->u&=~(1LL<<HIREG);
1682       current->u&=~(1LL<<LOREG);
1683       current->uu&=~(1LL<<HIREG);
1684       current->uu&=~(1LL<<LOREG);
1685       alloc_reg64(current,i,HIREG);
1686       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1687       alloc_reg64(current,i,rs1[i]);
1688       alloc_reg64(current,i,rs2[i]);
1689       alloc_all(current,i);
1690       current->is32&=~(1LL<<HIREG);
1691       current->is32&=~(1LL<<LOREG);
1692       dirty_reg(current,HIREG);
1693       dirty_reg(current,LOREG);
1694     }
1695   }
1696   else
1697   {
1698     // Multiply by zero is zero.
1699     // MIPS does not have a divide by zero exception.
1700     // The result is undefined, we return zero.
1701     alloc_reg(current,i,HIREG);
1702     alloc_reg(current,i,LOREG);
1703     current->is32|=1LL<<HIREG;
1704     current->is32|=1LL<<LOREG;
1705     dirty_reg(current,HIREG);
1706     dirty_reg(current,LOREG);
1707   }
1708 }
1709 #endif
1710
1711 void cop0_alloc(struct regstat *current,int i)
1712 {
1713   if(opcode2[i]==0) // MFC0
1714   {
1715     if(rt1[i]) {
1716       clear_const(current,rt1[i]);
1717       alloc_all(current,i);
1718       alloc_reg(current,i,rt1[i]);
1719       current->is32|=1LL<<rt1[i];
1720       dirty_reg(current,rt1[i]);
1721     }
1722   }
1723   else if(opcode2[i]==4) // MTC0
1724   {
1725     if(rs1[i]){
1726       clear_const(current,rs1[i]);
1727       alloc_reg(current,i,rs1[i]);
1728       alloc_all(current,i);
1729     }
1730     else {
1731       alloc_all(current,i); // FIXME: Keep r0
1732       current->u&=~1LL;
1733       alloc_reg(current,i,0);
1734     }
1735   }
1736   else
1737   {
1738     // TLBR/TLBWI/TLBWR/TLBP/ERET
1739     assert(opcode2[i]==0x10);
1740     alloc_all(current,i);
1741   }
1742 }
1743
1744 void cop1_alloc(struct regstat *current,int i)
1745 {
1746   alloc_reg(current,i,CSREG); // Load status
1747   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1748   {
1749     assert(rt1[i]);
1750     clear_const(current,rt1[i]);
1751     if(opcode2[i]==1) {
1752       alloc_reg64(current,i,rt1[i]); // DMFC1
1753       current->is32&=~(1LL<<rt1[i]);
1754     }else{
1755       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1756       current->is32|=1LL<<rt1[i];
1757     }
1758     dirty_reg(current,rt1[i]);
1759     alloc_reg_temp(current,i,-1);
1760   }
1761   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1762   {
1763     if(rs1[i]){
1764       clear_const(current,rs1[i]);
1765       if(opcode2[i]==5)
1766         alloc_reg64(current,i,rs1[i]); // DMTC1
1767       else
1768         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1769       alloc_reg_temp(current,i,-1);
1770     }
1771     else {
1772       current->u&=~1LL;
1773       alloc_reg(current,i,0);
1774       alloc_reg_temp(current,i,-1);
1775     }
1776   }
1777 }
1778 void fconv_alloc(struct regstat *current,int i)
1779 {
1780   alloc_reg(current,i,CSREG); // Load status
1781   alloc_reg_temp(current,i,-1);
1782 }
1783 void float_alloc(struct regstat *current,int i)
1784 {
1785   alloc_reg(current,i,CSREG); // Load status
1786   alloc_reg_temp(current,i,-1);
1787 }
1788 void fcomp_alloc(struct regstat *current,int i)
1789 {
1790   alloc_reg(current,i,CSREG); // Load status
1791   alloc_reg(current,i,FSREG); // Load flags
1792   dirty_reg(current,FSREG); // Flag will be modified
1793   alloc_reg_temp(current,i,-1);
1794 }
1795
1796 void syscall_alloc(struct regstat *current,int i)
1797 {
1798   alloc_cc(current,i);
1799   dirty_reg(current,CCREG);
1800   alloc_all(current,i);
1801   current->isconst=0;
1802 }
1803
1804 void delayslot_alloc(struct regstat *current,int i)
1805 {
1806   switch(itype[i]) {
1807     case UJUMP:
1808     case CJUMP:
1809     case SJUMP:
1810     case RJUMP:
1811     case FJUMP:
1812     case SYSCALL:
1813     case SPAN:
1814       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1815       printf("Disabled speculative precompilation\n");
1816       stop_after_jal=1;
1817       break;
1818     case IMM16:
1819       imm16_alloc(current,i);
1820       break;
1821     case LOAD:
1822     case LOADLR:
1823       load_alloc(current,i);
1824       break;
1825     case STORE:
1826     case STORELR:
1827       store_alloc(current,i);
1828       break;
1829     case ALU:
1830       alu_alloc(current,i);
1831       break;
1832     case SHIFT:
1833       shift_alloc(current,i);
1834       break;
1835     case MULTDIV:
1836       multdiv_alloc(current,i);
1837       break;
1838     case SHIFTIMM:
1839       shiftimm_alloc(current,i);
1840       break;
1841     case MOV:
1842       mov_alloc(current,i);
1843       break;
1844     case COP0:
1845       cop0_alloc(current,i);
1846       break;
1847     case COP1:
1848       cop1_alloc(current,i);
1849       break;
1850     case C1LS:
1851       c1ls_alloc(current,i);
1852       break;
1853     case FCONV:
1854       fconv_alloc(current,i);
1855       break;
1856     case FLOAT:
1857       float_alloc(current,i);
1858       break;
1859     case FCOMP:
1860       fcomp_alloc(current,i);
1861       break;
1862   }
1863 }
1864
1865 // Special case where a branch and delay slot span two pages in virtual memory
1866 static void pagespan_alloc(struct regstat *current,int i)
1867 {
1868   current->isconst=0;
1869   current->wasconst=0;
1870   regs[i].wasconst=0;
1871   alloc_all(current,i);
1872   alloc_cc(current,i);
1873   dirty_reg(current,CCREG);
1874   if(opcode[i]==3) // JAL
1875   {
1876     alloc_reg(current,i,31);
1877     dirty_reg(current,31);
1878   }
1879   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1880   {
1881     alloc_reg(current,i,rs1[i]);
1882     if (rt1[i]==31) {
1883       alloc_reg(current,i,31);
1884       dirty_reg(current,31);
1885     }
1886   }
1887   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1888   {
1889     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1890     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1891     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1892     {
1893       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1894       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1895     }
1896   }
1897   else
1898   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1899   {
1900     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1901     if(!((current->is32>>rs1[i])&1))
1902     {
1903       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1904     }
1905   }
1906   else
1907   if(opcode[i]==0x11) // BC1
1908   {
1909     alloc_reg(current,i,FSREG);
1910     alloc_reg(current,i,CSREG);
1911   }
1912   //else ...
1913 }
1914
1915 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1916 {
1917   stubs[stubcount][0]=type;
1918   stubs[stubcount][1]=addr;
1919   stubs[stubcount][2]=retaddr;
1920   stubs[stubcount][3]=a;
1921   stubs[stubcount][4]=b;
1922   stubs[stubcount][5]=c;
1923   stubs[stubcount][6]=d;
1924   stubs[stubcount][7]=e;
1925   stubcount++;
1926 }
1927
1928 // Write out a single register
1929 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1930 {
1931   int hr;
1932   for(hr=0;hr<HOST_REGS;hr++) {
1933     if(hr!=EXCLUDE_REG) {
1934       if((regmap[hr]&63)==r) {
1935         if((dirty>>hr)&1) {
1936           if(regmap[hr]<64) {
1937             emit_storereg(r,hr);
1938             if((is32>>regmap[hr])&1) {
1939               emit_sarimm(hr,31,hr);
1940               emit_storereg(r|64,hr);
1941             }
1942           }else{
1943             emit_storereg(r|64,hr);
1944           }
1945         }
1946       }
1947     }
1948   }
1949 }
1950
1951 int mchecksum()
1952 {
1953   //if(!tracedebug) return 0;
1954   int i;
1955   int sum=0;
1956   for(i=0;i<2097152;i++) {
1957     unsigned int temp=sum;
1958     sum<<=1;
1959     sum|=(~temp)>>31;
1960     sum^=((u_int *)rdram)[i];
1961   }
1962   return sum;
1963 }
1964 int rchecksum()
1965 {
1966   int i;
1967   int sum=0;
1968   for(i=0;i<64;i++)
1969     sum^=((u_int *)reg)[i];
1970   return sum;
1971 }
1972 int fchecksum()
1973 {
1974   int i;
1975   int sum=0;
1976   for(i=0;i<64;i++)
1977     sum^=((u_int *)reg_cop1_fgr_64)[i];
1978   return sum;
1979 }
1980 void rlist()
1981 {
1982   int i;
1983   printf("TRACE: ");
1984   for(i=0;i<32;i++)
1985     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1986   printf("\n");
1987   printf("TRACE: ");
1988   for(i=0;i<32;i++)
1989     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1990   printf("\n");
1991 }
1992
1993 void enabletrace()
1994 {
1995   tracedebug=1;
1996 }
1997
1998 void memdebug(int i)
1999 {
2000   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2001   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2002   //rlist();
2003   //if(tracedebug) {
2004   //if(Count>=-2084597794) {
2005   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2006   //if(0) {
2007     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2008     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2009     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2010     rlist();
2011     #ifdef __i386__
2012     printf("TRACE: %x\n",(&i)[-1]);
2013     #endif
2014     #ifdef __arm__
2015     int j;
2016     printf("TRACE: %x \n",(&j)[10]);
2017     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2018     #endif
2019     //fflush(stdout);
2020   }
2021   //printf("TRACE: %x\n",(&i)[-1]);
2022 }
2023
2024 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2025 {
2026   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2027 }
2028
2029 void alu_assemble(int i,struct regstat *i_regs)
2030 {
2031   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2032     if(rt1[i]) {
2033       signed char s1,s2,t;
2034       t=get_reg(i_regs->regmap,rt1[i]);
2035       if(t>=0) {
2036         s1=get_reg(i_regs->regmap,rs1[i]);
2037         s2=get_reg(i_regs->regmap,rs2[i]);
2038         if(rs1[i]&&rs2[i]) {
2039           assert(s1>=0);
2040           assert(s2>=0);
2041           if(opcode2[i]&2) emit_sub(s1,s2,t);
2042           else emit_add(s1,s2,t);
2043         }
2044         else if(rs1[i]) {
2045           if(s1>=0) emit_mov(s1,t);
2046           else emit_loadreg(rs1[i],t);
2047         }
2048         else if(rs2[i]) {
2049           if(s2>=0) {
2050             if(opcode2[i]&2) emit_neg(s2,t);
2051             else emit_mov(s2,t);
2052           }
2053           else {
2054             emit_loadreg(rs2[i],t);
2055             if(opcode2[i]&2) emit_neg(t,t);
2056           }
2057         }
2058         else emit_zeroreg(t);
2059       }
2060     }
2061   }
2062   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2063     if(rt1[i]) {
2064       signed char s1l,s2l,s1h,s2h,tl,th;
2065       tl=get_reg(i_regs->regmap,rt1[i]);
2066       th=get_reg(i_regs->regmap,rt1[i]|64);
2067       if(tl>=0) {
2068         s1l=get_reg(i_regs->regmap,rs1[i]);
2069         s2l=get_reg(i_regs->regmap,rs2[i]);
2070         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2071         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2072         if(rs1[i]&&rs2[i]) {
2073           assert(s1l>=0);
2074           assert(s2l>=0);
2075           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2076           else emit_adds(s1l,s2l,tl);
2077           if(th>=0) {
2078             #ifdef INVERTED_CARRY
2079             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2080             #else
2081             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2082             #endif
2083             else emit_add(s1h,s2h,th);
2084           }
2085         }
2086         else if(rs1[i]) {
2087           if(s1l>=0) emit_mov(s1l,tl);
2088           else emit_loadreg(rs1[i],tl);
2089           if(th>=0) {
2090             if(s1h>=0) emit_mov(s1h,th);
2091             else emit_loadreg(rs1[i]|64,th);
2092           }
2093         }
2094         else if(rs2[i]) {
2095           if(s2l>=0) {
2096             if(opcode2[i]&2) emit_negs(s2l,tl);
2097             else emit_mov(s2l,tl);
2098           }
2099           else {
2100             emit_loadreg(rs2[i],tl);
2101             if(opcode2[i]&2) emit_negs(tl,tl);
2102           }
2103           if(th>=0) {
2104             #ifdef INVERTED_CARRY
2105             if(s2h>=0) emit_mov(s2h,th);
2106             else emit_loadreg(rs2[i]|64,th);
2107             if(opcode2[i]&2) {
2108               emit_adcimm(-1,th); // x86 has inverted carry flag
2109               emit_not(th,th);
2110             }
2111             #else
2112             if(opcode2[i]&2) {
2113               if(s2h>=0) emit_rscimm(s2h,0,th);
2114               else {
2115                 emit_loadreg(rs2[i]|64,th);
2116                 emit_rscimm(th,0,th);
2117               }
2118             }else{
2119               if(s2h>=0) emit_mov(s2h,th);
2120               else emit_loadreg(rs2[i]|64,th);
2121             }
2122             #endif
2123           }
2124         }
2125         else {
2126           emit_zeroreg(tl);
2127           if(th>=0) emit_zeroreg(th);
2128         }
2129       }
2130     }
2131   }
2132   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2133     if(rt1[i]) {
2134       signed char s1l,s1h,s2l,s2h,t;
2135       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2136       {
2137         t=get_reg(i_regs->regmap,rt1[i]);
2138         //assert(t>=0);
2139         if(t>=0) {
2140           s1l=get_reg(i_regs->regmap,rs1[i]);
2141           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2142           s2l=get_reg(i_regs->regmap,rs2[i]);
2143           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2144           if(rs2[i]==0) // rx<r0
2145           {
2146             assert(s1h>=0);
2147             if(opcode2[i]==0x2a) // SLT
2148               emit_shrimm(s1h,31,t);
2149             else // SLTU (unsigned can not be less than zero)
2150               emit_zeroreg(t);
2151           }
2152           else if(rs1[i]==0) // r0<rx
2153           {
2154             assert(s2h>=0);
2155             if(opcode2[i]==0x2a) // SLT
2156               emit_set_gz64_32(s2h,s2l,t);
2157             else // SLTU (set if not zero)
2158               emit_set_nz64_32(s2h,s2l,t);
2159           }
2160           else {
2161             assert(s1l>=0);assert(s1h>=0);
2162             assert(s2l>=0);assert(s2h>=0);
2163             if(opcode2[i]==0x2a) // SLT
2164               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2165             else // SLTU
2166               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2167           }
2168         }
2169       } else {
2170         t=get_reg(i_regs->regmap,rt1[i]);
2171         //assert(t>=0);
2172         if(t>=0) {
2173           s1l=get_reg(i_regs->regmap,rs1[i]);
2174           s2l=get_reg(i_regs->regmap,rs2[i]);
2175           if(rs2[i]==0) // rx<r0
2176           {
2177             assert(s1l>=0);
2178             if(opcode2[i]==0x2a) // SLT
2179               emit_shrimm(s1l,31,t);
2180             else // SLTU (unsigned can not be less than zero)
2181               emit_zeroreg(t);
2182           }
2183           else if(rs1[i]==0) // r0<rx
2184           {
2185             assert(s2l>=0);
2186             if(opcode2[i]==0x2a) // SLT
2187               emit_set_gz32(s2l,t);
2188             else // SLTU (set if not zero)
2189               emit_set_nz32(s2l,t);
2190           }
2191           else{
2192             assert(s1l>=0);assert(s2l>=0);
2193             if(opcode2[i]==0x2a) // SLT
2194               emit_set_if_less32(s1l,s2l,t);
2195             else // SLTU
2196               emit_set_if_carry32(s1l,s2l,t);
2197           }
2198         }
2199       }
2200     }
2201   }
2202   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2203     if(rt1[i]) {
2204       signed char s1l,s1h,s2l,s2h,th,tl;
2205       tl=get_reg(i_regs->regmap,rt1[i]);
2206       th=get_reg(i_regs->regmap,rt1[i]|64);
2207       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2208       {
2209         assert(tl>=0);
2210         if(tl>=0) {
2211           s1l=get_reg(i_regs->regmap,rs1[i]);
2212           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2213           s2l=get_reg(i_regs->regmap,rs2[i]);
2214           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2215           if(rs1[i]&&rs2[i]) {
2216             assert(s1l>=0);assert(s1h>=0);
2217             assert(s2l>=0);assert(s2h>=0);
2218             if(opcode2[i]==0x24) { // AND
2219               emit_and(s1l,s2l,tl);
2220               emit_and(s1h,s2h,th);
2221             } else
2222             if(opcode2[i]==0x25) { // OR
2223               emit_or(s1l,s2l,tl);
2224               emit_or(s1h,s2h,th);
2225             } else
2226             if(opcode2[i]==0x26) { // XOR
2227               emit_xor(s1l,s2l,tl);
2228               emit_xor(s1h,s2h,th);
2229             } else
2230             if(opcode2[i]==0x27) { // NOR
2231               emit_or(s1l,s2l,tl);
2232               emit_or(s1h,s2h,th);
2233               emit_not(tl,tl);
2234               emit_not(th,th);
2235             }
2236           }
2237           else
2238           {
2239             if(opcode2[i]==0x24) { // AND
2240               emit_zeroreg(tl);
2241               emit_zeroreg(th);
2242             } else
2243             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2244               if(rs1[i]){
2245                 if(s1l>=0) emit_mov(s1l,tl);
2246                 else emit_loadreg(rs1[i],tl);
2247                 if(s1h>=0) emit_mov(s1h,th);
2248                 else emit_loadreg(rs1[i]|64,th);
2249               }
2250               else
2251               if(rs2[i]){
2252                 if(s2l>=0) emit_mov(s2l,tl);
2253                 else emit_loadreg(rs2[i],tl);
2254                 if(s2h>=0) emit_mov(s2h,th);
2255                 else emit_loadreg(rs2[i]|64,th);
2256               }
2257               else{
2258                 emit_zeroreg(tl);
2259                 emit_zeroreg(th);
2260               }
2261             } else
2262             if(opcode2[i]==0x27) { // NOR
2263               if(rs1[i]){
2264                 if(s1l>=0) emit_not(s1l,tl);
2265                 else{
2266                   emit_loadreg(rs1[i],tl);
2267                   emit_not(tl,tl);
2268                 }
2269                 if(s1h>=0) emit_not(s1h,th);
2270                 else{
2271                   emit_loadreg(rs1[i]|64,th);
2272                   emit_not(th,th);
2273                 }
2274               }
2275               else
2276               if(rs2[i]){
2277                 if(s2l>=0) emit_not(s2l,tl);
2278                 else{
2279                   emit_loadreg(rs2[i],tl);
2280                   emit_not(tl,tl);
2281                 }
2282                 if(s2h>=0) emit_not(s2h,th);
2283                 else{
2284                   emit_loadreg(rs2[i]|64,th);
2285                   emit_not(th,th);
2286                 }
2287               }
2288               else {
2289                 emit_movimm(-1,tl);
2290                 emit_movimm(-1,th);
2291               }
2292             }
2293           }
2294         }
2295       }
2296       else
2297       {
2298         // 32 bit
2299         if(tl>=0) {
2300           s1l=get_reg(i_regs->regmap,rs1[i]);
2301           s2l=get_reg(i_regs->regmap,rs2[i]);
2302           if(rs1[i]&&rs2[i]) {
2303             assert(s1l>=0);
2304             assert(s2l>=0);
2305             if(opcode2[i]==0x24) { // AND
2306               emit_and(s1l,s2l,tl);
2307             } else
2308             if(opcode2[i]==0x25) { // OR
2309               emit_or(s1l,s2l,tl);
2310             } else
2311             if(opcode2[i]==0x26) { // XOR
2312               emit_xor(s1l,s2l,tl);
2313             } else
2314             if(opcode2[i]==0x27) { // NOR
2315               emit_or(s1l,s2l,tl);
2316               emit_not(tl,tl);
2317             }
2318           }
2319           else
2320           {
2321             if(opcode2[i]==0x24) { // AND
2322               emit_zeroreg(tl);
2323             } else
2324             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2325               if(rs1[i]){
2326                 if(s1l>=0) emit_mov(s1l,tl);
2327                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2328               }
2329               else
2330               if(rs2[i]){
2331                 if(s2l>=0) emit_mov(s2l,tl);
2332                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2333               }
2334               else emit_zeroreg(tl);
2335             } else
2336             if(opcode2[i]==0x27) { // NOR
2337               if(rs1[i]){
2338                 if(s1l>=0) emit_not(s1l,tl);
2339                 else {
2340                   emit_loadreg(rs1[i],tl);
2341                   emit_not(tl,tl);
2342                 }
2343               }
2344               else
2345               if(rs2[i]){
2346                 if(s2l>=0) emit_not(s2l,tl);
2347                 else {
2348                   emit_loadreg(rs2[i],tl);
2349                   emit_not(tl,tl);
2350                 }
2351               }
2352               else emit_movimm(-1,tl);
2353             }
2354           }
2355         }
2356       }
2357     }
2358   }
2359 }
2360
2361 void imm16_assemble(int i,struct regstat *i_regs)
2362 {
2363   if (opcode[i]==0x0f) { // LUI
2364     if(rt1[i]) {
2365       signed char t;
2366       t=get_reg(i_regs->regmap,rt1[i]);
2367       //assert(t>=0);
2368       if(t>=0) {
2369         if(!((i_regs->isconst>>t)&1))
2370           emit_movimm(imm[i]<<16,t);
2371       }
2372     }
2373   }
2374   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2375     if(rt1[i]) {
2376       signed char s,t;
2377       t=get_reg(i_regs->regmap,rt1[i]);
2378       s=get_reg(i_regs->regmap,rs1[i]);
2379       if(rs1[i]) {
2380         //assert(t>=0);
2381         //assert(s>=0);
2382         if(t>=0) {
2383           if(!((i_regs->isconst>>t)&1)) {
2384             if(s<0) {
2385               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2386               emit_addimm(t,imm[i],t);
2387             }else{
2388               if(!((i_regs->wasconst>>s)&1))
2389                 emit_addimm(s,imm[i],t);
2390               else
2391                 emit_movimm(constmap[i][s]+imm[i],t);
2392             }
2393           }
2394         }
2395       } else {
2396         if(t>=0) {
2397           if(!((i_regs->isconst>>t)&1))
2398             emit_movimm(imm[i],t);
2399         }
2400       }
2401     }
2402   }
2403   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2404     if(rt1[i]) {
2405       signed char sh,sl,th,tl;
2406       th=get_reg(i_regs->regmap,rt1[i]|64);
2407       tl=get_reg(i_regs->regmap,rt1[i]);
2408       sh=get_reg(i_regs->regmap,rs1[i]|64);
2409       sl=get_reg(i_regs->regmap,rs1[i]);
2410       if(tl>=0) {
2411         if(rs1[i]) {
2412           assert(sh>=0);
2413           assert(sl>=0);
2414           if(th>=0) {
2415             emit_addimm64_32(sh,sl,imm[i],th,tl);
2416           }
2417           else {
2418             emit_addimm(sl,imm[i],tl);
2419           }
2420         } else {
2421           emit_movimm(imm[i],tl);
2422           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2423         }
2424       }
2425     }
2426   }
2427   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2428     if(rt1[i]) {
2429       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2430       signed char sh,sl,t;
2431       t=get_reg(i_regs->regmap,rt1[i]);
2432       sh=get_reg(i_regs->regmap,rs1[i]|64);
2433       sl=get_reg(i_regs->regmap,rs1[i]);
2434       //assert(t>=0);
2435       if(t>=0) {
2436         if(rs1[i]>0) {
2437           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2438           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2439             if(opcode[i]==0x0a) { // SLTI
2440               if(sl<0) {
2441                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2442                 emit_slti32(t,imm[i],t);
2443               }else{
2444                 emit_slti32(sl,imm[i],t);
2445               }
2446             }
2447             else { // SLTIU
2448               if(sl<0) {
2449                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2450                 emit_sltiu32(t,imm[i],t);
2451               }else{
2452                 emit_sltiu32(sl,imm[i],t);
2453               }
2454             }
2455           }else{ // 64-bit
2456             assert(sl>=0);
2457             if(opcode[i]==0x0a) // SLTI
2458               emit_slti64_32(sh,sl,imm[i],t);
2459             else // SLTIU
2460               emit_sltiu64_32(sh,sl,imm[i],t);
2461           }
2462         }else{
2463           // SLTI(U) with r0 is just stupid,
2464           // nonetheless examples can be found
2465           if(opcode[i]==0x0a) // SLTI
2466             if(0<imm[i]) emit_movimm(1,t);
2467             else emit_zeroreg(t);
2468           else // SLTIU
2469           {
2470             if(imm[i]) emit_movimm(1,t);
2471             else emit_zeroreg(t);
2472           }
2473         }
2474       }
2475     }
2476   }
2477   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2478     if(rt1[i]) {
2479       signed char sh,sl,th,tl;
2480       th=get_reg(i_regs->regmap,rt1[i]|64);
2481       tl=get_reg(i_regs->regmap,rt1[i]);
2482       sh=get_reg(i_regs->regmap,rs1[i]|64);
2483       sl=get_reg(i_regs->regmap,rs1[i]);
2484       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2485         if(opcode[i]==0x0c) //ANDI
2486         {
2487           if(rs1[i]) {
2488             if(sl<0) {
2489               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2490               emit_andimm(tl,imm[i],tl);
2491             }else{
2492               if(!((i_regs->wasconst>>sl)&1))
2493                 emit_andimm(sl,imm[i],tl);
2494               else
2495                 emit_movimm(constmap[i][sl]&imm[i],tl);
2496             }
2497           }
2498           else
2499             emit_zeroreg(tl);
2500           if(th>=0) emit_zeroreg(th);
2501         }
2502         else
2503         {
2504           if(rs1[i]) {
2505             if(sl<0) {
2506               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2507             }
2508             if(th>=0) {
2509               if(sh<0) {
2510                 emit_loadreg(rs1[i]|64,th);
2511               }else{
2512                 emit_mov(sh,th);
2513               }
2514             }
2515             if(opcode[i]==0x0d) //ORI
2516             if(sl<0) {
2517               emit_orimm(tl,imm[i],tl);
2518             }else{
2519               if(!((i_regs->wasconst>>sl)&1))
2520                 emit_orimm(sl,imm[i],tl);
2521               else
2522                 emit_movimm(constmap[i][sl]|imm[i],tl);
2523             }
2524             if(opcode[i]==0x0e) //XORI
2525             if(sl<0) {
2526               emit_xorimm(tl,imm[i],tl);
2527             }else{
2528               if(!((i_regs->wasconst>>sl)&1))
2529                 emit_xorimm(sl,imm[i],tl);
2530               else
2531                 emit_movimm(constmap[i][sl]^imm[i],tl);
2532             }
2533           }
2534           else {
2535             emit_movimm(imm[i],tl);
2536             if(th>=0) emit_zeroreg(th);
2537           }
2538         }
2539       }
2540     }
2541   }
2542 }
2543
2544 void shiftimm_assemble(int i,struct regstat *i_regs)
2545 {
2546   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2547   {
2548     if(rt1[i]) {
2549       signed char s,t;
2550       t=get_reg(i_regs->regmap,rt1[i]);
2551       s=get_reg(i_regs->regmap,rs1[i]);
2552       //assert(t>=0);
2553       if(t>=0){
2554         if(rs1[i]==0)
2555         {
2556           emit_zeroreg(t);
2557         }
2558         else
2559         {
2560           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2561           if(imm[i]) {
2562             if(opcode2[i]==0) // SLL
2563             {
2564               emit_shlimm(s<0?t:s,imm[i],t);
2565             }
2566             if(opcode2[i]==2) // SRL
2567             {
2568               emit_shrimm(s<0?t:s,imm[i],t);
2569             }
2570             if(opcode2[i]==3) // SRA
2571             {
2572               emit_sarimm(s<0?t:s,imm[i],t);
2573             }
2574           }else{
2575             // Shift by zero
2576             if(s>=0 && s!=t) emit_mov(s,t);
2577           }
2578         }
2579       }
2580       //emit_storereg(rt1[i],t); //DEBUG
2581     }
2582   }
2583   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2584   {
2585     if(rt1[i]) {
2586       signed char sh,sl,th,tl;
2587       th=get_reg(i_regs->regmap,rt1[i]|64);
2588       tl=get_reg(i_regs->regmap,rt1[i]);
2589       sh=get_reg(i_regs->regmap,rs1[i]|64);
2590       sl=get_reg(i_regs->regmap,rs1[i]);
2591       if(tl>=0) {
2592         if(rs1[i]==0)
2593         {
2594           emit_zeroreg(tl);
2595           if(th>=0) emit_zeroreg(th);
2596         }
2597         else
2598         {
2599           assert(sl>=0);
2600           assert(sh>=0);
2601           if(imm[i]) {
2602             if(opcode2[i]==0x38) // DSLL
2603             {
2604               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2605               emit_shlimm(sl,imm[i],tl);
2606             }
2607             if(opcode2[i]==0x3a) // DSRL
2608             {
2609               emit_shrdimm(sl,sh,imm[i],tl);
2610               if(th>=0) emit_shrimm(sh,imm[i],th);
2611             }
2612             if(opcode2[i]==0x3b) // DSRA
2613             {
2614               emit_shrdimm(sl,sh,imm[i],tl);
2615               if(th>=0) emit_sarimm(sh,imm[i],th);
2616             }
2617           }else{
2618             // Shift by zero
2619             if(sl!=tl) emit_mov(sl,tl);
2620             if(th>=0&&sh!=th) emit_mov(sh,th);
2621           }
2622         }
2623       }
2624     }
2625   }
2626   if(opcode2[i]==0x3c) // DSLL32
2627   {
2628     if(rt1[i]) {
2629       signed char sl,tl,th;
2630       tl=get_reg(i_regs->regmap,rt1[i]);
2631       th=get_reg(i_regs->regmap,rt1[i]|64);
2632       sl=get_reg(i_regs->regmap,rs1[i]);
2633       if(th>=0||tl>=0){
2634         assert(tl>=0);
2635         assert(th>=0);
2636         assert(sl>=0);
2637         emit_mov(sl,th);
2638         emit_zeroreg(tl);
2639         if(imm[i]>32)
2640         {
2641           emit_shlimm(th,imm[i]&31,th);
2642         }
2643       }
2644     }
2645   }
2646   if(opcode2[i]==0x3e) // DSRL32
2647   {
2648     if(rt1[i]) {
2649       signed char sh,tl,th;
2650       tl=get_reg(i_regs->regmap,rt1[i]);
2651       th=get_reg(i_regs->regmap,rt1[i]|64);
2652       sh=get_reg(i_regs->regmap,rs1[i]|64);
2653       if(tl>=0){
2654         assert(sh>=0);
2655         emit_mov(sh,tl);
2656         if(th>=0) emit_zeroreg(th);
2657         if(imm[i]>32)
2658         {
2659           emit_shrimm(tl,imm[i]&31,tl);
2660         }
2661       }
2662     }
2663   }
2664   if(opcode2[i]==0x3f) // DSRA32
2665   {
2666     if(rt1[i]) {
2667       signed char sh,tl;
2668       tl=get_reg(i_regs->regmap,rt1[i]);
2669       sh=get_reg(i_regs->regmap,rs1[i]|64);
2670       if(tl>=0){
2671         assert(sh>=0);
2672         emit_mov(sh,tl);
2673         if(imm[i]>32)
2674         {
2675           emit_sarimm(tl,imm[i]&31,tl);
2676         }
2677       }
2678     }
2679   }
2680 }
2681
#ifndef shift_assemble
// Fallback compiled only when shift_assemble is not defined as a macro:
// prints a message saying an implementation is needed and exits with
// status 1.  Both parameters are unused.
void shift_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2689
2690 void load_assemble(int i,struct regstat *i_regs)
2691 {
2692   int s,th,tl,addr,map=-1;
2693   int offset;
2694   int jaddr=0;
2695   int memtarget,c=0;
2696   u_int hr,reglist=0;
2697   th=get_reg(i_regs->regmap,rt1[i]|64);
2698   tl=get_reg(i_regs->regmap,rt1[i]);
2699   s=get_reg(i_regs->regmap,rs1[i]);
2700   offset=imm[i];
2701   for(hr=0;hr<HOST_REGS;hr++) {
2702     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2703   }
2704   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2705   if(s>=0) {
2706     c=(i_regs->wasconst>>s)&1;
2707     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2708     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2709   }
2710   if(offset||s<0||c) addr=tl;
2711   else addr=s;
2712   //printf("load_assemble: c=%d\n",c);
2713   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2714   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2715   if(tl>=0) {
2716     //assert(tl>=0);
2717     //assert(rt1[i]);
2718     reglist&=~(1<<tl);
2719     if(th>=0) reglist&=~(1<<th);
2720     if(!using_tlb) {
2721       if(!c) {
2722 //#define R29_HACK 1
2723         #ifdef R29_HACK
2724         // Strmnnrmn's speed hack
2725         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2726         #endif
2727         {
2728           emit_cmpimm(addr,0x800000);
2729           jaddr=(int)out;
2730           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2731           // Hint to branch predictor that the branch is unlikely to be taken
2732           if(rs1[i]>=28)
2733             emit_jno_unlikely(0);
2734           else
2735           #endif
2736           emit_jno(0);
2737         }
2738       }
2739     }else{ // using tlb
2740       int x=0;
2741       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2742       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2743       map=get_reg(i_regs->regmap,TLREG);
2744       assert(map>=0);
2745       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2746       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2747     }
2748     if (opcode[i]==0x20) { // LB
2749       if(!c||memtarget) {
2750         #ifdef HOST_IMM_ADDR32
2751         if(c)
2752           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2753         else
2754         #endif
2755         {
2756           //emit_xorimm(addr,3,tl);
2757           //gen_tlb_addr_r(tl,map);
2758           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2759           int x=0;
2760           if(!c) emit_xorimm(addr,3,tl);
2761           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2762           emit_movsbl_indexed_tlb(x,tl,map,tl);
2763         }
2764         if(jaddr)
2765           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2766       }
2767       else
2768         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2769     }
2770     if (opcode[i]==0x21) { // LH
2771       if(!c||memtarget) {
2772         #ifdef HOST_IMM_ADDR32
2773         if(c)
2774           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2775         else
2776         #endif
2777         {
2778           int x=0;
2779           if(!c) emit_xorimm(addr,2,tl);
2780           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2781           //#ifdef
2782           //emit_movswl_indexed_tlb(x,tl,map,tl);
2783           //else
2784           if(map>=0) {
2785             gen_tlb_addr_r(tl,map);
2786             emit_movswl_indexed(x,tl,tl);
2787           }else
2788             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2789         }
2790         if(jaddr)
2791           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2792       }
2793       else
2794         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2795     }
2796     if (opcode[i]==0x23) { // LW
2797       if(!c||memtarget) {
2798         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2799         #ifdef HOST_IMM_ADDR32
2800         if(c)
2801           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2802         else
2803         #endif
2804         emit_readword_indexed_tlb(0,addr,map,tl);
2805         if(jaddr)
2806           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2807       }
2808       else
2809         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2810     }
2811     if (opcode[i]==0x24) { // LBU
2812       if(!c||memtarget) {
2813         #ifdef HOST_IMM_ADDR32
2814         if(c)
2815           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2816         else
2817         #endif
2818         {
2819           //emit_xorimm(addr,3,tl);
2820           //gen_tlb_addr_r(tl,map);
2821           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2822           int x=0;
2823           if(!c) emit_xorimm(addr,3,tl);
2824           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2825           emit_movzbl_indexed_tlb(x,tl,map,tl);
2826         }
2827         if(jaddr)
2828           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2829       }
2830       else
2831         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2832     }
2833     if (opcode[i]==0x25) { // LHU
2834       if(!c||memtarget) {
2835         #ifdef HOST_IMM_ADDR32
2836         if(c)
2837           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2838         else
2839         #endif
2840         {
2841           int x=0;
2842           if(!c) emit_xorimm(addr,2,tl);
2843           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2844           //#ifdef
2845           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2846           //#else
2847           if(map>=0) {
2848             gen_tlb_addr_r(tl,map);
2849             emit_movzwl_indexed(x,tl,tl);
2850           }else
2851             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2852           if(jaddr)
2853             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2854         }
2855       }
2856       else
2857         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2858     }
2859     if (opcode[i]==0x27) { // LWU
2860       assert(th>=0);
2861       if(!c||memtarget) {
2862         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2863         #ifdef HOST_IMM_ADDR32
2864         if(c)
2865           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2866         else
2867         #endif
2868         emit_readword_indexed_tlb(0,addr,map,tl);
2869         if(jaddr)
2870           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2871       }
2872       else {
2873         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2874       }
2875       emit_zeroreg(th);
2876     }
2877     if (opcode[i]==0x37) { // LD
2878       if(!c||memtarget) {
2879         //gen_tlb_addr_r(tl,map);
2880         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2881         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2882         #ifdef HOST_IMM_ADDR32
2883         if(c)
2884           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2885         else
2886         #endif
2887         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2888         if(jaddr)
2889           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2890       }
2891       else
2892         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2893     }
2894     //emit_storereg(rt1[i],tl); // DEBUG
2895   }
2896   //if(opcode[i]==0x23)
2897   //if(opcode[i]==0x24)
2898   //if(opcode[i]==0x23||opcode[i]==0x24)
2899   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2900   {
2901     //emit_pusha();
2902     save_regs(0x100f);
2903         emit_readword((int)&last_count,ECX);
2904         #ifdef __i386__
2905         if(get_reg(i_regs->regmap,CCREG)<0)
2906           emit_loadreg(CCREG,HOST_CCREG);
2907         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2908         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2909         emit_writeword(HOST_CCREG,(int)&Count);
2910         #endif
2911         #ifdef __arm__
2912         if(get_reg(i_regs->regmap,CCREG)<0)
2913           emit_loadreg(CCREG,0);
2914         else
2915           emit_mov(HOST_CCREG,0);
2916         emit_add(0,ECX,0);
2917         emit_addimm(0,2*ccadj[i],0);
2918         emit_writeword(0,(int)&Count);
2919         #endif
2920     emit_call((int)memdebug);
2921     //emit_popa();
2922     restore_regs(0x100f);
2923   }/**/
2924 }
2925
2926 #ifndef loadlr_assemble
/* Generic fallback used when the including backend does not define
 * loadlr_assemble: prints a diagnostic and terminates the process with
 * exit status 1.  Both arguments are ignored. */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
2932 #endif
2933
// Emit host code for an aligned store instruction.  opcode[i] selects the
// width: SB (0x28), SH (0x29), SW (0x2B) or SD (0x3F).
//   i      - index of the instruction being assembled
//   i_regs - register state for that instruction (regmap and wasconst are read)
// The store itself is emitted inline unless the base address is a known
// constant that fails the memtarget test.  When a range-check branch was
// emitted (jaddr!=0) a stub of the matching STORE*_STUB type is registered
// with add_stub(); a constant address that is not a memory target goes
// through inline_writestub() instead.  Without the TLB path, a comparison
// against invalid_code is emitted after the store and branches to an
// INVCODE_STUB.
2934 void store_assemble(int i,struct regstat *i_regs)
2935 {
2936   int s,th,tl,map=-1;
2937   int addr,temp;
2938   int offset;
2939   int jaddr=0,jaddr2,type;
2940   int memtarget,c=0;
2941   int agr=AGEN1+(i&1);
2942   u_int hr,reglist=0;
       // th/tl: host registers mapped to the value register rs2 (|64 form and
       // plain form); s: host register mapped to the base register rs1.
2943   th=get_reg(i_regs->regmap,rs2[i]|64);
2944   tl=get_reg(i_regs->regmap,rs2[i]);
2945   s=get_reg(i_regs->regmap,rs1[i]);
       // temp: register mapped to this instruction's address-generation slot,
       // or failing that a register mapped to -1.
2946   temp=get_reg(i_regs->regmap,agr);
2947   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2948   offset=imm[i];
2949   if(s>=0) {
         // c: base register was a known constant.  memtarget: the constant
         // address compares (signed) below 0x80800000, or, with the TLB
         // enabled, at or above 0xC0000000.
         // NOTE(review): memtarget stays unassigned when s<0 - confirm every
         // later read is shielded by !c or by jaddr being set.
2950     c=(i_regs->wasconst>>s)&1;
2951     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2952     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2953   }
2954   assert(tl>=0);
2955   assert(temp>=0);
       // reglist: bitmask of host registers that currently hold a mapping; the
       // cycle-count register is removed when it holds CCREG.
2956   for(hr=0;hr<HOST_REGS;hr++) {
2957     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2958   }
2959   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
       // The base register is used directly as the address only when there is
       // no offset and it is neither unmapped nor constant; otherwise temp is.
2960   if(offset||s<0||c) addr=temp;
2961   else addr=s;
2962   if(!using_tlb) {
2963     if(!c) {
2964       #ifdef R29_HACK
2965       // Strmnnrmn's speed hack
2966       memtarget=1;
2967       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2968       #endif
2969       emit_cmpimm(addr,0x800000);
2970       #ifdef DESTRUCTIVE_SHIFT
2971       if(s==addr) emit_mov(s,temp);
2972       #endif
2973       #ifdef R29_HACK
2974       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2975       #endif
2976       {
             // Remember where the conditional branch is emitted; it is handed
             // to add_stub() below as the stub's branch address.
2977         jaddr=(int)out;
2978         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2979         // Hint to branch predictor that the branch is unlikely to be taken
2980         if(rs1[i]>=28)
2981           emit_jno_unlikely(0);
2982         else
2983         #endif
2984         emit_jno(0);
2985       }
2986     }
2987   }else{ // using tlb
2988     int x=0;
2989     if (opcode[i]==0x28) x=3; // SB
2990     if (opcode[i]==0x29) x=2; // SH
2991     map=get_reg(i_regs->regmap,TLREG);
2992     assert(map>=0);
         // NOTE(review): constmap[i][s] is evaluated here even when s<0 -
         // presumably ignored by the callee when c==0; confirm.
2993     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
2994     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
2995   }
2996
2997   if (opcode[i]==0x28) { // SB
2998     if(!c||memtarget) {
           // For a non-constant address the address is XORed with 3 into temp;
           // for a constant address the same adjustment is folded into x.
           // NOTE(review): presumably a byte-lane swizzle for host byte order - confirm.
2999       int x=0;
3000       if(!c) emit_xorimm(addr,3,temp);
3001       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3002       //gen_tlb_addr_w(temp,map);
3003       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3004       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
3005     }
3006     type=STOREB_STUB;
3007   }
3008   if (opcode[i]==0x29) { // SH
3009     if(!c||memtarget) {
           // Same scheme as SB, but XORing with 2.
3010       int x=0;
3011       if(!c) emit_xorimm(addr,2,temp);
3012       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3013       //#ifdef
3014       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3015       //#else
3016       if(map>=0) {
3017         gen_tlb_addr_w(temp,map);
3018         emit_writehword_indexed(tl,x,temp);
3019       }else
3020         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3021     }
3022     type=STOREH_STUB;
3023   }
3024   if (opcode[i]==0x2B) { // SW
3025     if(!c||memtarget)
3026       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3027       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3028     type=STOREW_STUB;
3029   }
3030   if (opcode[i]==0x3F) { // SD
3031     if(!c||memtarget) {
3032       if(rs2[i]) {
3033         assert(th>=0);
3034         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3035         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3036         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3037       }else{
3038         // Store zero
3039         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3040         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3041         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3042       }
3043     }
3044     type=STORED_STUB;
3045   }
       // NOTE(review): type is only assigned for the four opcodes above.
3046   if(jaddr) {
3047     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3048   } else if(!memtarget) {
3049     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3050   }
3051   if(!using_tlb) {
3052     if(!c||memtarget) {
3053       #ifdef DESTRUCTIVE_SHIFT
3054       // The x86 shift operation is 'destructive'; it overwrites the
3055       // source register, so we need to make a copy first and use that.
3056       addr=temp;
3057       #endif
           // Compare against the invalid_code table (through the INVCP
           // register when HOST_IMM8 is defined) and branch to an
           // INVCODE_STUB when the comparison yields not-equal.
3058       #if defined(HOST_IMM8)
3059       int ir=get_reg(i_regs->regmap,INVCP);
3060       assert(ir>=0);
3061       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3062       #else
3063       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3064       #endif
3065       jaddr2=(int)out;
3066       emit_jne(0);
3067       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3068     }
3069   }
3070   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3071   //if(opcode[i]==0x2B || opcode[i]==0x28)
3072   //if(opcode[i]==0x2B || opcode[i]==0x29)
3073   //if(opcode[i]==0x2B)
3074   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3075   {
3076     //emit_pusha();
3077     save_regs(0x100f);
3078         emit_readword((int)&last_count,ECX);
3079         #ifdef __i386__
3080         if(get_reg(i_regs->regmap,CCREG)<0)
3081           emit_loadreg(CCREG,HOST_CCREG);
3082         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3083         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3084         emit_writeword(HOST_CCREG,(int)&Count);
3085         #endif
3086         #ifdef __arm__
3087         if(get_reg(i_regs->regmap,CCREG)<0)
3088           emit_loadreg(CCREG,0);
3089         else
3090           emit_mov(HOST_CCREG,0);
3091         emit_add(0,ECX,0);
3092         emit_addimm(0,2*ccadj[i],0);
3093         emit_writeword(0,(int)&Count);
3094         #endif
3095     emit_call((int)memdebug);
3096     //emit_popa();
3097     restore_regs(0x100f);
3098   }/**/
3099 }
3100
// Emit host code for an unaligned store instruction.  opcode[i] selects
// SWL (0x2A), SWR (0x2E), SDL (0x2C) or SDR (0x2D).
//   i      - index of the instruction being assembled
//   i_regs - register state for that instruction (regmap and isconst are read)
// The generated code tests the two low bits of the address held in temp and
// branches to one of four sequences (labelled 0..3 below), each writing the
// appropriate bytes of the source register.  For SDL/SDR an extra test of
// address bit 2 decides whether a second word (temp2) is written as well.
// Without the TLB path the routine finishes with an invalid_code comparison
// that branches to an INVCODE_STUB.
3101 void storelr_assemble(int i,struct regstat *i_regs)
3102 {
3103   int s,th,tl;
3104   int temp;
3105   int temp2;
3106   int offset;
3107   int jaddr=0,jaddr2;
3108   int case1,case2,case3;
3109   int done0,done1,done2;
3110   int memtarget,c=0;
3111   u_int hr,reglist=0;
3112   th=get_reg(i_regs->regmap,rs2[i]|64);
3113   tl=get_reg(i_regs->regmap,rs2[i]);
3114   s=get_reg(i_regs->regmap,rs1[i]);
3115   temp=get_reg(i_regs->regmap,-1);
3116   offset=imm[i];
3117   if(s>=0) {
         // NOTE(review): this reads isconst, whereas store_assemble and
         // c1ls_assemble read wasconst - confirm the difference is intended.
3118     c=(i_regs->isconst>>s)&1;
3119     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3120     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3121   }
3122   assert(tl>=0);
       // reglist: bitmask of host registers that currently hold a mapping.
3123   for(hr=0;hr<HOST_REGS;hr++) {
3124     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3125   }
3126   if(tl>=0) {
3127     assert(temp>=0);
3128     if(!using_tlb) {
3129       if(!c) {
             // Unknown address: compare it, copy it into temp if it is still
             // in s, and record the branch that add_stub() receives below.
3130         emit_cmpimm(s<0||offset?temp:s,0x800000);
3131         if(!offset&&s!=temp) emit_mov(s,temp);
3132         jaddr=(int)out;
3133         emit_jno(0);
3134       }
3135       else
3136       {
             // Constant address: jump unconditionally to the stub when it is
             // not a memory target or the base register is r0.
3137         if(!memtarget||!rs1[i]) {
3138           jaddr=(int)out;
3139           emit_jmp(0);
3140         }
3141       }
           // Rebase temp by (rdram-0x80000000); the inverse adjustment is
           // applied before the invalid_code check at the end.
3142       if((u_int)rdram!=0x80000000) 
3143         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3144     }else{ // using tlb
3145       int map=get_reg(i_regs->regmap,TLREG);
3146       assert(map>=0);
3147       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3148       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3149       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3150       if(!jaddr&&!memtarget) {
3151         jaddr=(int)out;
3152         emit_jmp(0);
3153       }
3154       gen_tlb_addr_w(temp,map);
3155     }
3156
3157     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
           // temp2 receives the second word for the doubleword forms.  When
           // the source is r0, tl is reused for th and temp2.
3158       temp2=get_reg(i_regs->regmap,FTEMP);
3159       if(!rs2[i]) temp2=th=tl;
3160     }
3161
         // Dispatch on the low two address bits: bit 1 set -> case 2/3,
         // otherwise bit 0 set -> case 1, otherwise case 0 falls through.
3162     emit_testimm(temp,2);
3163     case2=(int)out;
3164     emit_jne(0);
3165     emit_testimm(temp,1);
3166     case1=(int)out;
3167     emit_jne(0);
3168     // 0
3169     if (opcode[i]==0x2A) { // SWL
3170       emit_writeword_indexed(tl,0,temp);
3171     }
3172     if (opcode[i]==0x2E) { // SWR
3173       emit_writebyte_indexed(tl,3,temp);
3174     }
3175     if (opcode[i]==0x2C) { // SDL
3176       emit_writeword_indexed(th,0,temp);
3177       if(rs2[i]) emit_mov(tl,temp2);
3178     }
3179     if (opcode[i]==0x2D) { // SDR
3180       emit_writebyte_indexed(tl,3,temp);
3181       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3182     }
3183     done0=(int)out;
3184     emit_jmp(0);
3185     // 1
3186     set_jump_target(case1,(int)out);
3187     if (opcode[i]==0x2A) { // SWL
3188       // Write 3 msb into three least significant bytes
           // The rotations total 32 bits, so tl ends up with its original value.
3189       if(rs2[i]) emit_rorimm(tl,8,tl);
3190       emit_writehword_indexed(tl,-1,temp);
3191       if(rs2[i]) emit_rorimm(tl,16,tl);
3192       emit_writebyte_indexed(tl,1,temp);
3193       if(rs2[i]) emit_rorimm(tl,8,tl);
3194     }
3195     if (opcode[i]==0x2E) { // SWR
3196       // Write two lsb into two most significant bytes
3197       emit_writehword_indexed(tl,1,temp);
3198     }
3199     if (opcode[i]==0x2C) { // SDL
3200       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3201       // Write 3 msb into three least significant bytes
3202       if(rs2[i]) emit_rorimm(th,8,th);
3203       emit_writehword_indexed(th,-1,temp);
3204       if(rs2[i]) emit_rorimm(th,16,th);
3205       emit_writebyte_indexed(th,1,temp);
3206       if(rs2[i]) emit_rorimm(th,8,th);
3207     }
3208     if (opcode[i]==0x2D) { // SDR
3209       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3210       // Write two lsb into two most significant bytes
3211       emit_writehword_indexed(tl,1,temp);
3212     }
3213     done1=(int)out;
3214     emit_jmp(0);
3215     // 2
3216     set_jump_target(case2,(int)out);
3217     emit_testimm(temp,1);
3218     case3=(int)out;
3219     emit_jne(0);
3220     if (opcode[i]==0x2A) { // SWL
3221       // Write two msb into two least significant bytes
3222       if(rs2[i]) emit_rorimm(tl,16,tl);
3223       emit_writehword_indexed(tl,-2,temp);
3224       if(rs2[i]) emit_rorimm(tl,16,tl);
3225     }
3226     if (opcode[i]==0x2E) { // SWR
3227       // Write 3 lsb into three most significant bytes
3228       emit_writebyte_indexed(tl,-1,temp);
3229       if(rs2[i]) emit_rorimm(tl,8,tl);
3230       emit_writehword_indexed(tl,0,temp);
3231       if(rs2[i]) emit_rorimm(tl,24,tl);
3232     }
3233     if (opcode[i]==0x2C) { // SDL
3234       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3235       // Write two msb into two least significant bytes
3236       if(rs2[i]) emit_rorimm(th,16,th);
3237       emit_writehword_indexed(th,-2,temp);
3238       if(rs2[i]) emit_rorimm(th,16,th);
3239     }
3240     if (opcode[i]==0x2D) { // SDR
3241       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3242       // Write 3 lsb into three most significant bytes
3243       emit_writebyte_indexed(tl,-1,temp);
3244       if(rs2[i]) emit_rorimm(tl,8,tl);
3245       emit_writehword_indexed(tl,0,temp);
3246       if(rs2[i]) emit_rorimm(tl,24,tl);
3247     }
3248     done2=(int)out;
3249     emit_jmp(0);
3250     // 3
3251     set_jump_target(case3,(int)out);
3252     if (opcode[i]==0x2A) { // SWL
3253       // Write msb into least significant byte
3254       if(rs2[i]) emit_rorimm(tl,24,tl);
3255       emit_writebyte_indexed(tl,-3,temp);
3256       if(rs2[i]) emit_rorimm(tl,8,tl);
3257     }
3258     if (opcode[i]==0x2E) { // SWR
3259       // Write entire word
3260       emit_writeword_indexed(tl,-3,temp);
3261     }
3262     if (opcode[i]==0x2C) { // SDL
3263       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3264       // Write msb into least significant byte
3265       if(rs2[i]) emit_rorimm(th,24,th);
3266       emit_writebyte_indexed(th,-3,temp);
3267       if(rs2[i]) emit_rorimm(th,8,th);
3268     }
3269     if (opcode[i]==0x2D) { // SDR
3270       if(rs2[i]) emit_mov(th,temp2);
3271       // Write entire word
3272       emit_writeword_indexed(tl,-3,temp);
3273     }
         // Cases 0-2 jump here; case 3 falls through.
3274     set_jump_target(done0,(int)out);
3275     set_jump_target(done1,(int)out);
3276     set_jump_target(done2,(int)out);
3277     if (opcode[i]==0x2C) { // SDL
           // When address bit 2 is clear, also write temp2 to the following
           // word (offset +4 from the word-aligned address).
3278       emit_testimm(temp,4);
3279       done0=(int)out;
3280       emit_jne(0);
3281       emit_andimm(temp,~3,temp);
3282       emit_writeword_indexed(temp2,4,temp);
3283       set_jump_target(done0,(int)out);
3284     }
3285     if (opcode[i]==0x2D) { // SDR
           // When address bit 2 is set, also write temp2 to the preceding
           // word (offset -4 from the word-aligned address).
3286       emit_testimm(temp,4);
3287       done0=(int)out;
3288       emit_jeq(0);
3289       emit_andimm(temp,~3,temp);
3290       emit_writeword_indexed(temp2,-4,temp);
3291       set_jump_target(done0,(int)out);
3292     }
3293     if(!c||!memtarget)
3294       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3295   }
       // NOTE(review): this block sits outside the tl>=0 guard above and uses
       // temp unconditionally.
3296   if(!using_tlb) {
         // Undo the rdram rebasing applied to temp, then emit the invalid_code
         // comparison and the branch to an INVCODE_STUB.
3297     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3298     #if defined(HOST_IMM8)
3299     int ir=get_reg(i_regs->regmap,INVCP);
3300     assert(ir>=0);
3301     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3302     #else
3303     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3304     #endif
3305     jaddr2=(int)out;
3306     emit_jne(0);
3307     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3308   }
3309   /*
3310     emit_pusha();
3311     //save_regs(0x100f);
3312         emit_readword((int)&last_count,ECX);
3313         if(get_reg(i_regs->regmap,CCREG)<0)
3314           emit_loadreg(CCREG,HOST_CCREG);
3315         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3316         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3317         emit_writeword(HOST_CCREG,(int)&Count);
3318     emit_call((int)memdebug);
3319     emit_popa();
3320     //restore_regs(0x100f);
3321   /**/
3322 }
3323
// Emit host code for coprocessor-1 loads and stores.  opcode[i] selects
// LWC1 (0x31), LDC1 (0x35), SWC1 (0x39) or SDC1 (0x3D).
//   i      - index of the instruction being assembled
//   i_regs - register state for that instruction (regmap and wasconst are read)
// The FPU register is reached through a pointer read from reg_cop1_simple[]
// or reg_cop1_double[], indexed by bits 16..20 of source[i].  Stores first
// fetch the value through that pointer into tl (and th), then write it to
// memory; loads read memory into tl (and th), then write it through the
// pointer held in temp.  If cop1_usable is not yet set, a usability check
// branching to an FP_STUB is emitted first.
3324 void c1ls_assemble(int i,struct regstat *i_regs)
3325 {
3326   int s,th,tl;
3327   int temp,ar;
3328   int map=-1;
3329   int offset;
3330   int c=0;
3331   int jaddr,jaddr2=0,jaddr3,type;
3332   int agr=AGEN1+(i&1);
3333   u_int hr,reglist=0;
       // th/tl: host registers mapped to FTEMP (|64 form and plain form);
       // s: host register mapped to the base register rs1.
3334   th=get_reg(i_regs->regmap,FTEMP|64);
3335   tl=get_reg(i_regs->regmap,FTEMP);
3336   s=get_reg(i_regs->regmap,rs1[i]);
3337   temp=get_reg(i_regs->regmap,agr);
3338   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3339   offset=imm[i];
3340   assert(tl>=0);
3341   assert(rs1[i]>0);
3342   assert(temp>=0);
       // reglist: bitmask of host registers that currently hold a mapping; the
       // cycle-count register is removed when it holds CCREG.
3343   for(hr=0;hr<HOST_REGS;hr++) {
3344     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3345   }
3346   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3347   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3348   {
3349     // Loads use a temporary register which we need to save
3350     reglist|=1<<temp;
3351   }
       // ar: register that carries the generated address - temp for stores,
       // tl for loads.
3352   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3353     ar=temp;
3354   else // LWC1/LDC1
3355     ar=tl;
3356   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3357   //else c=(i_regs->wasconst>>s)&1;
3358   if(s>=0) c=(i_regs->wasconst>>s)&1;
3359   // Check cop1 unusable
3360   if(!cop1_usable) {
         // Test bit 0x20000000 of the register mapped to CSREG and branch to
         // an FP_STUB when it is clear.  cop1_usable is then set so this check
         // is not emitted again while the flag stays set.
3361     signed char rs=get_reg(i_regs->regmap,CSREG);
3362     assert(rs>=0);
3363     emit_testimm(rs,0x20000000);
3364     jaddr=(int)out;
3365     emit_jeq(0);
3366     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3367     cop1_usable=1;
3368   }
3369   if (opcode[i]==0x39) { // SWC1 (get float address)
3370     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3371   }
3372   if (opcode[i]==0x3D) { // SDC1 (get double address)
3373     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3374   }
3375   // Generate address + offset
3376   if(!using_tlb) {
3377     if(!c)
3378       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3379   }
3380   else
3381   {
3382     map=get_reg(i_regs->regmap,TLREG);
3383     assert(map>=0);
3384     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3385       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3386     }
3387     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3388       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3389     }
3390   }
3391   if (opcode[i]==0x39) { // SWC1 (read float)
3392     emit_readword_indexed(0,tl,tl);
3393   }
3394   if (opcode[i]==0x3D) { // SDC1 (read double)
         // th is read first because the second read overwrites the pointer in tl.
3395     emit_readword_indexed(4,tl,th);
3396     emit_readword_indexed(0,tl,tl);
3397   }
3398   if (opcode[i]==0x31) { // LWC1 (get target address)
3399     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3400   }
3401   if (opcode[i]==0x35) { // LDC1 (get target address)
3402     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3403   }
3404   if(!using_tlb) {
3405     if(!c) {
           // Conditional branch belonging to the emit_cmpimm above; jaddr2 is
           // handed to add_stub() further down.
3406       jaddr2=(int)out;
3407       emit_jno(0);
3408     }
3409     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3410       jaddr2=(int)out;
3411       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3412     }
3413     #ifdef DESTRUCTIVE_SHIFT
3414     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3415       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3416     }
3417     #endif
3418   }else{
3419     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3420       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3421     }
3422     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3423       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3424     }
3425   }
3426   if (opcode[i]==0x31) { // LWC1
3427     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3428     //gen_tlb_addr_r(ar,map);
3429     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3430     #ifdef HOST_IMM_ADDR32
3431     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3432     else
3433     #endif
3434     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3435     type=LOADW_STUB;
3436   }
3437   if (opcode[i]==0x35) { // LDC1
3438     assert(th>=0);
3439     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3440     //gen_tlb_addr_r(ar,map);
3441     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3442     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3443     #ifdef HOST_IMM_ADDR32
3444     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3445     else
3446     #endif
3447     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3448     type=LOADD_STUB;
3449   }
3450   if (opcode[i]==0x39) { // SWC1
3451     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3452     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3453     type=STOREW_STUB;
3454   }
3455   if (opcode[i]==0x3D) { // SDC1
3456     assert(th>=0);
3457     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3458     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3459     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3460     type=STORED_STUB;
3461   }
3462   if(!using_tlb) {
3463     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
           // Stores only: emit the invalid_code comparison and the branch to
           // an INVCODE_STUB.
3464       #ifndef DESTRUCTIVE_SHIFT
3465       temp=offset||c||s<0?ar:s;
3466       #endif
3467       #if defined(HOST_IMM8)
3468       int ir=get_reg(i_regs->regmap,INVCP);
3469       assert(ir>=0);
3470       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3471       #else
3472       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3473       #endif
3474       jaddr3=(int)out;
3475       emit_jne(0);
3476       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3477     }
3478   }
       // Register the memory-access stub for the branch recorded in jaddr2, if any.
3479   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3480   if (opcode[i]==0x31) { // LWC1 (write float)
3481     emit_writeword_indexed(tl,0,temp);
3482   }
3483   if (opcode[i]==0x35) { // LDC1 (write double)
3484     emit_writeword_indexed(th,4,temp);
3485     emit_writeword_indexed(tl,0,temp);
3486   }
3487   //if(opcode[i]==0x39)
3488   /*if(opcode[i]==0x39||opcode[i]==0x31)
3489   {
3490     emit_pusha();
3491         emit_readword((int)&last_count,ECX);
3492         if(get_reg(i_regs->regmap,CCREG)<0)
3493           emit_loadreg(CCREG,HOST_CCREG);
3494         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3495         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3496         emit_writeword(HOST_CCREG,(int)&Count);
3497     emit_call((int)memdebug);
3498     emit_popa();
3499   }/**/
3500 }
3501
3502 #ifndef multdiv_assemble
/* Generic fallback used when the including backend does not define
 * multdiv_assemble: prints a diagnostic and terminates the process with
 * exit status 1.  Both arguments are ignored. */
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
3508 #endif
3509
3510 void mov_assemble(int i,struct regstat *i_regs)
3511 {
3512   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3513   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3514   assert(rt1[i]>0);
3515   if(rt1[i]) {
3516     signed char sh,sl,th,tl;
3517     th=get_reg(i_regs->regmap,rt1[i]|64);
3518     tl=get_reg(i_regs->regmap,rt1[i]);
3519     //assert(tl>=0);
3520     if(tl>=0) {
3521       sh=get_reg(i_regs->regmap,rs1[i]|64);
3522       sl=get_reg(i_regs->regmap,rs1[i]);
3523       if(sl>=0) emit_mov(sl,tl);
3524       else emit_loadreg(rs1[i],tl);
3525       if(th>=0) {
3526         if(sh>=0) emit_mov(sh,th);
3527         else emit_loadreg(rs1[i]|64,th);
3528       }
3529     }
3530   }
3531 }
3532
3533 #ifndef fconv_assemble
/* Generic fallback used when the including backend does not define
 * fconv_assemble: prints a diagnostic and terminates the process with
 * exit status 1.  Both arguments are ignored. */
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
3539 #endif
3540
// Compiled out by the #if 0 below: a placeholder float_assemble that would
// print a diagnostic and exit(1).  ds_assemble still calls float_assemble,
// so the real definition presumably comes from elsewhere - confirm.
3541 #if 0
3542 void float_assemble(int i,struct regstat *i_regs)
3543 {
3544   printf("Need float_assemble for this architecture.\n");
3545   exit(1);
3546 }
3547 #endif
3548
3549 void syscall_assemble(int i,struct regstat *i_regs)
3550 {
3551   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3552   assert(ccreg==HOST_CCREG);
3553   assert(!is_delayslot);
3554   emit_movimm(start+i*4,EAX); // Get PC
3555   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3556   emit_jmp((int)jump_syscall);
3557 }
3558
3559 void ds_assemble(int i,struct regstat *i_regs)
3560 {
3561   is_delayslot=1;
3562   switch(itype[i]) {
3563     case ALU:
3564       alu_assemble(i,i_regs);break;
3565     case IMM16:
3566       imm16_assemble(i,i_regs);break;
3567     case SHIFT:
3568       shift_assemble(i,i_regs);break;
3569     case SHIFTIMM:
3570       shiftimm_assemble(i,i_regs);break;
3571     case LOAD:
3572       load_assemble(i,i_regs);break;
3573     case LOADLR:
3574       loadlr_assemble(i,i_regs);break;
3575     case STORE:
3576       store_assemble(i,i_regs);break;
3577     case STORELR:
3578       storelr_assemble(i,i_regs);break;
3579     case COP0:
3580       cop0_assemble(i,i_regs);break;
3581     case COP1:
3582       cop1_assemble(i,i_regs);break;
3583     case C1LS:
3584       c1ls_assemble(i,i_regs);break;
3585     case FCONV:
3586       fconv_assemble(i,i_regs);break;
3587     case FLOAT:
3588       float_assemble(i,i_regs);break;
3589     case FCOMP:
3590       fcomp_assemble(i,i_regs);break;
3591     case MULTDIV:
3592       multdiv_assemble(i,i_regs);break;
3593     case MOV:
3594       mov_assemble(i,i_regs);break;
3595     case SYSCALL:
3596     case SPAN:
3597     case UJUMP:
3598     case RJUMP:
3599     case CJUMP:
3600     case SJUMP:
3601     case FJUMP:
3602       printf("Jump in the delay slot.  This is probably a bug.\n");
3603   }
3604   is_delayslot=0;
3605 }
3606
3607 // Is the branch target a valid internal jump?
3608 int internal_branch(uint64_t i_is32,int addr)
3609 {
3610   if(addr&1) return 0; // Indirect (register) jump
3611   if(addr>=start && addr<start+slen*4-4)
3612   {
3613     int t=(addr-start)>>2;
3614     // Delay slots are not valid branch targets
3615     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3616     // 64 -> 32 bit transition requires a recompile
3617     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3618     {
3619       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3620       else printf("optimizable: yes\n");
3621     }*/
3622     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3623     if(requires_32bit[t]&~i_is32) return 0;
3624     else return 1;
3625   }
3626   return 0;
3627 }
3628
3629 #ifndef wb_invalidate
3630 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3631   uint64_t u,uint64_t uu)
3632 {
3633   int hr;
3634   for(hr=0;hr<HOST_REGS;hr++) {
3635     if(hr!=EXCLUDE_REG) {
3636       if(pre[hr]!=entry[hr]) {
3637         if(pre[hr]>=0) {
3638           if((dirty>>hr)&1) {
3639             if(get_reg(entry,pre[hr])<0) {
3640               if(pre[hr]<64) {
3641                 if(!((u>>pre[hr])&1)) {
3642                   emit_storereg(pre[hr],hr);
3643                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3644                     emit_sarimm(hr,31,hr);
3645                     emit_storereg(pre[hr]|64,hr);
3646                   }
3647                 }
3648               }else{
3649                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3650                   emit_storereg(pre[hr],hr);
3651                 }
3652               }
3653             }
3654           }
3655         }
3656       }
3657     }
3658   }
3659   // Move from one register to another (no writeback)
3660   for(hr=0;hr<HOST_REGS;hr++) {
3661     if(hr!=EXCLUDE_REG) {
3662       if(pre[hr]!=entry[hr]) {
3663         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3664           int nr;
3665           if((nr=get_reg(entry,pre[hr]))>=0) {
3666             emit_mov(hr,nr);
3667           }
3668         }
3669       }
3670     }
3671   }
3672 }
3673 #endif
3674
3675 // Load the specified registers
3676 // This only loads the registers given as arguments because
3677 // we don't want to load things that will be overwritten
3678 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3679 {
3680   int hr;
3681   // Load 32-bit regs
3682   for(hr=0;hr<HOST_REGS;hr++) {
3683     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3684       if(entry[hr]!=regmap[hr]) {
3685         if(regmap[hr]==rs1||regmap[hr]==rs2)
3686         {
3687           if(regmap[hr]==0) {
3688             emit_zeroreg(hr);
3689           }
3690           else
3691           {
3692             emit_loadreg(regmap[hr],hr);
3693           }
3694         }
3695       }
3696     }
3697   }
3698   //Load 64-bit regs
3699   for(hr=0;hr<HOST_REGS;hr++) {
3700     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3701       if(entry[hr]!=regmap[hr]) {
3702         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3703         {
3704           assert(regmap[hr]!=64);
3705           if((is32>>(regmap[hr]&63))&1) {
3706             int lr=get_reg(regmap,regmap[hr]-64);
3707             if(lr>=0)
3708               emit_sarimm(lr,31,hr);
3709             else
3710               emit_loadreg(regmap[hr],hr);
3711           }
3712           else
3713           {
3714             emit_loadreg(regmap[hr],hr);
3715           }
3716         }
3717       }
3718     }
3719   }
3720 }
3721
3722 // Load registers prior to the start of a loop
3723 // so that they are not loaded within the loop
3724 static void loop_preload(signed char pre[],signed char entry[])
3725 {
3726   int hr;
3727   for(hr=0;hr<HOST_REGS;hr++) {
3728     if(hr!=EXCLUDE_REG) {
3729       if(pre[hr]!=entry[hr]) {
3730         if(entry[hr]>=0) {
3731           if(get_reg(pre,entry[hr])<0) {
3732             assem_debug("loop preload:\n");
3733             //printf("loop preload: %d\n",hr);
3734             if(entry[hr]==0) {
3735               emit_zeroreg(hr);
3736             }
3737             else if(entry[hr]<TEMPREG)
3738             {
3739               emit_loadreg(entry[hr],hr);
3740             }
3741             else if(entry[hr]-64<TEMPREG)
3742             {
3743               emit_loadreg(entry[hr],hr);
3744             }
3745           }
3746         }
3747       }
3748     }
3749   }
3750 }
3751
3752 // Generate address for load/store instruction
3753 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3754 {
3755   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3756     int ra;
3757     int agr=AGEN1+(i&1);
3758     int mgr=MGEN1+(i&1);
3759     if(itype[i]==LOAD) {
3760       ra=get_reg(i_regs->regmap,rt1[i]);
3761       //if(rt1[i]) assert(ra>=0);
3762     }
3763     if(itype[i]==LOADLR) {
3764       ra=get_reg(i_regs->regmap,FTEMP);
3765     }
3766     if(itype[i]==STORE||itype[i]==STORELR) {
3767       ra=get_reg(i_regs->regmap,agr);
3768       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3769     }
3770     if(itype[i]==C1LS) {
3771       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3772         ra=get_reg(i_regs->regmap,FTEMP);
3773       else { // SWC1/SDC1
3774         ra=get_reg(i_regs->regmap,agr);
3775         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3776       }
3777     }
3778     int rs=get_reg(i_regs->regmap,rs1[i]);
3779     int rm=get_reg(i_regs->regmap,TLREG);
3780     if(ra>=0) {
3781       int offset=imm[i];
3782       int c=(i_regs->wasconst>>rs)&1;
3783       if(rs1[i]==0) {
3784         // Using r0 as a base address
3785         /*if(rm>=0) {
3786           if(!entry||entry[rm]!=mgr) {
3787             generate_map_const(offset,rm);
3788           } // else did it in the previous cycle
3789         }*/
3790         if(!entry||entry[ra]!=agr) {
3791           if (opcode[i]==0x22||opcode[i]==0x26) {
3792             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3793           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3794             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3795           }else{
3796             emit_movimm(offset,ra);
3797           }
3798         } // else did it in the previous cycle
3799       }
3800       else if(rs<0) {
3801         if(!entry||entry[ra]!=rs1[i])
3802           emit_loadreg(rs1[i],ra);
3803         //if(!entry||entry[ra]!=rs1[i])
3804         //  printf("poor load scheduling!\n");
3805       }
3806       else if(c) {
3807         if(rm>=0) {
3808           if(!entry||entry[rm]!=mgr) {
3809             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3810               // Stores to memory go thru the mapper to detect self-modifying
3811               // code, loads don't.
3812               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3813                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3814                 generate_map_const(constmap[i][rs]+offset,rm);
3815             }else{
3816               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3817                 generate_map_const(constmap[i][rs]+offset,rm);
3818             }
3819           }
3820         }
3821         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3822           if(!entry||entry[ra]!=agr) {
3823             if (opcode[i]==0x22||opcode[i]==0x26) {
3824               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3825             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3826               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3827             }else{
3828               #ifdef HOST_IMM_ADDR32
3829               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3830                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3831               #endif
3832               emit_movimm(constmap[i][rs]+offset,ra);
3833             }
3834           } // else did it in the previous cycle
3835         } // else load_consts already did it
3836       }
3837       if(offset&&!c&&rs1[i]) {
3838         if(rs>=0) {
3839           emit_addimm(rs,offset,ra);
3840         }else{
3841           emit_addimm(ra,offset,ra);
3842         }
3843       }
3844     }
3845   }
3846   // Preload constants for next instruction
3847   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3848     int agr,ra;
3849     #ifndef HOST_IMM_ADDR32
3850     // Mapper entry
3851     agr=MGEN1+((i+1)&1);
3852     ra=get_reg(i_regs->regmap,agr);
3853     if(ra>=0) {
3854       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3855       int offset=imm[i+1];
3856       int c=(regs[i+1].wasconst>>rs)&1;
3857       if(c) {
3858         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3859           // Stores to memory go thru the mapper to detect self-modifying
3860           // code, loads don't.
3861           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3862              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3863             generate_map_const(constmap[i+1][rs]+offset,ra);
3864         }else{
3865           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3866             generate_map_const(constmap[i+1][rs]+offset,ra);
3867         }
3868       }
3869       /*else if(rs1[i]==0) {
3870         generate_map_const(offset,ra);
3871       }*/
3872     }
3873     #endif
3874     // Actual address
3875     agr=AGEN1+((i+1)&1);
3876     ra=get_reg(i_regs->regmap,agr);
3877     if(ra>=0) {
3878       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3879       int offset=imm[i+1];
3880       int c=(regs[i+1].wasconst>>rs)&1;
3881       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3882         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3883           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3884         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3885           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3886         }else{
3887           #ifdef HOST_IMM_ADDR32
3888           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3889              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3890           #endif
3891           emit_movimm(constmap[i+1][rs]+offset,ra);
3892         }
3893       }
3894       else if(rs1[i+1]==0) {
3895         // Using r0 as a base address
3896         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3897           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3898         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3899           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3900         }else{
3901           emit_movimm(offset,ra);
3902         }
3903       }
3904     }
3905   }
3906 }
3907
3908 int get_final_value(int hr, int i, int *value)
3909 {
3910   int reg=regs[i].regmap[hr];
3911   while(i<slen-1) {
3912     if(regs[i+1].regmap[hr]!=reg) break;
3913     if(!((regs[i+1].isconst>>hr)&1)) break;
3914     if(bt[i+1]) break;
3915     i++;
3916   }
3917   if(i<slen-1) {
3918     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3919       *value=constmap[i][hr];
3920       return 1;
3921     }
3922     if(!bt[i+1]) {
3923       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3924         // Load in delay slot, out-of-order execution
3925         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3926         {
3927           #ifdef HOST_IMM_ADDR32
3928           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3929           #endif
3930           // Precompute load address
3931           *value=constmap[i][hr]+imm[i+2];
3932           return 1;
3933         }
3934       }
3935       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3936       {
3937         #ifdef HOST_IMM_ADDR32
3938         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3939         #endif
3940         // Precompute load address
3941         *value=constmap[i][hr]+imm[i+1];
3942         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3943         return 1;
3944       }
3945     }
3946   }
3947   *value=constmap[i][hr];
3948   //printf("c=%x\n",(int)constmap[i][hr]);
3949   if(i==slen-1) return 1;
3950   if(reg<64) {
3951     return !((unneeded_reg[i+1]>>reg)&1);
3952   }else{
3953     return !((unneeded_reg_upper[i+1]>>reg)&1);
3954   }
3955 }
3956
3957 // Load registers with known constants
3958 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3959 {
3960   int hr;
3961   // Load 32-bit regs
3962   for(hr=0;hr<HOST_REGS;hr++) {
3963     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3964       //if(entry[hr]!=regmap[hr]) {
3965       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3966         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3967           int value;
3968           if(get_final_value(hr,i,&value)) {
3969             if(value==0) {
3970               emit_zeroreg(hr);
3971             }
3972             else {
3973               emit_movimm(value,hr);
3974             }
3975           }
3976         }
3977       }
3978     }
3979   }
3980   // Load 64-bit regs
3981   for(hr=0;hr<HOST_REGS;hr++) {
3982     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3983       //if(entry[hr]!=regmap[hr]) {
3984       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3985         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3986           if((is32>>(regmap[hr]&63))&1) {
3987             int lr=get_reg(regmap,regmap[hr]-64);
3988             assert(lr>=0);
3989             emit_sarimm(lr,31,hr);
3990           }
3991           else
3992           {
3993             int value;
3994             if(get_final_value(hr,i,&value)) {
3995               if(value==0) {
3996                 emit_zeroreg(hr);
3997               }
3998               else {
3999                 emit_movimm(value,hr);
4000               }
4001             }
4002           }
4003         }
4004       }
4005     }
4006   }
4007 }
4008 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4009 {
4010   int hr;
4011   // Load 32-bit regs
4012   for(hr=0;hr<HOST_REGS;hr++) {
4013     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4014       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4015         int value=constmap[i][hr];
4016         if(value==0) {
4017           emit_zeroreg(hr);
4018         }
4019         else {
4020           emit_movimm(value,hr);
4021         }
4022       }
4023     }
4024   }
4025   // Load 64-bit regs
4026   for(hr=0;hr<HOST_REGS;hr++) {
4027     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4028       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4029         if((is32>>(regmap[hr]&63))&1) {
4030           int lr=get_reg(regmap,regmap[hr]-64);
4031           assert(lr>=0);
4032           emit_sarimm(lr,31,hr);
4033         }
4034         else
4035         {
4036           int value=constmap[i][hr];
4037           if(value==0) {
4038             emit_zeroreg(hr);
4039           }
4040           else {
4041             emit_movimm(value,hr);
4042           }
4043         }
4044       }
4045     }
4046   }
4047 }
4048
4049 // Write out all dirty registers (except cycle count)
4050 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4051 {
4052   int hr;
4053   for(hr=0;hr<HOST_REGS;hr++) {
4054     if(hr!=EXCLUDE_REG) {
4055       if(i_regmap[hr]>0) {
4056         if(i_regmap[hr]!=CCREG) {
4057           if((i_dirty>>hr)&1) {
4058             if(i_regmap[hr]<64) {
4059               emit_storereg(i_regmap[hr],hr);
4060               if( ((i_is32>>i_regmap[hr])&1) ) {
4061                 #ifdef DESTRUCTIVE_WRITEBACK
4062                 emit_sarimm(hr,31,hr);
4063                 emit_storereg(i_regmap[hr]|64,hr);
4064                 #else
4065                 emit_sarimm(hr,31,HOST_TEMPREG);
4066                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4067                 #endif
4068               }
4069             }else{
4070               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4071                 emit_storereg(i_regmap[hr],hr);
4072               }
4073             }
4074           }
4075         }
4076       }
4077     }
4078   }
4079 }
4080 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4081 // This writes the registers not written by store_regs_bt
4082 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4083 {
4084   int hr;
4085   int t=(addr-start)>>2;
4086   for(hr=0;hr<HOST_REGS;hr++) {
4087     if(hr!=EXCLUDE_REG) {
4088       if(i_regmap[hr]>0) {
4089         if(i_regmap[hr]!=CCREG) {
4090           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4091             if((i_dirty>>hr)&1) {
4092               if(i_regmap[hr]<64) {
4093                 emit_storereg(i_regmap[hr],hr);
4094                 if( ((i_is32>>i_regmap[hr])&1) ) {
4095                   #ifdef DESTRUCTIVE_WRITEBACK
4096                   emit_sarimm(hr,31,hr);
4097                   emit_storereg(i_regmap[hr]|64,hr);
4098                   #else
4099                   emit_sarimm(hr,31,HOST_TEMPREG);
4100                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4101                   #endif
4102                 }
4103               }else{
4104                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4105                   emit_storereg(i_regmap[hr],hr);
4106                 }
4107               }
4108             }
4109           }
4110         }
4111       }
4112     }
4113   }
4114 }
4115
4116 // Load all registers (except cycle count)
4117 void load_all_regs(signed char i_regmap[])
4118 {
4119   int hr;
4120   for(hr=0;hr<HOST_REGS;hr++) {
4121     if(hr!=EXCLUDE_REG) {
4122       if(i_regmap[hr]==0) {
4123         emit_zeroreg(hr);
4124       }
4125       else
4126       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4127       {
4128         emit_loadreg(i_regmap[hr],hr);
4129       }
4130     }
4131   }
4132 }
4133
4134 // Load all current registers also needed by next instruction
4135 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4136 {
4137   int hr;
4138   for(hr=0;hr<HOST_REGS;hr++) {
4139     if(hr!=EXCLUDE_REG) {
4140       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4141         if(i_regmap[hr]==0) {
4142           emit_zeroreg(hr);
4143         }
4144         else
4145         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4146         {
4147           emit_loadreg(i_regmap[hr],hr);
4148         }
4149       }
4150     }
4151   }
4152 }
4153
4154 // Load all regs, storing cycle count if necessary
4155 void load_regs_entry(int t)
4156 {
4157   int hr;
4158   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4159   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4160   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4161     emit_storereg(CCREG,HOST_CCREG);
4162   }
4163   // Load 32-bit regs
4164   for(hr=0;hr<HOST_REGS;hr++) {
4165     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4166       if(regs[t].regmap_entry[hr]==0) {
4167         emit_zeroreg(hr);
4168       }
4169       else if(regs[t].regmap_entry[hr]!=CCREG)
4170       {
4171         emit_loadreg(regs[t].regmap_entry[hr],hr);
4172       }
4173     }
4174   }
4175   // Load 64-bit regs
4176   for(hr=0;hr<HOST_REGS;hr++) {
4177     if(regs[t].regmap_entry[hr]>=64) {
4178       assert(regs[t].regmap_entry[hr]!=64);
4179       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4180         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4181         if(lr<0) {
4182           emit_loadreg(regs[t].regmap_entry[hr],hr);
4183         }
4184         else
4185         {
4186           emit_sarimm(lr,31,hr);
4187         }
4188       }
4189       else
4190       {
4191         emit_loadreg(regs[t].regmap_entry[hr],hr);
4192       }
4193     }
4194   }
4195 }
4196
4197 // Store dirty registers prior to branch
4198 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4199 {
4200   if(internal_branch(i_is32,addr))
4201   {
4202     int t=(addr-start)>>2;
4203     int hr;
4204     for(hr=0;hr<HOST_REGS;hr++) {
4205       if(hr!=EXCLUDE_REG) {
4206         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4207           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4208             if((i_dirty>>hr)&1) {
4209               if(i_regmap[hr]<64) {
4210                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4211                   emit_storereg(i_regmap[hr],hr);
4212                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4213                     #ifdef DESTRUCTIVE_WRITEBACK
4214                     emit_sarimm(hr,31,hr);
4215                     emit_storereg(i_regmap[hr]|64,hr);
4216                     #else
4217                     emit_sarimm(hr,31,HOST_TEMPREG);
4218                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4219                     #endif
4220                   }
4221                 }
4222               }else{
4223                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4224                   emit_storereg(i_regmap[hr],hr);
4225                 }
4226               }
4227             }
4228           }
4229         }
4230       }
4231     }
4232   }
4233   else
4234   {
4235     // Branch out of this block, write out all dirty regs
4236     wb_dirtys(i_regmap,i_is32,i_dirty);
4237   }
4238 }
4239
4240 // Load all needed registers for branch target
4241 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4242 {
4243   //if(addr>=start && addr<(start+slen*4))
4244   if(internal_branch(i_is32,addr))
4245   {
4246     int t=(addr-start)>>2;
4247     int hr;
4248     // Store the cycle count before loading something else
4249     if(i_regmap[HOST_CCREG]!=CCREG) {
4250       assert(i_regmap[HOST_CCREG]==-1);
4251     }
4252     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4253       emit_storereg(CCREG,HOST_CCREG);
4254     }
4255     // Load 32-bit regs
4256     for(hr=0;hr<HOST_REGS;hr++) {
4257       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4258         #ifdef DESTRUCTIVE_WRITEBACK
4259         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4260         #else
4261         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4262         #endif
4263           if(regs[t].regmap_entry[hr]==0) {
4264             emit_zeroreg(hr);
4265           }
4266           else if(regs[t].regmap_entry[hr]!=CCREG)
4267           {
4268             emit_loadreg(regs[t].regmap_entry[hr],hr);
4269           }
4270         }
4271       }
4272     }
4273     //Load 64-bit regs
4274     for(hr=0;hr<HOST_REGS;hr++) {
4275       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4276         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4277           assert(regs[t].regmap_entry[hr]!=64);
4278           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4279             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4280             if(lr<0) {
4281               emit_loadreg(regs[t].regmap_entry[hr],hr);
4282             }
4283             else
4284             {
4285               emit_sarimm(lr,31,hr);
4286             }
4287           }
4288           else
4289           {
4290             emit_loadreg(regs[t].regmap_entry[hr],hr);
4291           }
4292         }
4293         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4294           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4295           assert(lr>=0);
4296           emit_sarimm(lr,31,hr);
4297         }
4298       }
4299     }
4300   }
4301 }
4302
4303 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4304 {
4305   if(addr>=start && addr<start+slen*4-4)
4306   {
4307     int t=(addr-start)>>2;
4308     int hr;
4309     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4310     for(hr=0;hr<HOST_REGS;hr++)
4311     {
4312       if(hr!=EXCLUDE_REG)
4313       {
4314         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4315         {
4316           if(regs[t].regmap_entry[hr]!=-1)
4317           {
4318             return 0;
4319           }
4320           else 
4321           if((i_dirty>>hr)&1)
4322           {
4323             if(i_regmap[hr]<64)
4324             {
4325               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4326                 return 0;
4327             }
4328             else
4329             {
4330               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4331                 return 0;
4332             }
4333           }
4334         }
4335         else // Same register but is it 32-bit or dirty?
4336         if(i_regmap[hr]>=0)
4337         {
4338           if(!((regs[t].dirty>>hr)&1))
4339           {
4340             if((i_dirty>>hr)&1)
4341             {
4342               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4343               {
4344                 //printf("%x: dirty no match\n",addr);
4345                 return 0;
4346               }
4347             }
4348           }
4349           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4350           {
4351             //printf("%x: is32 no match\n",addr);
4352             return 0;
4353           }
4354         }
4355       }
4356     }
4357     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4358     if(requires_32bit[t]&~i_is32) return 0;
4359     // Delay slots are not valid branch targets
4360     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4361     // Delay slots require additional processing, so do not match
4362     if(is_ds[t]) return 0;
4363   }
4364   else
4365   {
4366     int hr;
4367     for(hr=0;hr<HOST_REGS;hr++)
4368     {
4369       if(hr!=EXCLUDE_REG)
4370       {
4371         if(i_regmap[hr]>=0)
4372         {
4373           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4374           {
4375             if((i_dirty>>hr)&1)
4376             {
4377               return 0;
4378             }
4379           }
4380         }
4381       }
4382     }
4383   }
4384   return 1;
4385 }
4386
4387 // Used when a branch jumps into the delay slot of another branch
4388 void ds_assemble_entry(int i)
4389 {
4390   int t=(ba[i]-start)>>2;
4391   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4392   assem_debug("Assemble delay slot at %x\n",ba[i]);
4393   assem_debug("<->\n");
4394   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4395     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4396   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4397   address_generation(t,&regs[t],regs[t].regmap_entry);
4398   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4399     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4400   cop1_usable=0;
4401   is_delayslot=0;
4402   switch(itype[t]) {
4403     case ALU:
4404       alu_assemble(t,&regs[t]);break;
4405     case IMM16:
4406       imm16_assemble(t,&regs[t]);break;
4407     case SHIFT:
4408       shift_assemble(t,&regs[t]);break;
4409     case SHIFTIMM:
4410       shiftimm_assemble(t,&regs[t]);break;
4411     case LOAD:
4412       load_assemble(t,&regs[t]);break;
4413     case LOADLR:
4414       loadlr_assemble(t,&regs[t]);break;
4415     case STORE:
4416       store_assemble(t,&regs[t]);break;
4417     case STORELR:
4418       storelr_assemble(t,&regs[t]);break;
4419     case COP0:
4420       cop0_assemble(t,&regs[t]);break;
4421     case COP1:
4422       cop1_assemble(t,&regs[t]);break;
4423     case C1LS:
4424       c1ls_assemble(t,&regs[t]);break;
4425     case FCONV:
4426       fconv_assemble(t,&regs[t]);break;
4427     case FLOAT:
4428       float_assemble(t,&regs[t]);break;
4429     case FCOMP:
4430       fcomp_assemble(t,&regs[t]);break;
4431     case MULTDIV:
4432       multdiv_assemble(t,&regs[t]);break;
4433     case MOV:
4434       mov_assemble(t,&regs[t]);break;
4435     case SYSCALL:
4436     case SPAN:
4437     case UJUMP:
4438     case RJUMP:
4439     case CJUMP:
4440     case SJUMP:
4441     case FJUMP:
4442       printf("Jump in the delay slot.  This is probably a bug.\n");
4443   }
4444   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4445   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4446   if(internal_branch(regs[t].is32,ba[i]+4))
4447     assem_debug("branch: internal\n");
4448   else
4449     assem_debug("branch: external\n");
4450   assert(internal_branch(regs[t].is32,ba[i]+4));
4451   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4452   emit_jmp(0);
4453 }
4454
4455 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4456 {
4457   int count;
4458   int jaddr;
4459   int idle=0;
4460   if(itype[i]==RJUMP)
4461   {
4462     *adj=0;
4463   }
4464   //if(ba[i]>=start && ba[i]<(start+slen*4))
4465   if(internal_branch(branch_regs[i].is32,ba[i]))
4466   {
4467     int t=(ba[i]-start)>>2;
4468     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4469     else *adj=ccadj[t];
4470   }
4471   else
4472   {
4473     *adj=0;
4474   }
4475   count=ccadj[i];
4476   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4477     // Idle loop
4478     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4479     idle=(int)out;
4480     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4481     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4482     jaddr=(int)out;
4483     emit_jmp(0);
4484   }
4485   else if(*adj==0||invert) {
4486     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4487     jaddr=(int)out;
4488     emit_jns(0);
4489   }
4490   else
4491   {
4492     emit_cmpimm(HOST_CCREG,-2*(count+2));
4493     jaddr=(int)out;
4494     emit_jns(0);
4495   }
4496   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4497 }
4498
4499 void do_ccstub(int n)
4500 {
4501   literal_pool(256);
4502   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4503   set_jump_target(stubs[n][1],(int)out);
4504   int i=stubs[n][4];
4505   if(stubs[n][6]==NULLDS) {
4506     // Delay slot instruction is nullified ("likely" branch)
4507     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4508   }
4509   else if(stubs[n][6]!=TAKEN) {
4510     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4511   }
4512   else {
4513     if(internal_branch(branch_regs[i].is32,ba[i]))
4514       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4515   }
4516   if(stubs[n][5]!=-1)
4517   {
4518     // Save PC as return address
4519     emit_movimm(stubs[n][5],EAX);
4520     emit_writeword(EAX,(int)&pcaddr);
4521   }
4522   else
4523   {
4524     // Return address depends on which way the branch goes
4525     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4526     {
4527       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4528       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4529       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4530       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4531       if(rs1[i]==0)
4532       {
4533         s1l=s2l;s1h=s2h;
4534         s2l=s2h=-1;
4535       }
4536       else if(rs2[i]==0)
4537       {
4538         s2l=s2h=-1;
4539       }
4540       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4541         s1h=s2h=-1;
4542       }
4543       assert(s1l>=0);
4544       #ifdef DESTRUCTIVE_WRITEBACK
4545       if(rs1[i]) {
4546         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4547           emit_loadreg(rs1[i],s1l);
4548       } 
4549       else {
4550         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4551           emit_loadreg(rs2[i],s1l);
4552       }
4553       if(s2l>=0)
4554         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4555           emit_loadreg(rs2[i],s2l);
4556       #endif
4557       int hr=0;
4558       int addr,alt,ntaddr;
4559       while(hr<HOST_REGS)
4560       {
4561         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4562            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4563            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4564         {
4565           addr=hr++;break;
4566         }
4567         hr++;
4568       }
4569       while(hr<HOST_REGS)
4570       {
4571         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4572            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4573            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4574         {
4575           alt=hr++;break;
4576         }
4577         hr++;
4578       }
4579       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4580       {
4581         while(hr<HOST_REGS)
4582         {
4583           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4584              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4585              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4586           {
4587             ntaddr=hr;break;
4588           }
4589           hr++;
4590         }
4591         assert(hr<HOST_REGS);
4592       }
4593       if((opcode[i]&0x2f)==4) // BEQ
4594       {
4595         #ifdef HAVE_CMOV_IMM
4596         if(s1h<0) {
4597           if(s2l>=0) emit_cmp(s1l,s2l);
4598           else emit_test(s1l,s1l);
4599           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4600         }
4601         else
4602         #endif
4603         {
4604           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4605           if(s1h>=0) {
4606             if(s2h>=0) emit_cmp(s1h,s2h);
4607             else emit_test(s1h,s1h);
4608             emit_cmovne_reg(alt,addr);
4609           }
4610           if(s2l>=0) emit_cmp(s1l,s2l);
4611           else emit_test(s1l,s1l);
4612           emit_cmovne_reg(alt,addr);
4613         }
4614       }
4615       if((opcode[i]&0x2f)==5) // BNE
4616       {
4617         #ifdef HAVE_CMOV_IMM
4618         if(s1h<0) {
4619           if(s2l>=0) emit_cmp(s1l,s2l);
4620           else emit_test(s1l,s1l);
4621           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4622         }
4623         else
4624         #endif
4625         {
4626           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4627           if(s1h>=0) {
4628             if(s2h>=0) emit_cmp(s1h,s2h);
4629             else emit_test(s1h,s1h);
4630             emit_cmovne_reg(alt,addr);
4631           }
4632           if(s2l>=0) emit_cmp(s1l,s2l);
4633           else emit_test(s1l,s1l);
4634           emit_cmovne_reg(alt,addr);
4635         }
4636       }
4637       if((opcode[i]&0x2f)==6) // BLEZ
4638       {
4639         //emit_movimm(ba[i],alt);
4640         //emit_movimm(start+i*4+8,addr);
4641         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4642         emit_cmpimm(s1l,1);
4643         if(s1h>=0) emit_mov(addr,ntaddr);
4644         emit_cmovl_reg(alt,addr);
4645         if(s1h>=0) {
4646           emit_test(s1h,s1h);
4647           emit_cmovne_reg(ntaddr,addr);
4648           emit_cmovs_reg(alt,addr);
4649         }
4650       }
4651       if((opcode[i]&0x2f)==7) // BGTZ
4652       {
4653         //emit_movimm(ba[i],addr);
4654         //emit_movimm(start+i*4+8,ntaddr);
4655         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4656         emit_cmpimm(s1l,1);
4657         if(s1h>=0) emit_mov(addr,alt);
4658         emit_cmovl_reg(ntaddr,addr);
4659         if(s1h>=0) {
4660           emit_test(s1h,s1h);
4661           emit_cmovne_reg(alt,addr);
4662           emit_cmovs_reg(ntaddr,addr);
4663         }
4664       }
4665       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4666       {
4667         //emit_movimm(ba[i],alt);
4668         //emit_movimm(start+i*4+8,addr);
4669         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4670         if(s1h>=0) emit_test(s1h,s1h);
4671         else emit_test(s1l,s1l);
4672         emit_cmovs_reg(alt,addr);
4673       }
4674       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4675       {
4676         //emit_movimm(ba[i],addr);
4677         //emit_movimm(start+i*4+8,alt);
4678         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4679         if(s1h>=0) emit_test(s1h,s1h);
4680         else emit_test(s1l,s1l);
4681         emit_cmovs_reg(alt,addr);
4682       }
4683       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4684         if(source[i]&0x10000) // BC1T
4685         {
4686           //emit_movimm(ba[i],alt);
4687           //emit_movimm(start+i*4+8,addr);
4688           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4689           emit_testimm(s1l,0x800000);
4690           emit_cmovne_reg(alt,addr);
4691         }
4692         else // BC1F
4693         {
4694           //emit_movimm(ba[i],addr);
4695           //emit_movimm(start+i*4+8,alt);
4696           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4697           emit_testimm(s1l,0x800000);
4698           emit_cmovne_reg(alt,addr);
4699         }
4700       }
4701       emit_writeword(addr,(int)&pcaddr);
4702     }
4703     else
4704     if(itype[i]==RJUMP)
4705     {
4706       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4707       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4708         r=get_reg(branch_regs[i].regmap,RTEMP);
4709       }
4710       emit_writeword(r,(int)&pcaddr);
4711     }
4712     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4713   }
4714   // Update cycle count
4715   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4716   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4717   emit_call((int)cc_interrupt);
4718   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4719   if(stubs[n][6]==TAKEN) {
4720     if(internal_branch(branch_regs[i].is32,ba[i]))
4721       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4722     else if(itype[i]==RJUMP) {
4723       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4724         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4725       else
4726         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4727     }
4728   }else if(stubs[n][6]==NOTTAKEN) {
4729     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4730     else load_all_regs(branch_regs[i].regmap);
4731   }else if(stubs[n][6]==NULLDS) {
4732     // Delay slot instruction is nullified ("likely" branch)
4733     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4734     else load_all_regs(regs[i].regmap);
4735   }else{
4736     load_all_regs(branch_regs[i].regmap);
4737   }
4738   emit_jmp(stubs[n][2]); // return address
4739   
4740   /* This works but uses a lot of memory...
4741   emit_readword((int)&last_count,ECX);
4742   emit_add(HOST_CCREG,ECX,EAX);
4743   emit_writeword(EAX,(int)&Count);
4744   emit_call((int)gen_interupt);
4745   emit_readword((int)&Count,HOST_CCREG);
4746   emit_readword((int)&next_interupt,EAX);
4747   emit_readword((int)&pending_exception,EBX);
4748   emit_writeword(EAX,(int)&last_count);
4749   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4750   emit_test(EBX,EBX);
4751   int jne_instr=(int)out;
4752   emit_jne(0);
4753   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4754   load_all_regs(branch_regs[i].regmap);
4755   emit_jmp(stubs[n][2]); // return address
4756   set_jump_target(jne_instr,(int)out);
4757   emit_readword((int)&pcaddr,EAX);
4758   // Call get_addr_ht instead of doing the hash table here.
4759   // This code is executed infrequently and takes up a lot of space
4760   // so smaller is better.
4761   emit_storereg(CCREG,HOST_CCREG);
4762   emit_pushreg(EAX);
4763   emit_call((int)get_addr_ht);
4764   emit_loadreg(CCREG,HOST_CCREG);
4765   emit_addimm(ESP,4,ESP);
4766   emit_jmpreg(EAX);*/
4767 }
4768
4769 add_to_linker(int addr,int target,int ext)
4770 {
4771   link_addr[linkcount][0]=addr;
4772   link_addr[linkcount][1]=target;
4773   link_addr[linkcount][2]=ext;  
4774   linkcount++;
4775 }
4776
4777 void ujump_assemble(int i,struct regstat *i_regs)
4778 {
4779   signed char *i_regmap=i_regs->regmap;
4780   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4781   address_generation(i+1,i_regs,regs[i].regmap_entry);
4782   #ifdef REG_PREFETCH
4783   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4784   if(rt1[i]==31&&temp>=0) 
4785   {
4786     int return_address=start+i*4+8;
4787     if(get_reg(branch_regs[i].regmap,31)>0) 
4788     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4789   }
4790   #endif
4791   ds_assemble(i+1,i_regs);
4792   uint64_t bc_unneeded=branch_regs[i].u;
4793   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4794   bc_unneeded|=1|(1LL<<rt1[i]);
4795   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4796   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4797                 bc_unneeded,bc_unneeded_upper);
4798   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4799   if(rt1[i]==31) {
4800     int rt;
4801     unsigned int return_address;
4802     assert(rt1[i+1]!=31);
4803     assert(rt2[i+1]!=31);
4804     rt=get_reg(branch_regs[i].regmap,31);
4805     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4806     //assert(rt>=0);
4807     return_address=start+i*4+8;
4808     if(rt>=0) {
4809       #ifdef USE_MINI_HT
4810       if(internal_branch(branch_regs[i].is32,return_address)) {
4811         int temp=rt+1;
4812         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4813            branch_regs[i].regmap[temp]>=0)
4814         {
4815           temp=get_reg(branch_regs[i].regmap,-1);
4816         }
4817         #ifdef HOST_TEMPREG
4818         if(temp<0) temp=HOST_TEMPREG;
4819         #endif
4820         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4821         else emit_movimm(return_address,rt);
4822       }
4823       else
4824       #endif
4825       {
4826         #ifdef REG_PREFETCH
4827         if(temp>=0) 
4828         {
4829           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4830         }
4831         #endif
4832         emit_movimm(return_address,rt); // PC into link register
4833         #ifdef IMM_PREFETCH
4834         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4835         #endif
4836       }
4837     }
4838   }
4839   int cc,adj;
4840   cc=get_reg(branch_regs[i].regmap,CCREG);
4841   assert(cc==HOST_CCREG);
4842   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4843   #ifdef REG_PREFETCH
4844   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4845   #endif
4846   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4847   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4848   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4849   if(internal_branch(branch_regs[i].is32,ba[i]))
4850     assem_debug("branch: internal\n");
4851   else
4852     assem_debug("branch: external\n");
4853   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4854     ds_assemble_entry(i);
4855   }
4856   else {
4857     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4858     emit_jmp(0);
4859   }
4860 }
4861
4862 void rjump_assemble(int i,struct regstat *i_regs)
4863 {
4864   signed char *i_regmap=i_regs->regmap;
4865   int temp;
4866   int rs,cc,adj;
4867   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4868   assert(rs>=0);
4869   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4870     // Delay slot abuse, make a copy of the branch address register
4871     temp=get_reg(branch_regs[i].regmap,RTEMP);
4872     assert(temp>=0);
4873     assert(regs[i].regmap[temp]==RTEMP);
4874     emit_mov(rs,temp);
4875     rs=temp;
4876   }
4877   address_generation(i+1,i_regs,regs[i].regmap_entry);
4878   #ifdef REG_PREFETCH
4879   if(rt1[i]==31) 
4880   {
4881     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4882       int return_address=start+i*4+8;
4883       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4884     }
4885   }
4886   #endif
4887   #ifdef USE_MINI_HT
4888   if(rs1[i]==31) {
4889     int rh=get_reg(regs[i].regmap,RHASH);
4890     if(rh>=0) do_preload_rhash(rh);
4891   }
4892   #endif
4893   ds_assemble(i+1,i_regs);
4894   uint64_t bc_unneeded=branch_regs[i].u;
4895   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4896   bc_unneeded|=1|(1LL<<rt1[i]);
4897   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4898   bc_unneeded&=~(1LL<<rs1[i]);
4899   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4900                 bc_unneeded,bc_unneeded_upper);
4901   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4902   if(rt1[i]==31) {
4903     int rt,return_address;
4904     assert(rt1[i+1]!=31);
4905     assert(rt2[i+1]!=31);
4906     rt=get_reg(branch_regs[i].regmap,31);
4907     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4908     assert(rt>=0);
4909     return_address=start+i*4+8;
4910     #ifdef REG_PREFETCH
4911     if(temp>=0) 
4912     {
4913       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4914     }
4915     #endif
4916     emit_movimm(return_address,rt); // PC into link register
4917     #ifdef IMM_PREFETCH
4918     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4919     #endif
4920   }
4921   cc=get_reg(branch_regs[i].regmap,CCREG);
4922   assert(cc==HOST_CCREG);
4923   #ifdef USE_MINI_HT
4924   int rh=get_reg(branch_regs[i].regmap,RHASH);
4925   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4926   if(rs1[i]==31) {
4927     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4928     do_preload_rhtbl(ht);
4929     do_rhash(rs,rh);
4930   }
4931   #endif
4932   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4933   #ifdef DESTRUCTIVE_WRITEBACK
4934   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4935     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4936       emit_loadreg(rs1[i],rs);
4937     }
4938   }
4939   #endif
4940   #ifdef REG_PREFETCH
4941   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4942   #endif
4943   #ifdef USE_MINI_HT
4944   if(rs1[i]==31) {
4945     do_miniht_load(ht,rh);
4946   }
4947   #endif
4948   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4949   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4950   //assert(adj==0);
4951   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
4952   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4953   emit_jns(0);
4954   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4955   #ifdef USE_MINI_HT
4956   if(rs1[i]==31) {
4957     do_miniht_jump(rs,rh,ht);
4958   }
4959   else
4960   #endif
4961   {
4962     //if(rs!=EAX) emit_mov(rs,EAX);
4963     //emit_jmp((int)jump_vaddr_eax);
4964     emit_jmp(jump_vaddr_reg[rs]);
4965   }
4966   /* Check hash table
4967   temp=!rs;
4968   emit_mov(rs,temp);
4969   emit_shrimm(rs,16,rs);
4970   emit_xor(temp,rs,rs);
4971   emit_movzwl_reg(rs,rs);
4972   emit_shlimm(rs,4,rs);
4973   emit_cmpmem_indexed((int)hash_table,rs,temp);
4974   emit_jne((int)out+14);
4975   emit_readword_indexed((int)hash_table+4,rs,rs);
4976   emit_jmpreg(rs);
4977   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4978   emit_addimm_no_flags(8,rs);
4979   emit_jeq((int)out-17);
4980   // No hit on hash table, call compiler
4981   emit_pushreg(temp);
4982 //DEBUG >
4983 #ifdef DEBUG_CYCLE_COUNT
4984   emit_readword((int)&last_count,ECX);
4985   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4986   emit_readword((int)&next_interupt,ECX);
4987   emit_writeword(HOST_CCREG,(int)&Count);
4988   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4989   emit_writeword(ECX,(int)&last_count);
4990 #endif
4991 //DEBUG <
4992   emit_storereg(CCREG,HOST_CCREG);
4993   emit_call((int)get_addr);
4994   emit_loadreg(CCREG,HOST_CCREG);
4995   emit_addimm(ESP,4,ESP);
4996   emit_jmpreg(EAX);*/
4997   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4998   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4999   #endif
5000 }
5001
5002 void cjump_assemble(int i,struct regstat *i_regs)
5003 {
5004   signed char *i_regmap=i_regs->regmap;
5005   int cc;
5006   int match;
5007   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5008   assem_debug("match=%d\n",match);
5009   int s1h,s1l,s2h,s2l;
5010   int prev_cop1_usable=cop1_usable;
5011   int unconditional=0,nop=0;
5012   int only32=0;
5013   int ooo=1;
5014   int invert=0;
5015   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5016   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5017   if(likely[i]) ooo=0;
5018   if(!match) invert=1;
5019   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5020   if(i>(ba[i]-start)>>2) invert=1;
5021   #endif
5022     
5023   if(ooo)
5024     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5025        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5026   {
5027     // Write-after-read dependency prevents out of order execution
5028     // First test branch condition, then execute delay slot, then branch
5029     ooo=0;
5030   }
5031
5032   if(ooo) {
5033     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5034     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5035     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5036     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5037   }
5038   else {
5039     s1l=get_reg(i_regmap,rs1[i]);
5040     s1h=get_reg(i_regmap,rs1[i]|64);
5041     s2l=get_reg(i_regmap,rs2[i]);
5042     s2h=get_reg(i_regmap,rs2[i]|64);
5043   }
5044   if(rs1[i]==0&&rs2[i]==0)
5045   {
5046     if(opcode[i]&1) nop=1;
5047     else unconditional=1;
5048     //assert(opcode[i]!=5);
5049     //assert(opcode[i]!=7);
5050     //assert(opcode[i]!=0x15);
5051     //assert(opcode[i]!=0x17);
5052   }
5053   else if(rs1[i]==0)
5054   {
5055     s1l=s2l;s1h=s2h;
5056     s2l=s2h=-1;
5057     only32=(regs[i].was32>>rs2[i])&1;
5058   }
5059   else if(rs2[i]==0)
5060   {
5061     s2l=s2h=-1;
5062     only32=(regs[i].was32>>rs1[i])&1;
5063   }
5064   else {
5065     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5066   }
5067
5068   if(ooo) {
5069     // Out of order execution (delay slot first)
5070     //printf("OOOE\n");
5071     address_generation(i+1,i_regs,regs[i].regmap_entry);
5072     ds_assemble(i+1,i_regs);
5073     int adj;
5074     uint64_t bc_unneeded=branch_regs[i].u;
5075     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5076     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5077     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5078     bc_unneeded|=1;
5079     bc_unneeded_upper|=1;
5080     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5081                   bc_unneeded,bc_unneeded_upper);
5082     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5083     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5084     cc=get_reg(branch_regs[i].regmap,CCREG);
5085     assert(cc==HOST_CCREG);
5086     if(unconditional) 
5087       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5088     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5089     //assem_debug("cycle count (adj)\n");
5090     if(unconditional) {
5091       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5092       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5093         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5094         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5095         if(internal)
5096           assem_debug("branch: internal\n");
5097         else
5098           assem_debug("branch: external\n");
5099         if(internal&&is_ds[(ba[i]-start)>>2]) {
5100           ds_assemble_entry(i);
5101         }
5102         else {
5103           add_to_linker((int)out,ba[i],internal);
5104           emit_jmp(0);
5105         }
5106         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5107         if(((u_int)out)&7) emit_addnop(0);
5108         #endif
5109       }
5110     }
5111     else if(nop) {
5112       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5113       int jaddr=(int)out;
5114       emit_jns(0);
5115       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5116     }
5117     else {
5118       int taken=0,nottaken=0,nottaken1=0;
5119       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5120       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5121       if(!only32)
5122       {
5123         assert(s1h>=0);
5124         if(opcode[i]==4) // BEQ
5125         {
5126           if(s2h>=0) emit_cmp(s1h,s2h);
5127           else emit_test(s1h,s1h);
5128           nottaken1=(int)out;
5129           emit_jne(1);
5130         }
5131         if(opcode[i]==5) // BNE
5132         {
5133           if(s2h>=0) emit_cmp(s1h,s2h);
5134           else emit_test(s1h,s1h);
5135           if(invert) taken=(int)out;
5136           else add_to_linker((int)out,ba[i],internal);
5137           emit_jne(0);
5138         }
5139         if(opcode[i]==6) // BLEZ
5140         {
5141           emit_test(s1h,s1h);
5142           if(invert) taken=(int)out;
5143           else add_to_linker((int)out,ba[i],internal);
5144           emit_js(0);
5145           nottaken1=(int)out;
5146           emit_jne(1);
5147         }
5148         if(opcode[i]==7) // BGTZ
5149         {
5150           emit_test(s1h,s1h);
5151           nottaken1=(int)out;
5152           emit_js(1);
5153           if(invert) taken=(int)out;
5154           else add_to_linker((int)out,ba[i],internal);
5155           emit_jne(0);
5156         }
5157       } // if(!only32)
5158           
5159       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5160       assert(s1l>=0);
5161       if(opcode[i]==4) // BEQ
5162       {
5163         if(s2l>=0) emit_cmp(s1l,s2l);
5164         else emit_test(s1l,s1l);
5165         if(invert){
5166           nottaken=(int)out;
5167           emit_jne(1);
5168         }else{
5169           add_to_linker((int)out,ba[i],internal);
5170           emit_jeq(0);
5171         }
5172       }
5173       if(opcode[i]==5) // BNE
5174       {
5175         if(s2l>=0) emit_cmp(s1l,s2l);
5176         else emit_test(s1l,s1l);
5177         if(invert){
5178           nottaken=(int)out;
5179           emit_jeq(1);
5180         }else{
5181           add_to_linker((int)out,ba[i],internal);
5182           emit_jne(0);
5183         }
5184       }
5185       if(opcode[i]==6) // BLEZ
5186       {
5187         emit_cmpimm(s1l,1);
5188         if(invert){
5189           nottaken=(int)out;
5190           emit_jge(1);
5191         }else{
5192           add_to_linker((int)out,ba[i],internal);
5193           emit_jl(0);
5194         }
5195       }
5196       if(opcode[i]==7) // BGTZ
5197       {
5198         emit_cmpimm(s1l,1);
5199         if(invert){
5200           nottaken=(int)out;
5201           emit_jl(1);
5202         }else{
5203           add_to_linker((int)out,ba[i],internal);
5204           emit_jge(0);
5205         }
5206       }
5207       if(invert) {
5208         if(taken) set_jump_target(taken,(int)out);
5209         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5210         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5211           if(adj) {
5212             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5213             add_to_linker((int)out,ba[i],internal);
5214           }else{
5215             emit_addnop(13);
5216             add_to_linker((int)out,ba[i],internal*2);
5217           }
5218           emit_jmp(0);
5219         }else
5220         #endif
5221         {
5222           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5223           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5224           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5225           if(internal)
5226             assem_debug("branch: internal\n");
5227           else
5228             assem_debug("branch: external\n");
5229           if(internal&&is_ds[(ba[i]-start)>>2]) {
5230             ds_assemble_entry(i);
5231           }
5232           else {
5233             add_to_linker((int)out,ba[i],internal);
5234             emit_jmp(0);
5235           }
5236         }
5237         set_jump_target(nottaken,(int)out);
5238       }
5239
5240       if(nottaken1) set_jump_target(nottaken1,(int)out);
5241       if(adj) {
5242         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5243       }
5244     } // (!unconditional)
5245   } // if(ooo)
5246   else
5247   {
5248     // In-order execution (branch first)
5249     //if(likely[i]) printf("IOL\n");
5250     //else
5251     //printf("IOE\n");
5252     int taken=0,nottaken=0,nottaken1=0;
5253     if(!unconditional&&!nop) {
5254       if(!only32)
5255       {
5256         assert(s1h>=0);
5257         if((opcode[i]&0x2f)==4) // BEQ
5258         {
5259           if(s2h>=0) emit_cmp(s1h,s2h);
5260           else emit_test(s1h,s1h);
5261           nottaken1=(int)out;
5262           emit_jne(2);
5263         }
5264         if((opcode[i]&0x2f)==5) // BNE
5265         {
5266           if(s2h>=0) emit_cmp(s1h,s2h);
5267           else emit_test(s1h,s1h);
5268           taken=(int)out;
5269           emit_jne(1);
5270         }
5271         if((opcode[i]&0x2f)==6) // BLEZ
5272         {
5273           emit_test(s1h,s1h);
5274           taken=(int)out;
5275           emit_js(1);
5276           nottaken1=(int)out;
5277           emit_jne(2);
5278         }
5279         if((opcode[i]&0x2f)==7) // BGTZ
5280         {
5281           emit_test(s1h,s1h);
5282           nottaken1=(int)out;
5283           emit_js(2);
5284           taken=(int)out;
5285           emit_jne(1);
5286         }
5287       } // if(!only32)
5288           
5289       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5290       assert(s1l>=0);
5291       if((opcode[i]&0x2f)==4) // BEQ
5292       {
5293         if(s2l>=0) emit_cmp(s1l,s2l);
5294         else emit_test(s1l,s1l);
5295         nottaken=(int)out;
5296         emit_jne(2);
5297       }
5298       if((opcode[i]&0x2f)==5) // BNE
5299       {
5300         if(s2l>=0) emit_cmp(s1l,s2l);
5301         else emit_test(s1l,s1l);
5302         nottaken=(int)out;
5303         emit_jeq(2);
5304       }
5305       if((opcode[i]&0x2f)==6) // BLEZ
5306       {
5307         emit_cmpimm(s1l,1);
5308         nottaken=(int)out;
5309         emit_jge(2);
5310       }
5311       if((opcode[i]&0x2f)==7) // BGTZ
5312       {
5313         emit_cmpimm(s1l,1);
5314         nottaken=(int)out;
5315         emit_jl(2);
5316       }
5317     } // if(!unconditional)
5318     int adj;
5319     uint64_t ds_unneeded=branch_regs[i].u;
5320     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5321     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5322     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5323     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5324     ds_unneeded|=1;
5325     ds_unneeded_upper|=1;
5326     // branch taken
5327     if(!nop) {
5328       if(taken) set_jump_target(taken,(int)out);
5329       assem_debug("1:\n");
5330       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5331                     ds_unneeded,ds_unneeded_upper);
5332       // load regs
5333       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5334       address_generation(i+1,&branch_regs[i],0);
5335       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5336       ds_assemble(i+1,&branch_regs[i]);
5337       cc=get_reg(branch_regs[i].regmap,CCREG);
5338       if(cc==-1) {
5339         emit_loadreg(CCREG,cc=HOST_CCREG);
5340         // CHECK: Is the following instruction (fall thru) allocated ok?
5341       }
5342       assert(cc==HOST_CCREG);
5343       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5344       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5345       assem_debug("cycle count (adj)\n");
5346       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5347       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5348       if(internal)
5349         assem_debug("branch: internal\n");
5350       else
5351         assem_debug("branch: external\n");
5352       if(internal&&is_ds[(ba[i]-start)>>2]) {
5353         ds_assemble_entry(i);
5354       }
5355       else {
5356         add_to_linker((int)out,ba[i],internal);
5357         emit_jmp(0);
5358       }
5359     }
5360     // branch not taken
5361     cop1_usable=prev_cop1_usable;
5362     if(!unconditional) {
5363       if(nottaken1) set_jump_target(nottaken1,(int)out);
5364       set_jump_target(nottaken,(int)out);
5365       assem_debug("2:\n");
5366       if(!likely[i]) {
5367         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5368                       ds_unneeded,ds_unneeded_upper);
5369         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5370         address_generation(i+1,&branch_regs[i],0);
5371         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5372         ds_assemble(i+1,&branch_regs[i]);
5373       }
5374       cc=get_reg(branch_regs[i].regmap,CCREG);
5375       if(cc==-1&&!likely[i]) {
5376         // Cycle count isn't in a register, temporarily load it then write it out
5377         emit_loadreg(CCREG,HOST_CCREG);
5378         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5379         int jaddr=(int)out;
5380         emit_jns(0);
5381         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5382         emit_storereg(CCREG,HOST_CCREG);
5383       }
5384       else{
5385         cc=get_reg(i_regmap,CCREG);
5386         assert(cc==HOST_CCREG);
5387         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5388         int jaddr=(int)out;
5389         emit_jns(0);
5390         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5391       }
5392     }
5393   }
5394 }
5395
5396 void sjump_assemble(int i,struct regstat *i_regs)
5397 {
5398   signed char *i_regmap=i_regs->regmap;
5399   int cc;
5400   int match;
5401   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5402   assem_debug("smatch=%d\n",match);
5403   int s1h,s1l;
5404   int prev_cop1_usable=cop1_usable;
5405   int unconditional=0,nevertaken=0;
5406   int only32=0;
5407   int ooo=1;
5408   int invert=0;
5409   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5410   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5411   if(likely[i]) ooo=0;
5412   if(!match) invert=1;
5413   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5414   if(i>(ba[i]-start)>>2) invert=1;
5415   #endif
5416
5417   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5418   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5419
5420   if(ooo)
5421     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5422   {
5423     // Write-after-read dependency prevents out of order execution
5424     // First test branch condition, then execute delay slot, then branch
5425     ooo=0;
5426   }
5427   // TODO: Conditional branches w/link must execute in-order so that
5428   // condition test and write to r31 occur before cycle count test
5429
5430   if(ooo) {
5431     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5432     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5433   }
5434   else {
5435     s1l=get_reg(i_regmap,rs1[i]);
5436     s1h=get_reg(i_regmap,rs1[i]|64);
5437   }
5438   if(rs1[i]==0)
5439   {
5440     if(opcode2[i]&1) unconditional=1;
5441     else nevertaken=1;
5442     // These are never taken (r0 is never less than zero)
5443     //assert(opcode2[i]!=0);
5444     //assert(opcode2[i]!=2);
5445     //assert(opcode2[i]!=0x10);
5446     //assert(opcode2[i]!=0x12);
5447   }
5448   else {
5449     only32=(regs[i].was32>>rs1[i])&1;
5450   }
5451
5452   if(ooo) {
5453     // Out of order execution (delay slot first)
5454     //printf("OOOE\n");
5455     address_generation(i+1,i_regs,regs[i].regmap_entry);
5456     ds_assemble(i+1,i_regs);
5457     int adj;
5458     uint64_t bc_unneeded=branch_regs[i].u;
5459     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5460     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5461     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5462     bc_unneeded|=1;
5463     bc_unneeded_upper|=1;
5464     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5465                   bc_unneeded,bc_unneeded_upper);
5466     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5467     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5468     if(rt1[i]==31) {
5469       int rt,return_address;
5470       assert(rt1[i+1]!=31);
5471       assert(rt2[i+1]!=31);
5472       rt=get_reg(branch_regs[i].regmap,31);
5473       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5474       if(rt>=0) {
5475         // Save the PC even if the branch is not taken
5476         return_address=start+i*4+8;
5477         emit_movimm(return_address,rt); // PC into link register
5478         #ifdef IMM_PREFETCH
5479         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5480         #endif
5481       }
5482     }
5483     cc=get_reg(branch_regs[i].regmap,CCREG);
5484     assert(cc==HOST_CCREG);
5485     if(unconditional) 
5486       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5487     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5488     assem_debug("cycle count (adj)\n");
5489     if(unconditional) {
5490       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5491       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5492         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5493         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5494         if(internal)
5495           assem_debug("branch: internal\n");
5496         else
5497           assem_debug("branch: external\n");
5498         if(internal&&is_ds[(ba[i]-start)>>2]) {
5499           ds_assemble_entry(i);
5500         }
5501         else {
5502           add_to_linker((int)out,ba[i],internal);
5503           emit_jmp(0);
5504         }
5505         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5506         if(((u_int)out)&7) emit_addnop(0);
5507         #endif
5508       }
5509     }
5510     else if(nevertaken) {
5511       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5512       int jaddr=(int)out;
5513       emit_jns(0);
5514       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5515     }
5516     else {
5517       int nottaken=0;
5518       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5519       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5520       if(!only32)
5521       {
5522         assert(s1h>=0);
5523         if(opcode2[i]==0) // BLTZ
5524         {
5525           emit_test(s1h,s1h);
5526           if(invert){
5527             nottaken=(int)out;
5528             emit_jns(1);
5529           }else{
5530             add_to_linker((int)out,ba[i],internal);
5531             emit_js(0);
5532           }
5533         }
5534         if(opcode2[i]==1) // BGEZ
5535         {
5536           emit_test(s1h,s1h);
5537           if(invert){
5538             nottaken=(int)out;
5539             emit_js(1);
5540           }else{
5541             add_to_linker((int)out,ba[i],internal);
5542             emit_jns(0);
5543           }
5544         }
5545       } // if(!only32)
5546       else
5547       {
5548         assert(s1l>=0);
5549         if(opcode2[i]==0) // BLTZ
5550         {
5551           emit_test(s1l,s1l);
5552           if(invert){
5553             nottaken=(int)out;
5554             emit_jns(1);
5555           }else{
5556             add_to_linker((int)out,ba[i],internal);
5557             emit_js(0);
5558           }
5559         }
5560         if(opcode2[i]==1) // BGEZ
5561         {
5562           emit_test(s1l,s1l);
5563           if(invert){
5564             nottaken=(int)out;
5565             emit_js(1);
5566           }else{
5567             add_to_linker((int)out,ba[i],internal);
5568             emit_jns(0);
5569           }
5570         }
5571       } // if(!only32)
5572           
5573       if(invert) {
5574         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5575         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5576           if(adj) {
5577             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5578             add_to_linker((int)out,ba[i],internal);
5579           }else{
5580             emit_addnop(13);
5581             add_to_linker((int)out,ba[i],internal*2);
5582           }
5583           emit_jmp(0);
5584         }else
5585         #endif
5586         {
5587           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5588           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5589           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5590           if(internal)
5591             assem_debug("branch: internal\n");
5592           else
5593             assem_debug("branch: external\n");
5594           if(internal&&is_ds[(ba[i]-start)>>2]) {
5595             ds_assemble_entry(i);
5596           }
5597           else {
5598             add_to_linker((int)out,ba[i],internal);
5599             emit_jmp(0);
5600           }
5601         }
5602         set_jump_target(nottaken,(int)out);
5603       }
5604
5605       if(adj) {
5606         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5607       }
5608     } // (!unconditional)
5609   } // if(ooo)
5610   else
5611   {
5612     // In-order execution (branch first)
5613     //printf("IOE\n");
5614     int nottaken=0;
5615     if(!unconditional) {
5616       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5617       if(!only32)
5618       {
5619         assert(s1h>=0);
5620         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5621         {
5622           emit_test(s1h,s1h);
5623           nottaken=(int)out;
5624           emit_jns(1);
5625         }
5626         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5627         {
5628           emit_test(s1h,s1h);
5629           nottaken=(int)out;
5630           emit_js(1);
5631         }
5632       } // if(!only32)
5633       else
5634       {
5635         assert(s1l>=0);
5636         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5637         {
5638           emit_test(s1l,s1l);
5639           nottaken=(int)out;
5640           emit_jns(1);
5641         }
5642         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5643         {
5644           emit_test(s1l,s1l);
5645           nottaken=(int)out;
5646           emit_js(1);
5647         }
5648       }
5649     } // if(!unconditional)
5650     int adj;
5651     uint64_t ds_unneeded=branch_regs[i].u;
5652     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5653     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5654     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5655     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5656     ds_unneeded|=1;
5657     ds_unneeded_upper|=1;
5658     // branch taken
5659     if(!nevertaken) {
5660       //assem_debug("1:\n");
5661       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5662                     ds_unneeded,ds_unneeded_upper);
5663       // load regs
5664       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5665       address_generation(i+1,&branch_regs[i],0);
5666       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5667       ds_assemble(i+1,&branch_regs[i]);
5668       cc=get_reg(branch_regs[i].regmap,CCREG);
5669       if(cc==-1) {
5670         emit_loadreg(CCREG,cc=HOST_CCREG);
5671         // CHECK: Is the following instruction (fall thru) allocated ok?
5672       }
5673       assert(cc==HOST_CCREG);
5674       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5675       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5676       assem_debug("cycle count (adj)\n");
5677       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5678       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5679       if(internal)
5680         assem_debug("branch: internal\n");
5681       else
5682         assem_debug("branch: external\n");
5683       if(internal&&is_ds[(ba[i]-start)>>2]) {
5684         ds_assemble_entry(i);
5685       }
5686       else {
5687         add_to_linker((int)out,ba[i],internal);
5688         emit_jmp(0);
5689       }
5690     }
5691     // branch not taken
5692     cop1_usable=prev_cop1_usable;
5693     if(!unconditional) {
5694       set_jump_target(nottaken,(int)out);
5695       assem_debug("1:\n");
5696       if(!likely[i]) {
5697         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5698                       ds_unneeded,ds_unneeded_upper);
5699         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5700         address_generation(i+1,&branch_regs[i],0);
5701         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5702         ds_assemble(i+1,&branch_regs[i]);
5703       }
5704       cc=get_reg(branch_regs[i].regmap,CCREG);
5705       if(cc==-1&&!likely[i]) {
5706         // Cycle count isn't in a register, temporarily load it then write it out
5707         emit_loadreg(CCREG,HOST_CCREG);
5708         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5709         int jaddr=(int)out;
5710         emit_jns(0);
5711         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5712         emit_storereg(CCREG,HOST_CCREG);
5713       }
5714       else{
5715         cc=get_reg(i_regmap,CCREG);
5716         assert(cc==HOST_CCREG);
5717         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5718         int jaddr=(int)out;
5719         emit_jns(0);
5720         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5721       }
5722     }
5723   }
5724 }
5725
// fjump_assemble: emit host code for the floating-point condition branch at
// instruction index i (BC1T/BC1F per the inline comments) together with its
// delay slot at index i+1.
//
//   i      - index of the branch inside the block currently being compiled
//   i_regs - register state for the branch; its regmap is used to look up the
//            host registers holding FSREG, CSREG and CCREG
//
// Nothing is returned; code is produced through the emit_* helpers and the
// branch is recorded with add_to_linker()/add_stub().  The jumps are emitted
// with a 0 target, so they are presumably patched later by the linker pass.
//
// Two code shapes are produced:
//   * "ooo"      - the delay slot is assembled first, then the condition test
//   * in-order   - the condition is tested first and the delay slot is
//                  assembled separately on the taken and not-taken paths
5726 void fjump_assemble(int i,struct regstat *i_regs)
5727 {
5728   signed char *i_regmap=i_regs->regmap;
5729   int cc;
5730   int match;
       // match: result of match_bt() for the register state at the branch
       // target; a zero result forces the inverted form below.
5731   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5732   assem_debug("fmatch=%d\n",match);
5733   int fs,cs;
5734   int eaddr;
5735   int ooo=1;
       // invert: when set, the opposite condition is emitted to skip over an
       // explicit taken-path sequence instead of jumping straight to the target.
5736   int invert=0;
       // internal is forwarded to add_to_linker() and selects ds_assemble_entry()
       // below (presumably nonzero when ba[i] lies inside this block).
5737   int internal=internal_branch(branch_regs[i].is32,ba[i]);
       // The branch targets its own instruction index.
5738   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // "Likely" branches always use the in-order form.
5739   if(likely[i]) ooo=0;
5740   if(!match) invert=1;
5741   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // With this hack enabled, a branch to a lower instruction index is
       // always emitted in inverted form.
5742   if(i>(ba[i]-start)>>2) invert=1;
5743   #endif
5744
5745   if(ooo)
5746     if(itype[i+1]==FCOMP)
5747   {
5748     // Write-after-read dependency prevents out of order execution
5749     // First test branch condition, then execute delay slot, then branch
5750     ooo=0;
5751   }
5752
       // Pick the host register holding FSREG from the register map that is in
       // effect at the point where the condition will be tested.
5753   if(ooo) {
5754     fs=get_reg(branch_regs[i].regmap,FSREG);
5755     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5756   }
5757   else {
5758     fs=get_reg(i_regmap,FSREG);
5759   }
5760
5761   // Check cop1 unusable
       // Tests bit 0x20000000 of the CSREG host register and jumps to an
       // FP_STUB when it is clear (presumably the CP0 Status CU1 bit - confirm).
       // cop1_usable is then set so the check is emitted only once.
5762   if(!cop1_usable) {
5763     cs=get_reg(i_regmap,CSREG);
5764     assert(cs>=0);
5765     emit_testimm(cs,0x20000000);
5766     eaddr=(int)out;
5767     emit_jeq(0);
5768     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5769     cop1_usable=1;
5770   }
5771
5772   if(ooo) {
5773     // Out of order execution (delay slot first)
5774     //printf("OOOE\n");
5775     ds_assemble(i+1,i_regs);
5776     int adj;
         // Start from branch_regs[i].u/.uu, clear the bits of the branch's own
         // source registers and force bit 0; the masks go to wb_invalidate().
5777     uint64_t bc_unneeded=branch_regs[i].u;
5778     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5779     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5780     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5781     bc_unneeded|=1;
5782     bc_unneeded_upper|=1;
5783     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5784                   bc_unneeded,bc_unneeded_upper);
5785     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5786     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5787     cc=get_reg(branch_regs[i].regmap,CCREG);
5788     assert(cc==HOST_CCREG);
         // do_cc() fills in adj, which is used below to split the cycle-count
         // update around the branch.
5789     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5790     assem_debug("cycle count (adj)\n");
5791     if(1) {
5792       int nottaken=0;
5793       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5794       if(1) {
5795         assert(fs>=0);
             // Bit 0x800000 of FSREG is the branch condition (presumably the
             // FCR31 compare flag); bit 16 of the opcode word selects BC1T.
5796         emit_testimm(fs,0x800000);
5797         if(source[i]&0x10000) // BC1T
5798         {
5799           if(invert){
5800             nottaken=(int)out;
5801             emit_jeq(1);
5802           }else{
5803             add_to_linker((int)out,ba[i],internal);
5804             emit_jne(0);
5805           }
5806         }
5807         else // BC1F
             // NOTE(review): this else binds to the if/else statement that
             // follows; the brace pair after it is an empty block.
5808           if(invert){
5809             nottaken=(int)out;
5810             emit_jne(1);
5811           }else{
5812             add_to_linker((int)out,ba[i],internal);
5813             emit_jeq(0);
5814           }
5815         {
5816         }
5817       } // if(1): condition test
5818           
5819       if(invert) {
             // Taken path of the inverted form: undo adj, bring registers into
             // the state used for ba[i] (store_regs_bt/load_regs_bt), then jump.
5820         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5821         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5822         else if(match) emit_addnop(13);
5823         #endif
5824         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5825         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5826         if(internal)
5827           assem_debug("branch: internal\n");
5828         else
5829           assem_debug("branch: external\n");
5830         if(internal&&is_ds[(ba[i]-start)>>2]) {
5831           ds_assemble_entry(i);
5832         }
5833         else {
5834           add_to_linker((int)out,ba[i],internal);
5835           emit_jmp(0);
5836         }
             // The skipping conditional jump emitted above lands here.
5837         set_jump_target(nottaken,(int)out);
5838       }
5839
           // Non-inverted fall-through: add back the adj cycles that were
           // left out before the conditional jump.
5840       if(adj) {
5841         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5842       }
5843     } // if(1)
5844   } // if(ooo)
5845   else
5846   {
5847     // In-order execution (branch first)
5848     //printf("IOE\n");
5849     int nottaken=0;
5850     if(1) {
5851       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5852       if(1) {
5853         assert(fs>=0);
             // Test the condition bit and jump to the not-taken path (patched
             // by set_jump_target below) when the branch condition fails.
5854         emit_testimm(fs,0x800000);
5855         if(source[i]&0x10000) // BC1T
5856         {
5857           nottaken=(int)out;
5858           emit_jeq(1);
5859         }
5860         else // BC1F
5861         {
5862           nottaken=(int)out;
5863           emit_jne(1);
5864         }
5865       }
5866     } // if(1)
5867     int adj;
         // Masks for wb_invalidate(): start from branch_regs[i].u/.uu and clear
         // the bits of the delay slot's source registers.  If the rt1[i+1] bit
         // is clear in the upper mask, the dep1/dep2 bits are cleared as well.
5868     uint64_t ds_unneeded=branch_regs[i].u;
5869     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5870     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5871     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5872     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5873     ds_unneeded|=1;
5874     ds_unneeded_upper|=1;
5875     // branch taken
5876     //assem_debug("1:\n");
5877     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5878                   ds_unneeded,ds_unneeded_upper);
5879     // load regs
5880     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5881     address_generation(i+1,&branch_regs[i],0);
5882     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5883     ds_assemble(i+1,&branch_regs[i]);
5884     cc=get_reg(branch_regs[i].regmap,CCREG);
5885     if(cc==-1) {
5886       emit_loadreg(CCREG,cc=HOST_CCREG);
5887       // CHECK: Is the following instruction (fall thru) allocated ok?
5888     }
5889     assert(cc==HOST_CCREG);
5890     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5891     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5892     assem_debug("cycle count (adj)\n");
5893     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5894     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5895     if(internal)
5896       assem_debug("branch: internal\n");
5897     else
5898       assem_debug("branch: external\n");
5899     if(internal&&is_ds[(ba[i]-start)>>2]) {
5900       ds_assemble_entry(i);
5901     }
5902     else {
5903       add_to_linker((int)out,ba[i],internal);
5904       emit_jmp(0);
5905     }
5906
5907     // branch not taken
5908     if(1) { // <- FIXME (don't need this)
5909       set_jump_target(nottaken,(int)out);
5910       assem_debug("1:\n");
           // A "likely" branch skips its delay slot when not taken, so the
           // delay slot is only assembled here for ordinary branches.
5911       if(!likely[i]) {
5912         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5913                       ds_unneeded,ds_unneeded_upper);
5914         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5915         address_generation(i+1,&branch_regs[i],0);
5916         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5917         ds_assemble(i+1,&branch_regs[i]);
5918       }
5919       cc=get_reg(branch_regs[i].regmap,CCREG);
5920       if(cc==-1&&!likely[i]) {
5921         // Cycle count isn't in a register, temporarily load it then write it out
5922         emit_loadreg(CCREG,HOST_CCREG);
5923         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5924         int jaddr=(int)out;
5925         emit_jns(0);
5926         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5927         emit_storereg(CCREG,HOST_CCREG);
5928       }
5929       else{
             // Cycle count is taken from the pre-branch register map; the stub
             // is tagged NULLDS for likely branches and NOTTAKEN otherwise.
5930         cc=get_reg(i_regmap,CCREG);
5931         assert(cc==HOST_CCREG);
5932         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5933         int jaddr=(int)out;
5934         emit_jns(0);
5935         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5936       }
5937     }
5938   }
5939 }
5940
// pagespan_assemble: emit host code for the branch/jump at instruction index
// i without assembling its delay slot in this block.
//
//   i      - index of the branch inside the block currently being compiled
//   i_regs - register state for the branch; its regmap and is32 mask are read
//
// The function computes the address execution should continue at after the
// delay slot into a scratch host register, moves it to HOST_BTREG, writes back
// dirty registers and jumps to the external entry start+i*4+5, i.e. the
// address of the delay slot plus 1 (pagespan_ds() registers its entry the
// same way, at start+1).  For "likely" branches a second exit to start+i*4+8
// is emitted for the not-taken case, which skips the delay slot.
// BLTZ/BGEZ (opcode 1), BLEZL and BGTZL are rejected by assertions.
5941 static void pagespan_assemble(int i,struct regstat *i_regs)
5942 {
       // Host registers holding the low (l) and high (h, guest register |64)
       // halves of the two source operands; negative when not mapped.
5943   int s1l=get_reg(i_regs->regmap,rs1[i]);
5944   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5945   int s2l=get_reg(i_regs->regmap,rs2[i]);
5946   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
       // NOTE(review): nt_branch is not referenced anywhere else in this function.
5947   void *nt_branch=NULL;
5948   int taken=0;
5949   int nottaken=0;
5950   int unconditional=0;
       // If one operand is r0, keep the other one as s1 and drop s2 so that
       // the comparisons below use a test against zero instead of a compare.
5951   if(rs1[i]==0)
5952   {
5953     s1l=s2l;s1h=s2h;
5954     s2l=s2h=-1;
5955   }
5956   else if(rs2[i]==0)
5957   {
5958     s2l=s2h=-1;
5959   }
       // Both operands flagged in is32: skip the high-half comparison.
5960   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5961     s1h=s2h=-1;
5962   }
       // Pick scratch host registers: addr (result), alt and, for BLEZ/BGTZ,
       // ntaddr.  Candidates must not be EXCLUDE_REG or HOST_CCREG and must not
       // hold rs1/rs2; alt and ntaddr additionally avoid HOST_BTREG.
       // NOTE(review): addr/alt/ntaddr are only assigned when a loop finds a
       // register, and the single assert(hr<HOST_REGS) below is the only guard.
       // Because alt is taken with hr++, that assert would also fire if alt
       // were found in the last host register - confirm this cannot happen.
5963   int hr=0;
5964   int addr,alt,ntaddr;
5965   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5966   else {
5967     while(hr<HOST_REGS)
5968     {
5969       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5970          (i_regs->regmap[hr]&63)!=rs1[i] &&
5971          (i_regs->regmap[hr]&63)!=rs2[i] )
5972       {
5973         addr=hr++;break;
5974       }
5975       hr++;
5976     }
5977   }
5978   while(hr<HOST_REGS)
5979   {
5980     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5981        (i_regs->regmap[hr]&63)!=rs1[i] &&
5982        (i_regs->regmap[hr]&63)!=rs2[i] )
5983     {
5984       alt=hr++;break;
5985     }
5986     hr++;
5987   }
5988   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5989   {
5990     while(hr<HOST_REGS)
5991     {
5992       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5993          (i_regs->regmap[hr]&63)!=rs1[i] &&
5994          (i_regs->regmap[hr]&63)!=rs2[i] )
5995       {
5996         ntaddr=hr;break;
5997       }
5998       hr++;
5999     }
6000   }
6001   assert(hr<HOST_REGS);
6002   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6003     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6004   }
       // Account for the cycles of the branch in the cycle counter register.
6005   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6006   if(opcode[i]==2) // J
6007   {
6008     unconditional=1;
6009   }
6010   if(opcode[i]==3) // JAL
6011   {
6012     // TODO: mini_ht
         // Load the return address (start+i*4+8) into the host register
         // mapped to guest register 31.
6013     int rt=get_reg(i_regs->regmap,31);
6014     emit_movimm(start+i*4+8,rt);
6015     unconditional=1;
6016   }
6017   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6018   {
         // The jump target is the value of the source register.
6019     emit_mov(s1l,addr);
6020     if(opcode2[i]==9) // JALR
6021     {
6022       int rt=get_reg(i_regs->regmap,31);
6023       emit_movimm(start+i*4+8,rt);
6024     }
6025   }
       // For BEQ/BNE/BLEZ/BGTZ and BC1F/BC1T the code below leaves either ba[i]
       // (taken) or start+i*4+8 (not taken) in addr by means of conditional
       // moves, so no conditional jump is needed.
6026   if((opcode[i]&0x3f)==4) // BEQ
6027   {
6028     if(rs1[i]==rs2[i])
6029     {
           // Comparing a register with itself: always taken.
6030       unconditional=1;
6031     }
6032     else
6033     #ifdef HAVE_CMOV_IMM
6034     if(s1h<0) {
6035       if(s2l>=0) emit_cmp(s1l,s2l);
6036       else emit_test(s1l,s1l);
6037       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6038     }
6039     else
6040     #endif
6041     {
6042       assert(s1l>=0);
           // addr=ba[i], alt=start+i*4+8; a mismatch in either half replaces
           // addr with the not-taken address.
6043       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6044       if(s1h>=0) {
6045         if(s2h>=0) emit_cmp(s1h,s2h);
6046         else emit_test(s1h,s1h);
6047         emit_cmovne_reg(alt,addr);
6048       }
6049       if(s2l>=0) emit_cmp(s1l,s2l);
6050       else emit_test(s1l,s1l);
6051       emit_cmovne_reg(alt,addr);
6052     }
6053   }
6054   if((opcode[i]&0x3f)==5) // BNE
6055   {
6056     #ifdef HAVE_CMOV_IMM
6057     if(s1h<0) {
6058       if(s2l>=0) emit_cmp(s1l,s2l);
6059       else emit_test(s1l,s1l);
6060       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6061     }
6062     else
6063     #endif
6064     {
6065       assert(s1l>=0);
           // addr=start+i*4+8, alt=ba[i]; a mismatch in either half replaces
           // addr with the branch target.
6066       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6067       if(s1h>=0) {
6068         if(s2h>=0) emit_cmp(s1h,s2h);
6069         else emit_test(s1h,s1h);
6070         emit_cmovne_reg(alt,addr);
6071       }
6072       if(s2l>=0) emit_cmp(s1l,s2l);
6073       else emit_test(s1l,s1l);
6074       emit_cmovne_reg(alt,addr);
6075     }
6076   }
       // The "likely" forms use real conditional jumps: nottaken records the
       // jump that is patched to the not-taken exit at the end of the function.
6077   if((opcode[i]&0x3f)==0x14) // BEQL
6078   {
6079     if(s1h>=0) {
6080       if(s2h>=0) emit_cmp(s1h,s2h);
6081       else emit_test(s1h,s1h);
6082       nottaken=(int)out;
6083       emit_jne(0);
6084     }
6085     if(s2l>=0) emit_cmp(s1l,s2l);
6086     else emit_test(s1l,s1l);
         // The high-half jump (if any) is pointed at the low-half jump emitted
         // next, which becomes the recorded not-taken jump.
6087     if(nottaken) set_jump_target(nottaken,(int)out);
6088     nottaken=(int)out;
6089     emit_jne(0);
6090   }
6091   if((opcode[i]&0x3f)==0x15) // BNEL
6092   {
6093     if(s1h>=0) {
6094       if(s2h>=0) emit_cmp(s1h,s2h);
6095       else emit_test(s1h,s1h);
           // High halves differ: taken, jump past the low-half check.
6096       taken=(int)out;
6097       emit_jne(0);
6098     }
6099     if(s2l>=0) emit_cmp(s1l,s2l);
6100     else emit_test(s1l,s1l);
6101     nottaken=(int)out;
6102     emit_jeq(0);
6103     if(taken) set_jump_target(taken,(int)out);
6104   }
6105   if((opcode[i]&0x3f)==6) // BLEZ
6106   {
         // alt=ba[i], addr=start+i*4+8; ntaddr keeps a copy of the not-taken
         // address when the high half has to be examined as well.
6107     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6108     emit_cmpimm(s1l,1);
6109     if(s1h>=0) emit_mov(addr,ntaddr);
6110     emit_cmovl_reg(alt,addr);
6111     if(s1h>=0) {
6112       emit_test(s1h,s1h);
6113       emit_cmovne_reg(ntaddr,addr);
6114       emit_cmovs_reg(alt,addr);
6115     }
6116   }
6117   if((opcode[i]&0x3f)==7) // BGTZ
6118   {
         // addr=ba[i], ntaddr=start+i*4+8; alt keeps a copy of the taken
         // address when the high half has to be examined as well.
6119     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6120     emit_cmpimm(s1l,1);
6121     if(s1h>=0) emit_mov(addr,alt);
6122     emit_cmovl_reg(ntaddr,addr);
6123     if(s1h>=0) {
6124       emit_test(s1h,s1h);
6125       emit_cmovne_reg(alt,addr);
6126       emit_cmovs_reg(ntaddr,addr);
6127     }
6128   }
       // The next two assertions always fail when reached: BLEZL and BGTZL are
       // not handled by this function.
6129   if((opcode[i]&0x3f)==0x16) // BLEZL
6130   {
6131     assert((opcode[i]&0x3f)!=0x16);
6132   }
6133   if((opcode[i]&0x3f)==0x17) // BGTZL
6134   {
6135     assert((opcode[i]&0x3f)!=0x17);
6136   }
6137   assert(opcode[i]!=1); // BLTZ/BGEZ
6138
6139   //FIXME: Check CSREG
       // COP1 condition branches: s1l is tested for bit 0x800000 (presumably it
       // holds the FP status register for these instructions - confirm), and
       // bits 16-17 of the opcode word select BC1F/BC1T/BC1FL/BC1TL.
6140   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6141     if((source[i]&0x30000)==0) // BC1F
6142     {
6143       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6144       emit_testimm(s1l,0x800000);
6145       emit_cmovne_reg(alt,addr);
6146     }
6147     if((source[i]&0x30000)==0x10000) // BC1T
6148     {
6149       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6150       emit_testimm(s1l,0x800000);
6151       emit_cmovne_reg(alt,addr);
6152     }
6153     if((source[i]&0x30000)==0x20000) // BC1FL
6154     {
6155       emit_testimm(s1l,0x800000);
6156       nottaken=(int)out;
6157       emit_jne(0);
6158     }
6159     if((source[i]&0x30000)==0x30000) // BC1TL
6160     {
6161       emit_testimm(s1l,0x800000);
6162       nottaken=(int)out;
6163       emit_jeq(0);
6164     }
6165   }
6166
6167   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6168   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
       // Place the continuation address in HOST_BTREG: the constant target for
       // likely/unconditional branches, otherwise the value computed in addr.
6169   if(likely[i]||unconditional)
6170   {
6171     emit_movimm(ba[i],HOST_BTREG);
6172   }
6173   else if(addr!=HOST_BTREG)
6174   {
6175     emit_mov(addr,HOST_BTREG);
6176   }
       // Jump to the delay slot entry (start+i*4+5).  The jump goes directly to
       // already compiled code when check_addr() finds some, otherwise to the
       // external-jump stub emitted right behind it.
6177   void *branch_addr=out;
6178   emit_jmp(0);
6179   int target_addr=start+i*4+5;
6180   void *stub=out;
6181   void *compiled_target_addr=check_addr(target_addr);
6182   emit_extjump_ds((int)branch_addr,target_addr);
6183   if(compiled_target_addr) {
6184     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6185     add_link(target_addr,stub);
6186   }
6187   else set_jump_target((int)branch_addr,(int)stub);
6188   if(likely[i]) {
6189     // Not-taken path
         // The delay slot is skipped: continue at start+i*4+8.  The locals
         // declared here shadow the same-named ones of the taken exit above.
6190     set_jump_target((int)nottaken,(int)out);
6191     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6192     void *branch_addr=out;
6193     emit_jmp(0);
6194     int target_addr=start+i*4+8;
6195     void *stub=out;
6196     void *compiled_target_addr=check_addr(target_addr);
6197     emit_extjump_ds((int)branch_addr,target_addr);
6198     if(compiled_target_addr) {
6199       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6200       add_link(target_addr,stub);
6201     }
6202     else set_jump_target((int)branch_addr,(int)stub);
6203   }
6204 }
6205
6206 // Assemble the delay slot for the above
// pagespan_ds: emit the entry used when this block is entered at its first
// instruction acting as the delay slot of a branch compiled by
// pagespan_assemble().  Takes no arguments and returns nothing.
//
// The entry is registered under the address start+1.  After instruction 0 has
// been assembled, the branch target (BTREG, or the branch_target variable) is
// compared with start+4: when equal, execution continues with the code that
// follows in this block; otherwise control leaves through
// jump_vaddr_reg[btaddr].
6207 static void pagespan_ds()
6208 {
6209   assem_debug("initial delay slot:\n");
       // Register two entry points for vaddr: one in jump_dirty ahead of the
       // dirty stub and one in jump_in right after it.
6210   u_int vaddr=start+1;
6211   u_int page=get_page(vaddr);
6212   u_int vpage=get_vpage(vaddr);
6213   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6214   do_dirty_stub_ds();
6215   ll_add(jump_in+page,vaddr,(void *)out);
6216   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
       // If instruction 0 does not keep CCREG/BTREG in their host registers,
       // save them first: CCREG through wb_register(), the branch target into
       // the branch_target variable.
6217   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6218     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6219   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6220     emit_writeword(HOST_BTREG,(int)&branch_target);
6221   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6222   address_generation(0,&regs[0],regs[0].regmap_entry);
       // Stores also need INVCP loaded; (opcode&0x3b)==0x39 matches opcodes
       // 0x39 and 0x3d (presumably SWC1/SDC1 - confirm).
6223   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6224     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6225   cop1_usable=0;
6226   is_delayslot=0;
       // Dispatch on the instruction type of instruction 0.
       // NOTE(review): there is no default case, so any type not listed here
       // emits nothing; the jump-like types only print a warning.
6227   switch(itype[0]) {
6228     case ALU:
6229       alu_assemble(0,&regs[0]);break;
6230     case IMM16:
6231       imm16_assemble(0,&regs[0]);break;
6232     case SHIFT:
6233       shift_assemble(0,&regs[0]);break;
6234     case SHIFTIMM:
6235       shiftimm_assemble(0,&regs[0]);break;
6236     case LOAD:
6237       load_assemble(0,&regs[0]);break;
6238     case LOADLR:
6239       loadlr_assemble(0,&regs[0]);break;
6240     case STORE:
6241       store_assemble(0,&regs[0]);break;
6242     case STORELR:
6243       storelr_assemble(0,&regs[0]);break;
6244     case COP0:
6245       cop0_assemble(0,&regs[0]);break;
6246     case COP1:
6247       cop1_assemble(0,&regs[0]);break;
6248     case C1LS:
6249       c1ls_assemble(0,&regs[0]);break;
6250     case FCONV:
6251       fconv_assemble(0,&regs[0]);break;
6252     case FLOAT:
6253       float_assemble(0,&regs[0]);break;
6254     case FCOMP:
6255       fcomp_assemble(0,&regs[0]);break;
6256     case MULTDIV:
6257       multdiv_assemble(0,&regs[0]);break;
6258     case MOV:
6259       mov_assemble(0,&regs[0]);break;
6260     case SYSCALL:
6261     case SPAN:
6262     case UJUMP:
6263     case RJUMP:
6264     case CJUMP:
6265     case SJUMP:
6266     case FJUMP:
6267       printf("Jump in the delay slot.  This is probably a bug.\n");
6268   }
       // Find the host register holding the branch target.  If BTREG is not
       // mapped, take a register whose map entry is -1 and reload the target
       // from branch_target.
       // NOTE(review): the result of get_reg(...,-1) is not checked before use.
6269   int btaddr=get_reg(regs[0].regmap,BTREG);
6270   if(btaddr<0) {
6271     btaddr=get_reg(regs[0].regmap,-1);
6272     emit_readword((int)&branch_target,btaddr);
6273   }
6274   assert(btaddr!=HOST_CCREG);
6275   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
       // Compare the branch target with the address of the next instruction in
       // this block; the HOST_IMM8 variant goes through HOST_TEMPREG.
6276 #ifdef HOST_IMM8
6277   emit_movimm(start+4,HOST_TEMPREG);
6278   emit_cmp(btaddr,HOST_TEMPREG);
6279 #else
6280   emit_cmpimm(btaddr,start+4);
6281 #endif
6282   int branch=(int)out;
6283   emit_jeq(0);
       // Target differs from start+4: store registers (target given as -1) and
       // leave through the jump_vaddr_reg entry for btaddr.
6284   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6285   emit_jmp(jump_vaddr_reg[btaddr]);
       // Target equals start+4: the conditional jump above lands here and
       // registers are adjusted for continuing at start+4.
6286   set_jump_target(branch,(int)out);
6287   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6288   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6289 }
6290
6291 // Basic liveness analysis for MIPS registers
6292 void unneeded_registers(int istart,int iend,int r)
6293 {
6294   int i;
6295   uint64_t u,uu,b,bu;
6296   uint64_t temp_u,temp_uu;
6297   uint64_t tdep;
6298   if(iend==slen-1) {
6299     u=1;uu=1;
6300   }else{
6301     u=unneeded_reg[iend+1];
6302     uu=unneeded_reg_upper[iend+1];
6303     u=1;uu=1;
6304   }
6305   for (i=iend;i>=istart;i--)
6306   {
6307     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6308     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6309     {
6310       // If subroutine call, flag return address as a possible branch target
6311       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6312       
6313       if(ba[i]<start || ba[i]>=(start+slen*4))
6314       {
6315         // Branch out of this block, flush all regs
6316         u=1;
6317         uu=1;
6318         /* Hexagon hack 
6319         if(itype[i]==UJUMP&&rt1[i]==31)
6320         {
6321           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6322         }
6323         if(itype[i]==RJUMP&&rs1[i]==31)
6324         {
6325           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6326         }
6327         if(start>0x80000400&&start<0x80800000) {
6328           if(itype[i]==UJUMP&&rt1[i]==31)
6329           {
6330             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6331             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6332           }
6333           if(itype[i]==RJUMP&&rs1[i]==31)
6334           {
6335             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6336             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6337           }
6338         }*/
6339         branch_unneeded_reg[i]=u;
6340         branch_unneeded_reg_upper[i]=uu;
6341         // Merge in delay slot
6342         tdep=(~uu>>rt1[i+1])&1;
6343         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6344         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6345         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6346         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6347         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6348         u|=1;uu|=1;
6349         // If branch is "likely" (and conditional)
6350         // then we skip the delay slot on the fall-thru path
6351         if(likely[i]) {
6352           if(i<slen-1) {
6353             u&=unneeded_reg[i+2];
6354             uu&=unneeded_reg_upper[i+2];
6355           }
6356           else
6357           {
6358             u=1;
6359             uu=1;
6360           }
6361         }
6362       }
6363       else
6364       {
6365         // Internal branch, flag target
6366         bt[(ba[i]-start)>>2]=1;
6367         if(ba[i]<=start+i*4) {
6368           // Backward branch
6369           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6370           {
6371             // Unconditional branch
6372             temp_u=1;temp_uu=1;
6373           } else {
6374             // Conditional branch (not taken case)
6375             temp_u=unneeded_reg[i+2];
6376             temp_uu=unneeded_reg_upper[i+2];
6377           }
6378           // Merge in delay slot
6379           tdep=(~temp_uu>>rt1[i+1])&1;
6380           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6381           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6382           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6383           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6384           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6385           temp_u|=1;temp_uu|=1;
6386           // If branch is "likely" (and conditional)
6387           // then we skip the delay slot on the fall-thru path
6388           if(likely[i]) {
6389             if(i<slen-1) {
6390               temp_u&=unneeded_reg[i+2];
6391               temp_uu&=unneeded_reg_upper[i+2];
6392             }
6393             else
6394             {
6395               temp_u=1;
6396               temp_uu=1;
6397             }
6398           }
6399           tdep=(~temp_uu>>rt1[i])&1;
6400           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6401           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6402           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6403           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6404           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6405           temp_u|=1;temp_uu|=1;
6406           unneeded_reg[i]=temp_u;
6407           unneeded_reg_upper[i]=temp_uu;
6408           // Only go three levels deep.  This recursion can take an
6409           // excessive amount of time if there are a lot of nested loops.
6410           if(r<2) {
6411             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6412           }else{
6413             unneeded_reg[(ba[i]-start)>>2]=1;
6414             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6415           }
6416         } /*else*/ if(1) {
6417           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6418           {
6419             // Unconditional branch
6420             u=unneeded_reg[(ba[i]-start)>>2];
6421             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6422             branch_unneeded_reg[i]=u;
6423             branch_unneeded_reg_upper[i]=uu;
6424         //u=1;
6425         //uu=1;
6426         //branch_unneeded_reg[i]=u;
6427         //branch_unneeded_reg_upper[i]=uu;
6428             // Merge in delay slot
6429             tdep=(~uu>>rt1[i+1])&1;
6430             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6431             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6432             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6433             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6434             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6435             u|=1;uu|=1;
6436           } else {
6437             // Conditional branch
6438             b=unneeded_reg[(ba[i]-start)>>2];
6439             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6440             branch_unneeded_reg[i]=b;
6441             branch_unneeded_reg_upper[i]=bu;
6442         //b=1;
6443         //bu=1;
6444         //branch_unneeded_reg[i]=b;
6445         //branch_unneeded_reg_upper[i]=bu;
6446             // Branch delay slot
6447             tdep=(~uu>>rt1[i+1])&1;
6448             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6449             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6450             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6451             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6452             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6453             b|=1;bu|=1;
6454             // If branch is "likely" then we skip the
6455             // delay slot on the fall-thru path
6456             if(likely[i]) {
6457               u=b;
6458               uu=bu;
6459               if(i<slen-1) {
6460                 u&=unneeded_reg[i+2];
6461                 uu&=unneeded_reg_upper[i+2];
6462         //u=1;
6463         //uu=1;
6464               }
6465             } else {
6466               u&=b;
6467               uu&=bu;
6468         //u=1;
6469         //uu=1;
6470             }
6471             if(i<slen-1) {
6472               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6473               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6474         //branch_unneeded_reg[i]=1;
6475         //branch_unneeded_reg_upper[i]=1;
6476             } else {
6477               branch_unneeded_reg[i]=1;
6478               branch_unneeded_reg_upper[i]=1;
6479             }
6480           }
6481         }
6482       }
6483     }
6484     else if(itype[i]==SYSCALL)
6485     {
6486       // SYSCALL instruction (software interrupt)
6487       u=1;
6488       uu=1;
6489     }
6490     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6491     {
6492       // ERET instruction (return from interrupt)
6493       u=1;
6494       uu=1;
6495     }
6496     //u=uu=1; // DEBUG
6497     tdep=(~uu>>rt1[i])&1;
6498     // Written registers are unneeded
6499     u|=1LL<<rt1[i];
6500     u|=1LL<<rt2[i];
6501     uu|=1LL<<rt1[i];
6502     uu|=1LL<<rt2[i];
6503     // Accessed registers are needed
6504     u&=~(1LL<<rs1[i]);
6505     u&=~(1LL<<rs2[i]);
6506     uu&=~(1LL<<us1[i]);
6507     uu&=~(1LL<<us2[i]);
6508     // Source-target dependencies
6509     uu&=~(tdep<<dep1[i]);
6510     uu&=~(tdep<<dep2[i]);
6511     // R0 is always unneeded
6512     u|=1;uu|=1;
6513     // Save it
6514     unneeded_reg[i]=u;
6515     unneeded_reg_upper[i]=uu;
6516     /*
6517     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6518     printf("U:");
6519     int r;
6520     for(r=1;r<=CCREG;r++) {
6521       if((unneeded_reg[i]>>r)&1) {
6522         if(r==HIREG) printf(" HI");
6523         else if(r==LOREG) printf(" LO");
6524         else printf(" r%d",r);
6525       }
6526     }
6527     printf(" UU:");
6528     for(r=1;r<=CCREG;r++) {
6529       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6530         if(r==HIREG) printf(" HI");
6531         else if(r==LOREG) printf(" LO");
6532         else printf(" r%d",r);
6533       }
6534     }
6535     printf("\n");*/
6536   }
6537 }
6538
6539 // Identify registers which are likely to contain 32-bit values
6540 // This is used to predict whether any branches will jump to a
6541 // location with 64-bit values in registers.
     //
     // Makes one forward pass over instructions 0..slen-1, tracking a bitmask
     // (is32) with one bit per guest register: a set bit means the register is
     // predicted to hold a 32-bit value.  Bit 0 (r0) is forced on after every
     // instruction.  The mask computed for instruction i is stored in p32[i].
     // Takes no arguments and returns nothing; it reads the per-instruction
     // decode arrays (itype, opcode, opcode2, rs1, rs2, rt1, likely, bt, ba,
     // source) and writes only p32[].
6542 static void provisional_32bit()
6543 {
6544   int i,j;
6545   uint64_t is32=1; // current prediction; initially only r0 is marked
6546   uint64_t lastbranch=1; // is32 as saved at the most recent branch, before its delay slot was merged
6547   
6548   for(i=0;i<slen;i++)
6549   {
       // Instruction i is the delay slot of a conditional branch.  The
       // iteration for the branch itself already applied this slot's effect
       // to is32, so rewind to the state saved at the branch.
6550     if(i>0) {
6551       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6552         if(i>1) is32=lastbranch;
6553         else is32=1;
6554       }
6555     }
6556     if(i>1)
6557     {
         // First instruction after the delay slot of a "likely" conditional
         // branch: rewind to the state saved at the branch once more, so the
         // delay slot's effect is not carried onto this path.
6558       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6559         if(likely[i-2]) {
6560           if(i>2) is32=lastbranch;
6561           else is32=1;
6562         }
6563       }
         // Two instructions after a BNE/BNEL that compared a register with r0:
         // mark the non-r0 operand as 32-bit (presumably because falling
         // through the branch means it compared equal to zero).
6564       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6565       {
6566         if(rs1[i-2]==0||rs2[i-2]==0)
6567         {
6568           if(rs1[i-2]) {
6569             is32|=1LL<<rs1[i-2];
6570           }
6571           if(rs2[i-2]) {
6572             is32|=1LL<<rs2[i-2];
6573           }
6574         }
6575       }
6576     }
6577     // If something jumps here with 64-bit values
6578     // then promote those registers to 64 bits
6579     if(bt[i])
6580     {
6581       uint64_t temp_is32=is32;
         // Earlier branches that target this instruction: intersect with the
         // prediction already recorded for each of them.
6582       for(j=i-1;j>=0;j--)
6583       {
6584         if(ba[j]==start+i*4) 
6585           //temp_is32&=branch_regs[j].is32;
6586           temp_is32&=p32[j];
6587       }
         // Branches at or after i that target this instruction have no p32
         // entry yet on this pass, so assume nothing beyond r0.
6588       for(j=i;j<slen;j++)
6589       {
6590         if(ba[j]==start+i*4) 
6591           temp_is32=1;
6592       }
6593       is32=temp_is32;
6594     }
6595     int type=itype[i];
6596     int op=opcode[i];
6597     int op2=opcode2[i];
6598     int rt=rt1[i];
6599     int s1=rs1[i];
6600     int s2=rs2[i];
6601     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6602       // Branches don't write registers, consider the delay slot instead.
6603       type=itype[i+1];
6604       op=opcode[i+1];
6605       op2=opcode2[i+1];
6606       rt=rt1[i+1];
6607       s1=rs1[i+1];
6608       s2=rs2[i+1];
6609       lastbranch=is32; // snapshot taken before the delay slot's effect is applied below
6610     }
6611     switch(type) {
6612       case LOAD:
           // Test op, not opcode[i]: when instruction i is a branch, op holds
           // the opcode of the delay-slot instruction that performs the load,
           // whereas opcode[i] would be the branch's own opcode.
6613         if(op==0x27||op==0x37|| // LWU/LD
6614            op==0x1A||op==0x1B) // LDL/LDR
6615           is32&=~(1LL<<rt);
6616         else
6617           is32|=1LL<<rt;
6618         break;
6619       case STORE:
6620       case STORELR:
6621         break;
6622       case LOADLR:
6623         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6624         if(op==0x22) is32|=1LL<<rt; // LWL
6625         break;
6626       case IMM16:
6627         if (op==0x08||op==0x09|| // ADDI/ADDIU
6628             op==0x0a||op==0x0b|| // SLTI/SLTIU
6629             op==0x0c|| // ANDI
6630             op==0x0f)  // LUI
6631         {
6632           is32|=1LL<<rt;
6633         }
6634         if(op==0x18||op==0x19) { // DADDI/DADDIU
6635           is32&=~(1LL<<rt);
6636           //if(imm[i]==0)
6637           //  is32|=((is32>>s1)&1LL)<<rt;
6638         }
6639         if(op==0x0d||op==0x0e) { // ORI/XORI
             // Result inherits the 32-bit status of the source register
6640           uint64_t sr=((is32>>s1)&1LL);
6641           is32&=~(1LL<<rt);
6642           is32|=sr<<rt;
6643         }
6644         break;
6645       case UJUMP:
6646         break;
6647       case RJUMP:
6648         break;
6649       case CJUMP:
6650         break;
6651       case SJUMP:
6652         break;
6653       case FJUMP:
6654         break;
6655       case ALU:
6656         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6657           is32|=1LL<<rt;
6658         }
6659         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6660           is32|=1LL<<rt;
6661         }
6662         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
             // 32-bit only if both sources are 32-bit
6663           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6664           is32&=~(1LL<<rt);
6665           is32|=sr<<rt;
6666         }
6667         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
             // With r0 as one operand the result inherits the other operand's
             // status; with two non-r0 operands it is treated as 64-bit.
6668           if(s1==0&&s2==0) {
6669             is32|=1LL<<rt;
6670           }
6671           else if(s2==0) {
6672             uint64_t sr=((is32>>s1)&1LL);
6673             is32&=~(1LL<<rt);
6674             is32|=sr<<rt;
6675           }
6676           else if(s1==0) {
6677             uint64_t sr=((is32>>s2)&1LL);
6678             is32&=~(1LL<<rt);
6679             is32|=sr<<rt;
6680           }
6681           else {
6682             is32&=~(1LL<<rt);
6683           }
6684         }
6685         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
             // Only "x - r0" inherits x's status; unlike DADD there is no
             // s1==0 shortcut here.
6686           if(s1==0&&s2==0) {
6687             is32|=1LL<<rt;
6688           }
6689           else if(s2==0) {
6690             uint64_t sr=((is32>>s1)&1LL);
6691             is32&=~(1LL<<rt);
6692             is32|=sr<<rt;
6693           }
6694           else {
6695             is32&=~(1LL<<rt);
6696           }
6697         }
6698         break;
6699       case MULTDIV:
6700         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6701           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6702         }
6703         else {
6704           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6705         }
6706         break;
6707       case MOV:
6708         {
             // Destination inherits the source's status
6709           uint64_t sr=((is32>>s1)&1LL);
6710           is32&=~(1LL<<rt);
6711           is32|=sr<<rt;
6712         }
6713         break;
6714       case SHIFT:
6715         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6716         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6717         break;
6718       case SHIFTIMM:
6719         is32|=1LL<<rt;
6720         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
6721         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6722         break;
6723       case COP0:
6724         if(op2==0) is32|=1LL<<rt; // MFC0
6725         break;
6726       case COP1:
6727         if(op2==0) is32|=1LL<<rt; // MFC1
6728         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6729         if(op2==2) is32|=1LL<<rt; // CFC1
6730         break;
6731       case C1LS:
6732         break;
6733       case FLOAT:
6734       case FCONV:
6735         break;
6736       case FCOMP:
6737         break;
6738       case SYSCALL:
6739         break;
6740       default:
6741         break;
6742     }
6743     is32|=1; // r0 is always marked 32-bit
6744     p32[i]=is32;
6745
       // Instruction i is the delay slot of an unconditional jump (0x1000 in
       // the upper halfword encodes BEQ r0,r0, i.e. an always-taken branch).
       // Choose the starting prediction for the instruction that follows.
6746     if(i>0)
6747     {
6748       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6749       {
6750         if(rt1[i-1]==31) // JAL/JALR
6751         {
6752           // Subroutine call will return here, don't alloc any registers
6753           is32=1;
6754         }
6755         else if(i+1<slen)
6756         {
6757           // Internal branch will jump here, match registers to caller
6758           is32=0x3FFFFFFFFLL; // bits 0..33 set
6759         }
6760       }
6761     }
6762   }
6763 }
6764
6765 // Identify registers which may be assumed to contain 32-bit values
6766 // and where optimizations will rely on this.
6767 // This is used to determine whether backward branches can safely
6768 // jump to a location with 64-bit values in registers.
     //
     // Makes one backward pass over instructions slen-1..0, accumulating in
     // r32 a bitmask with one bit per guest register, and stores the result
     // for instruction i in pr32[i].  Takes no arguments and returns nothing.
     // Reads itype, ba, source, likely, rt1/rt2, us1/us2, dep1/dep2,
     // unneeded_reg_upper[] and regs[] (was32, wasdirty, regmap_entry);
     // writes only pr32[].
     //
     // NOTE(review): r32 is declared u_int, yet it is combined with 1LL<<reg
     // masks and with regs[i].was32.  If u_int is 32 bits wide, any bit for a
     // register numbered 32 or above is dropped on assignment -- confirm this
     // is intended.
6769 static void provisional_r32()
6770 {
6771   u_int r32=0;
6772   int i;
6773   
6774   for (i=slen-1;i>=0;i--)
6775   {
6776     int hr; // host register index for the regmap_entry scan at the bottom of the loop
6777     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6778     {
6779       if(ba[i]<start || ba[i]>=(start+slen*4))
6780       {
6781         // Branch out of this block, don't need anything
6782         r32=0;
6783       }
6784       else
6785       {
6786         // Internal branch
6787         // Need whatever matches the target
6788         // (and doesn't get overwritten by the delay slot instruction)
6789         r32=0;
6790         int t=(ba[i]-start)>>2; // instruction index of the branch target
6791         if(ba[i]>start+i*4) {
6792           // Forward branch
             // pr32[t] was already filled in earlier on this backward pass.
             // It is adopted only if every register it names is also set in
             // regs[i].was32; the delay slot's destinations are removed.
6793           //if(!(requires_32bit[t]&~regs[i].was32))
6794           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6795           if(!(pr32[t]&~regs[i].was32))
6796             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6797         }else{
6798           // Backward branch
             // pr32[t] has not been computed yet (t<=i), so the target's
             // was32 bits whose upper halves are still needed are used
             // instead (presumably as a stand-in for pr32[t]).
6799           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6800             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6801         }
6802       }
6803       // Conditional branch may need registers for following instructions
6804       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6805       {
6806         if(i<slen-2) {
             // Add the requirements of the fall-through instruction (i+2),
             // then keep only registers that are 32-bit at the branch.
6807           //r32|=requires_32bit[i+2];
6808           r32|=pr32[i+2];
6809           r32&=regs[i].was32;
6810           // Mark this address as a branch target since it may be called
6811           // upon return from interrupt
6812           //bt[i+2]=1;
6813         }
6814       }
6815       // Merge in delay slot
6816       if(!likely[i]) {
6817         // These are overwritten unless the branch is "likely"
6818         // and the delay slot is nullified if not taken
6819         r32&=~(1LL<<rt1[i+1]);
6820         r32&=~(1LL<<rt2[i+1]);
6821       }
6822       // Assume these are needed (delay slot)
         // Each register is added only if it is set in regs[i].was32.  The
         // dep1/dep2 registers are additionally skipped when their upper
         // half is marked unneeded at the branch.
6823       if(us1[i+1]>0)
6824       {
6825         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6826       }
6827       if(us2[i+1]>0)
6828       {
6829         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6830       }
6831       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6832       {
6833         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6834       }
6835       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6836       {
6837         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6838       }
6839     }
6840     else if(itype[i]==SYSCALL)
6841     {
6842       // SYSCALL instruction (software interrupt)
6843       r32=0;
6844     }
6845     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6846     {
6847       // ERET instruction (return from interrupt)
6848       r32=0;
6849     }
6850     // Check 32 bits
       // Registers written by instruction i are removed; its us1/us2 and
       // dep1/dep2 registers are then added under the same tests used for
       // the delay slot above.
6851     r32&=~(1LL<<rt1[i]);
6852     r32&=~(1LL<<rt2[i]);
6853     if(us1[i]>0)
6854     {
6855       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6856     }
6857     if(us2[i]>0)
6858     {
6859       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6860     }
6861     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6862     {
6863       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6864     }
6865     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6866     {
6867       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6868     }
6869     //requires_32bit[i]=r32;
6870     pr32[i]=r32;
6871     
6872     // Dirty registers which are 32-bit, require 32-bit input
6873     // as they will be written as 32-bit values
       // These bits go into pr32[i] only; r32, which carries over to
       // instruction i-1, is left unchanged.
6874     for(hr=0;hr<HOST_REGS;hr++)
6875     {
6876       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6877         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6878           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6879           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6880           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6881         }
6882       }
6883     }
6884   }
6885 }
6886
6887 // Write back dirty registers as soon as we will no longer modify them,
6888 // so that we don't end up with lots of writes at the branches.
6889 void clean_registers(int istart,int iend,int wr)
6890 {
6891   int i;
6892   int r;
6893   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6894   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6895   if(iend==slen-1) {
6896     will_dirty_i=will_dirty_next=0;
6897     wont_dirty_i=wont_dirty_next=0;
6898   }else{
6899     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6900     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6901   }
6902   for (i=iend;i>=istart;i--)
6903   {
6904     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6905     {
6906       if(ba[i]<start || ba[i]>=(start+slen*4))
6907       {
6908         // Branch out of this block, flush all regs
6909         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6910         {
6911           // Unconditional branch
6912           will_dirty_i=0;
6913           wont_dirty_i=0;
6914           // Merge in delay slot (will dirty)
6915           for(r=0;r<HOST_REGS;r++) {
6916             if(r!=EXCLUDE_REG) {
6917               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6918               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6919               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6920               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6921               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6922               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6923               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6924               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6925               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6926               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6927               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6928               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6929               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6930               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6931             }
6932           }
6933         }
6934         else
6935         {
6936           // Conditional branch
6937           will_dirty_i=0;
6938           wont_dirty_i=wont_dirty_next;
6939           // Merge in delay slot (will dirty)
6940           for(r=0;r<HOST_REGS;r++) {
6941             if(r!=EXCLUDE_REG) {
6942               if(!likely[i]) {
6943                 // Might not dirty if likely branch is not taken
6944                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6945                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6946                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6947                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6948                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6949                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6950                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6951                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6952                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6953                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6954                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6955                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6956                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6957                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6958               }
6959             }
6960           }
6961         }
6962         // Merge in delay slot (wont dirty)
6963         for(r=0;r<HOST_REGS;r++) {
6964           if(r!=EXCLUDE_REG) {
6965             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6966             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6967             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6968             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6969             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6970             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6971             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6972             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6973             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6974             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6975           }
6976         }
6977         if(wr) {
6978           #ifndef DESTRUCTIVE_WRITEBACK
6979           branch_regs[i].dirty&=wont_dirty_i;
6980           #endif
6981           branch_regs[i].dirty|=will_dirty_i;
6982         }
6983       }
6984       else
6985       {
6986         // Internal branch
6987         if(ba[i]<=start+i*4) {
6988           // Backward branch
6989           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6990           {
6991             // Unconditional branch
6992             temp_will_dirty=0;
6993             temp_wont_dirty=0;
6994             // Merge in delay slot (will dirty)
6995             for(r=0;r<HOST_REGS;r++) {
6996               if(r!=EXCLUDE_REG) {
6997                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6998                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6999                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7000                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7001                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7002                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7003                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7004                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7005                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7006                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7007                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7008                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7009                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7010                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7011               }
7012             }
7013           } else {
7014             // Conditional branch (not taken case)
7015             temp_will_dirty=will_dirty_next;
7016             temp_wont_dirty=wont_dirty_next;
7017             // Merge in delay slot (will dirty)
7018             for(r=0;r<HOST_REGS;r++) {
7019               if(r!=EXCLUDE_REG) {
7020                 if(!likely[i]) {
7021                   // Will not dirty if likely branch is not taken
7022                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7023                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7024                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7025                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7026                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7027                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7028                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7029                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7030                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7031                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7032                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7033                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7034                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7035                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7036                 }
7037               }
7038             }
7039           }
7040           // Merge in delay slot (wont dirty)
7041           for(r=0;r<HOST_REGS;r++) {
7042             if(r!=EXCLUDE_REG) {
7043               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7044               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7045               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7046               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7047               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7048               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7049               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7050               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7051               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7052               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7053             }
7054           }
7055           // Deal with changed mappings
7056           if(i<iend) {
7057             for(r=0;r<HOST_REGS;r++) {
7058               if(r!=EXCLUDE_REG) {
7059                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7060                   temp_will_dirty&=~(1<<r);
7061                   temp_wont_dirty&=~(1<<r);
7062                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7063                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7064                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7065                   } else {
7066                     temp_will_dirty|=1<<r;
7067                     temp_wont_dirty|=1<<r;
7068                   }
7069                 }
7070               }
7071             }
7072           }
7073           if(wr) {
7074             will_dirty[i]=temp_will_dirty;
7075             wont_dirty[i]=temp_wont_dirty;
7076             clean_registers((ba[i]-start)>>2,i-1,0);
7077           }else{
7078             // Limit recursion.  It can take an excessive amount
7079             // of time if there are a lot of nested loops.
7080             will_dirty[(ba[i]-start)>>2]=0;
7081             wont_dirty[(ba[i]-start)>>2]=-1;
7082           }
7083         }
7084         /*else*/ if(1)
7085         {
7086           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7087           {
7088             // Unconditional branch
7089             will_dirty_i=0;
7090             wont_dirty_i=0;
7091           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7092             for(r=0;r<HOST_REGS;r++) {
7093               if(r!=EXCLUDE_REG) {
7094                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7095                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7096                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7097                 }
7098               }
7099             }
7100           //}
7101             // Merge in delay slot
7102             for(r=0;r<HOST_REGS;r++) {
7103               if(r!=EXCLUDE_REG) {
7104                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7105                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7106                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7107                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7108                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7109                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7110                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7111                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7112                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7113                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7114                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7115                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7116                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7117                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7118               }
7119             }
7120           } else {
7121             // Conditional branch
7122             will_dirty_i=will_dirty_next;
7123             wont_dirty_i=wont_dirty_next;
7124           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7125             for(r=0;r<HOST_REGS;r++) {
7126               if(r!=EXCLUDE_REG) {
7127                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7128                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7129                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7130                 }
7131                 else
7132                 {
7133                   will_dirty_i&=~(1<<r);
7134                 }
7135                 // Treat delay slot as part of branch too
7136                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7137                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7138                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7139                 }
7140                 else
7141                 {
7142                   will_dirty[i+1]&=~(1<<r);
7143                 }*/
7144               }
7145             }
7146           //}
7147             // Merge in delay slot
7148             for(r=0;r<HOST_REGS;r++) {
7149               if(r!=EXCLUDE_REG) {
7150                 if(!likely[i]) {
7151                   // Might not dirty if likely branch is not taken
7152                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7153                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7154                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7155                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7156                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7157                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7158                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7159                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7160                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7161                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7162                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7163                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7164                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7165                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7166                 }
7167               }
7168             }
7169           }
7170           // Merge in delay slot
7171           for(r=0;r<HOST_REGS;r++) {
7172             if(r!=EXCLUDE_REG) {
7173               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7174               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7175               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7176               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7177               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7178               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7179               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7180               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7181               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7182               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7183             }
7184           }
7185           if(wr) {
7186             #ifndef DESTRUCTIVE_WRITEBACK
7187             branch_regs[i].dirty&=wont_dirty_i;
7188             #endif
7189             branch_regs[i].dirty|=will_dirty_i;
7190           }
7191         }
7192       }
7193     }
7194     else if(itype[i]==SYSCALL)
7195     {
7196       // SYSCALL instruction (software interrupt)
7197       will_dirty_i=0;
7198       wont_dirty_i=0;
7199     }
7200     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7201     {
7202       // ERET instruction (return from interrupt)
7203       will_dirty_i=0;
7204       wont_dirty_i=0;
7205     }
7206     will_dirty_next=will_dirty_i;
7207     wont_dirty_next=wont_dirty_i;
7208     for(r=0;r<HOST_REGS;r++) {
7209       if(r!=EXCLUDE_REG) {
7210         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7211         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7212         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7213         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7214         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7215         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7216         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7217         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7218         if(i>istart) {
7219           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7220           {
7221             // Don't store a register immediately after writing it,
7222             // may prevent dual-issue.
7223             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7224             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7225           }
7226         }
7227       }
7228     }
7229     // Save it
7230     will_dirty[i]=will_dirty_i;
7231     wont_dirty[i]=wont_dirty_i;
7232     // Mark registers that won't be dirtied as not dirty
7233     if(wr) {
7234       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7235       for(r=0;r<HOST_REGS;r++) {
7236         if((will_dirty_i>>r)&1) {
7237           printf(" r%d",r);
7238         }
7239       }
7240       printf("\n");*/
7241
7242       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7243         regs[i].dirty|=will_dirty_i;
7244         #ifndef DESTRUCTIVE_WRITEBACK
7245         regs[i].dirty&=wont_dirty_i;
7246         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7247         {
7248           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7249             for(r=0;r<HOST_REGS;r++) {
7250               if(r!=EXCLUDE_REG) {
7251                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7252                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7253                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7254               }
7255             }
7256           }
7257         }
7258         else
7259         {
7260           if(i<iend) {
7261             for(r=0;r<HOST_REGS;r++) {
7262               if(r!=EXCLUDE_REG) {
7263                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7264                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7265                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/}
7266               }
7267             }
7268           }
7269         }
7270         #endif
7271       //}
7272     }
7273     // Deal with changed mappings
7274     temp_will_dirty=will_dirty_i;
7275     temp_wont_dirty=wont_dirty_i;
7276     for(r=0;r<HOST_REGS;r++) {
7277       if(r!=EXCLUDE_REG) {
7278         int nr;
7279         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7280           if(wr) {
7281             #ifndef DESTRUCTIVE_WRITEBACK
7282             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7283             #endif
7284             regs[i].wasdirty|=will_dirty_i&(1<<r);
7285           }
7286         }
7287         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7288           // Register moved to a different register
7289           will_dirty_i&=~(1<<r);
7290           wont_dirty_i&=~(1<<r);
7291           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7292           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7293           if(wr) {
7294             #ifndef DESTRUCTIVE_WRITEBACK
7295             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7296             #endif
7297             regs[i].wasdirty|=will_dirty_i&(1<<r);
7298           }
7299         }
7300         else {
7301           will_dirty_i&=~(1<<r);
7302           wont_dirty_i&=~(1<<r);
7303           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7304             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7305             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7306           } else {
7307             wont_dirty_i|=1<<r;
7308             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
7309           }
7310         }
7311       }
7312     }
7313   }
7314 }
7315
7316   /* disassembly */
7317 void disassemble_inst(int i)
7318 {
7319     if (bt[i]) printf("*"); else printf(" ");
7320     switch(itype[i]) {
7321       case UJUMP:
7322         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7323       case CJUMP:
7324         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7325       case SJUMP:
7326         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7327       case FJUMP:
7328         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7329       case RJUMP:
7330         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7331       case SPAN:
7332         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7333       case IMM16:
7334         if(opcode[i]==0xf) //LUI
7335           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7336         else
7337           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7338         break;
7339       case LOAD:
7340       case LOADLR:
7341         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7342         break;
7343       case STORE:
7344       case STORELR:
7345         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7346         break;
7347       case ALU:
7348       case SHIFT:
7349         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7350         break;
7351       case MULTDIV:
7352         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7353         break;
7354       case SHIFTIMM:
7355         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7356         break;
7357       case MOV:
7358         if((opcode2[i]&0x1d)==0x10)
7359           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7360         else if((opcode2[i]&0x1d)==0x11)
7361           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7362         else
7363           printf (" %x: %s\n",start+i*4,insn[i]);
7364         break;
7365       case COP0:
7366         if(opcode2[i]==0)
7367           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7368         else if(opcode2[i]==4)
7369           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7370         else printf (" %x: %s\n",start+i*4,insn[i]);
7371         break;
7372       case COP1:
7373         if(opcode2[i]<3)
7374           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7375         else if(opcode2[i]>3)
7376           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7377         else printf (" %x: %s\n",start+i*4,insn[i]);
7378         break;
7379       case C1LS:
7380         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7381         break;
7382       default:
7383         //printf (" %s %8x\n",insn[i],source[i]);
7384         printf (" %x: %s\n",start+i*4,insn[i]);
7385     }
7386 }
7387
7388 void new_dynarec_init()
7389 {
7390   printf("Init new dynarec\n");
7391   out=(u_char *)BASE_ADDR;
7392   if (mmap (out, 1<<TARGET_SIZE_2,
7393             PROT_READ | PROT_WRITE | PROT_EXEC,
7394             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7395             -1, 0) <= 0) {printf("mmap() failed\n");}
7396   rdword=&readmem_dword;
7397   fake_pc.f.r.rs=&readmem_dword;
7398   fake_pc.f.r.rt=&readmem_dword;
7399   fake_pc.f.r.rd=&readmem_dword;
7400   int n;
7401   for(n=0x80000;n<0x80800;n++)
7402     invalid_code[n]=1;
7403   for(n=0;n<65536;n++)
7404     hash_table[n][0]=hash_table[n][2]=-1;
7405   memset(mini_ht,-1,sizeof(mini_ht));
7406   memset(restore_candidate,0,sizeof(restore_candidate));
7407   copy=shadow;
7408   expirep=16384; // Expiry pointer, +2 blocks
7409   pending_exception=0;
7410   literalcount=0;
7411 #ifdef HOST_IMM8
7412   // Copy this into local area so we don't have to put it in every literal pool
7413   invc_ptr=invalid_code;
7414 #endif
7415   stop_after_jal=0;
7416   // TLB
7417   using_tlb=0;
7418   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7419     memory_map[n]=-1;
7420   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7421     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7422   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7423     memory_map[n]=-1;
7424   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7425     writemem[n] = write_nomem_new;
7426     writememb[n] = write_nomemb_new;
7427     writememh[n] = write_nomemh_new;
7428     writememd[n] = write_nomemd_new;
7429     readmem[n] = read_nomem_new;
7430     readmemb[n] = read_nomemb_new;
7431     readmemh[n] = read_nomemh_new;
7432     readmemd[n] = read_nomemd_new;
7433   }
7434   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7435     writemem[n] = write_rdram_new;
7436     writememb[n] = write_rdramb_new;
7437     writememh[n] = write_rdramh_new;
7438     writememd[n] = write_rdramd_new;
7439   }
7440   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7441     writemem[n] = write_nomem_new;
7442     writememb[n] = write_nomemb_new;
7443     writememh[n] = write_nomemh_new;
7444     writememd[n] = write_nomemd_new;
7445     readmem[n] = read_nomem_new;
7446     readmemb[n] = read_nomemb_new;
7447     readmemh[n] = read_nomemh_new;
7448     readmemd[n] = read_nomemd_new;
7449   }
7450   tlb_hacks();
7451   arch_init();
7452 }
7453
7454 void new_dynarec_cleanup()
7455 {
7456   int n;
7457   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7458   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7459   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7460   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7461   #ifdef ROM_COPY
7462   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7463   #endif
7464 }
7465
7466 int new_recompile_block(int addr)
7467 {
7468 /*
7469   if(addr==0x800cd050) {
7470     int block;
7471     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7472     int n;
7473     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7474   }
7475 */
7476   //if(Count==365117028) tracedebug=1;
7477   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7478   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7479   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7480   //if(debug) 
7481   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7482   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7483   /*if(Count>=312978186) {
7484     rlist();
7485   }*/
7486   //rlist();
7487   start = (u_int)addr&~3;
7488   //assert(((u_int)addr&1)==0);
7489   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7490     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7491     pagelimit = 0xa4001000;
7492   }
7493   else if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7494     source = (u_int *)((u_int)rdram+start-0x80000000);
7495     pagelimit = 0x80800000;
7496   }
7497   else if ((signed int)addr >= (signed int)0xC0000000) {
7498     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7499     //if(tlb_LUT_r[start>>12])
7500       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7501     if((signed int)memory_map[start>>12]>=0) {
7502       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7503       pagelimit=(start+4096)&0xFFFFF000;
7504       int map=memory_map[start>>12];
7505       int i;
7506       for(i=0;i<5;i++) {
7507         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7508         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7509       }
7510       assem_debug("pagelimit=%x\n",pagelimit);
7511       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7512     }
7513     else {
7514       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7515       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7516       return 1; // Caller will invoke exception handler
7517     }
7518     //printf("source= %x\n",(int)source);
7519   }
7520   else {
7521     printf("Compile at bogus memory address: %x \n", (int)addr);
7522     exit(1);
7523   }
7524
7525   /* Pass 1: disassemble */
7526   /* Pass 2: register dependencies, branch targets */
7527   /* Pass 3: register allocation */
7528   /* Pass 4: branch dependencies */
7529   /* Pass 5: pre-alloc */
7530   /* Pass 6: optimize clean/dirty state */
7531   /* Pass 7: flag 32-bit registers */
7532   /* Pass 8: assembly */
7533   /* Pass 9: linker */
7534   /* Pass 10: garbage collection / free memory */
7535
7536   int i,j;
7537   int done=0;
7538   unsigned int type,op,op2;
7539
7540   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7541   
7542   /* Pass 1 disassembly */
7543
7544   for(i=0;!done;i++) {
7545     bt[i]=0;likely[i]=0;op2=0;
7546     opcode[i]=op=source[i]>>26;
7547     switch(op)
7548     {
7549       case 0x00: strcpy(insn[i],"special"); type=NI;
7550         op2=source[i]&0x3f;
7551         switch(op2)
7552         {
7553           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7554           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7555           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7556           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7557           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7558           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7559           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7560           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7561           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7562           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7563           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7564           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7565           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7566           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7567           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7568           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7569           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7570           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7571           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7572           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7573           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7574           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7575           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7576           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7577           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7578           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7579           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7580           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7581           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7582           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7583           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7584           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7585           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7586           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7587           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7588           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7589           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7590           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7591           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7592           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7593           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7594           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7595           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7596           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7597           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7598           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7599           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7600           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7601           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7602           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7603           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7604           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7605         }
7606         break;
7607       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7608         op2=(source[i]>>16)&0x1f;
7609         switch(op2)
7610         {
7611           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7612           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7613           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7614           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7615           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7616           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7617           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7618           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7619           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7620           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7621           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7622           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7623           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7624           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7625         }
7626         break;
7627       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7628       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7629       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7630       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7631       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7632       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7633       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7634       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7635       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7636       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7637       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7638       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7639       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7640       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7641       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7642         op2=(source[i]>>21)&0x1f;
7643         switch(op2)
7644         {
7645           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7646           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7647           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7648           switch(source[i]&0x3f)
7649           {
7650             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7651             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7652             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7653             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7654             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7655           }
7656         }
7657         break;
7658       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7659         op2=(source[i]>>21)&0x1f;
7660         switch(op2)
7661         {
7662           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7663           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7664           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7665           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7666           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7667           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7668           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7669           switch((source[i]>>16)&0x3)
7670           {
7671             case 0x00: strcpy(insn[i],"BC1F"); break;
7672             case 0x01: strcpy(insn[i],"BC1T"); break;
7673             case 0x02: strcpy(insn[i],"BC1FL"); break;
7674             case 0x03: strcpy(insn[i],"BC1TL"); break;
7675           }
7676           break;
7677           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7678           switch(source[i]&0x3f)
7679           {
7680             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7681             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7682             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7683             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7684             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7685             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7686             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7687             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7688             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7689             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7690             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7691             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7692             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7693             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7694             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7695             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7696             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7697             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7698             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7699             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7700             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7701             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7702             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7703             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7704             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7705             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7706             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7707             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7708             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7709             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7710             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7711             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7712             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7713             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7714             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7715           }
7716           break;
7717           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7718           switch(source[i]&0x3f)
7719           {
7720             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7721             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7722             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7723             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7724             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7725             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7726             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7727             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7728             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7729             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7730             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7731             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7732             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7733             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7734             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7735             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7736             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7737             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7738             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7739             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7740             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7741             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7742             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7743             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7744             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7745             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7746             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7747             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7748             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7749             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7750             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7751             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7752             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7753             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7754             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7755           }
7756           break;
7757           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7758           switch(source[i]&0x3f)
7759           {
7760             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7761             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7762           }
7763           break;
7764           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7765           switch(source[i]&0x3f)
7766           {
7767             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7768             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7769           }
7770           break;
7771         }
7772         break;
7773       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7774       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7775       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7776       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7777       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7778       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7779       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7780       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7781       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7782       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7783       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7784       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7785       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7786       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7787       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7788       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7789       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7790       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7791       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7792       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7793       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7794       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7795       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7796       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7797       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7798       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7799       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7800       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7801       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7802       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7803       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7804       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7805       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7806       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7807       default: strcpy(insn[i],"???"); type=NI; break;
7808     }
7809     itype[i]=type;
7810     opcode2[i]=op2;
7811     /* Get registers/immediates */
7812     lt1[i]=0;
7813     us1[i]=0;
7814     us2[i]=0;
7815     dep1[i]=0;
7816     dep2[i]=0;
7817     switch(type) {
7818       case LOAD:
7819         rs1[i]=(source[i]>>21)&0x1f;
7820         rs2[i]=0;
7821         rt1[i]=(source[i]>>16)&0x1f;
7822         rt2[i]=0;
7823         imm[i]=(short)source[i];
7824         break;
7825       case STORE:
7826       case STORELR:
7827         rs1[i]=(source[i]>>21)&0x1f;
7828         rs2[i]=(source[i]>>16)&0x1f;
7829         rt1[i]=0;
7830         rt2[i]=0;
7831         imm[i]=(short)source[i];
7832         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7833         break;
7834       case LOADLR:
7835         // LWL/LWR only load part of the register,
7836         // therefore the target register must be treated as a source too
7837         rs1[i]=(source[i]>>21)&0x1f;
7838         rs2[i]=(source[i]>>16)&0x1f;
7839         rt1[i]=(source[i]>>16)&0x1f;
7840         rt2[i]=0;
7841         imm[i]=(short)source[i];
7842         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7843         if(op==0x26) dep1[i]=rt1[i]; // LWR
7844         break;
7845       case IMM16:
7846         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7847         else rs1[i]=(source[i]>>21)&0x1f;
7848         rs2[i]=0;
7849         rt1[i]=(source[i]>>16)&0x1f;
7850         rt2[i]=0;
7851         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7852           imm[i]=(unsigned short)source[i];
7853         }else{
7854           imm[i]=(short)source[i];
7855         }
7856         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7857         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7858         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7859         break;
7860       case UJUMP:
7861         rs1[i]=0;
7862         rs2[i]=0;
7863         rt1[i]=0;
7864         rt2[i]=0;
7865         // The JAL instruction writes to r31.
7866         if (op&1) {
7867           rt1[i]=31;
7868         }
7869         rs2[i]=CCREG;
7870         break;
7871       case RJUMP:
7872         rs1[i]=(source[i]>>21)&0x1f;
7873         rs2[i]=0;
7874         rt1[i]=0;
7875         rt2[i]=0;
7876         // The JALR instruction writes to r31.
7877         if (op2&1) {
7878           rt1[i]=31;   
7879         }
7880         rs2[i]=CCREG;
7881         break;
7882       case CJUMP:
7883         rs1[i]=(source[i]>>21)&0x1f;
7884         rs2[i]=(source[i]>>16)&0x1f;
7885         rt1[i]=0;
7886         rt2[i]=0;
7887         if(op&2) { // BGTZ/BLEZ
7888           rs2[i]=0;
7889         }
7890         us1[i]=rs1[i];
7891         us2[i]=rs2[i];
7892         likely[i]=op>>4;
7893         break;
7894       case SJUMP:
7895         rs1[i]=(source[i]>>21)&0x1f;
7896         rs2[i]=CCREG;
7897         rt1[i]=0;
7898         rt2[i]=0;
7899         us1[i]=rs1[i];
7900         if(op2&0x10) { // BxxAL
7901           rt1[i]=31;
7902           // NOTE: If the branch is not taken, r31 is still overwritten
7903         }
7904         likely[i]=(op2&2)>>1;
7905         break;
7906       case FJUMP:
7907         rs1[i]=FSREG;
7908         rs2[i]=CSREG;
7909         rt1[i]=0;
7910         rt2[i]=0;
7911         likely[i]=((source[i])>>17)&1;
7912         break;
7913       case ALU:
7914         rs1[i]=(source[i]>>21)&0x1f; // source
7915         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7916         rt1[i]=(source[i]>>11)&0x1f; // destination
7917         rt2[i]=0;
7918         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7919           us1[i]=rs1[i];us2[i]=rs2[i];
7920         }
7921         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7922           dep1[i]=rs1[i];dep2[i]=rs2[i];
7923         }
7924         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7925           dep1[i]=rs1[i];dep2[i]=rs2[i];
7926         }
7927         break;
7928       case MULTDIV:
7929         rs1[i]=(source[i]>>21)&0x1f; // source
7930         rs2[i]=(source[i]>>16)&0x1f; // divisor
7931         rt1[i]=HIREG;
7932         rt2[i]=LOREG;
7933         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7934           us1[i]=rs1[i];us2[i]=rs2[i];
7935         }
7936         break;
7937       case MOV:
7938         rs1[i]=0;
7939         rs2[i]=0;
7940         rt1[i]=0;
7941         rt2[i]=0;
7942         if(op2==0x10) rs1[i]=HIREG; // MFHI
7943         if(op2==0x11) rt1[i]=HIREG; // MTHI
7944         if(op2==0x12) rs1[i]=LOREG; // MFLO
7945         if(op2==0x13) rt1[i]=LOREG; // MTLO
7946         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7947         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7948         dep1[i]=rs1[i];
7949         break;
7950       case SHIFT:
7951         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7952         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7953         rt1[i]=(source[i]>>11)&0x1f; // destination
7954         rt2[i]=0;
7955         // DSLLV/DSRLV/DSRAV are 64-bit
7956         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7957         break;
7958       case SHIFTIMM:
7959         rs1[i]=(source[i]>>16)&0x1f;
7960         rs2[i]=0;
7961         rt1[i]=(source[i]>>11)&0x1f;
7962         rt2[i]=0;
7963         imm[i]=(source[i]>>6)&0x1f;
7964         // DSxx32 instructions
7965         if(op2>=0x3c) imm[i]|=0x20;
7966         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7967         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7968         break;
7969       case COP0:
7970         rs1[i]=0;
7971         rs2[i]=0;
7972         rt1[i]=0;
7973         rt2[i]=0;
7974         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7975         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7976         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7977         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7978         break;
7979       case COP1:
7980         rs1[i]=0;
7981         rs2[i]=0;
7982         rt1[i]=0;
7983         rt2[i]=0;
7984         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7985         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7986         if(op2==5) us1[i]=rs1[i]; // DMTC1
7987         rs2[i]=CSREG;
7988         break;
7989       case C1LS:
7990         rs1[i]=(source[i]>>21)&0x1F;
7991         rs2[i]=CSREG;
7992         rt1[i]=0;
7993         rt2[i]=0;
7994         imm[i]=(short)source[i];
7995         break;
7996       case FLOAT:
7997       case FCONV:
7998         rs1[i]=0;
7999         rs2[i]=CSREG;
8000         rt1[i]=0;
8001         rt2[i]=0;
8002         break;
8003       case FCOMP:
8004         rs1[i]=FSREG;
8005         rs2[i]=CSREG;
8006         rt1[i]=FSREG;
8007         rt2[i]=0;
8008         break;
8009       case SYSCALL:
8010         rs1[i]=CCREG;
8011         rs2[i]=0;
8012         rt1[i]=0;
8013         rt2[i]=0;
8014         break;
8015       default:
8016         rs1[i]=0;
8017         rs2[i]=0;
8018         rt1[i]=0;
8019         rt2[i]=0;
8020     }
8021     /* Calculate branch target addresses */
8022     if(type==UJUMP)
8023       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8024     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8025       ba[i]=start+i*4+8; // Ignore never taken branch
8026     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8027       ba[i]=start+i*4+8; // Ignore never taken branch
8028     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8029       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8030     else ba[i]=-1;
8031     /* Is this the end of the block? */
8032     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8033       if(rt1[i-1]!=31) { // Continue past subroutine call (JAL)
8034         done=1;
8035         // Does the block continue due to a branch?
8036         for(j=i-1;j>=0;j--)
8037         {
8038           if(ba[j]==start+i*4+4) done=j=0;
8039           if(ba[j]==start+i*4+8) done=j=0;
8040         }
8041       }
8042       else {
8043         if(stop_after_jal) done=1;
8044         // Stop on BREAK
8045         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8046       }
8047       // Don't recompile stuff that's already compiled
8048       if(check_addr(start+i*4+4)) done=1;
8049       // Don't get too close to the limit
8050       if(i>MAXBLOCK/2) done=1;
8051     }
8052     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8053     assert(i<MAXBLOCK-1);
8054     if(start+i*4==pagelimit-4) done=1;
8055     assert(start+i*4<pagelimit);
8056     if (i==MAXBLOCK-1) done=1;
8057     // Stop if we're compiling junk
8058     if(itype[i]==NI&&opcode[i]==0x11) {
8059       done=stop_after_jal=1;
8060       printf("Disabled speculative precompilation\n");
8061     }
8062   }
8063   slen=i;
8064   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8065     if(start+i*4==pagelimit) {
8066       itype[i-1]=SPAN;
8067     }
8068   }
8069   assert(slen>0);
8070
8071   /* Pass 2 - Register dependencies and branch targets */
8072
8073   unneeded_registers(0,slen-1,0);
8074   
8075   /* Pass 3 - Register allocation */
8076
8077   struct regstat current; // Current register allocations/status
8078   current.is32=1;
8079   current.dirty=0;
8080   current.u=unneeded_reg[0];
8081   current.uu=unneeded_reg_upper[0];
8082   clear_all_regs(current.regmap);
8083   alloc_reg(&current,0,CCREG);
8084   dirty_reg(&current,CCREG);
8085   current.isconst=0;
8086   current.wasconst=0;
8087   int ds=0;
8088   int cc=0;
8089   int hr;
8090   
8091   provisional_32bit();
8092   
8093   if((u_int)addr&1) {
8094     // First instruction is delay slot
8095     cc=-1;
8096     bt[1]=1;
8097     ds=1;
8098     unneeded_reg[0]=1;
8099     unneeded_reg_upper[0]=1;
8100     current.regmap[HOST_BTREG]=BTREG;
8101   }
8102   
8103   for(i=0;i<slen;i++)
8104   {
8105     if(bt[i])
8106     {
8107       int hr;
8108       for(hr=0;hr<HOST_REGS;hr++)
8109       {
8110         // Is this really necessary?
8111         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8112       }
8113       current.isconst=0;
8114     }
8115     if(i>1)
8116     {
8117       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8118       {
8119         if(rs1[i-2]==0||rs2[i-2]==0)
8120         {
8121           if(rs1[i-2]) {
8122             current.is32|=1LL<<rs1[i-2];
8123             int hr=get_reg(current.regmap,rs1[i-2]|64);
8124             if(hr>=0) current.regmap[hr]=-1;
8125           }
8126           if(rs2[i-2]) {
8127             current.is32|=1LL<<rs2[i-2];
8128             int hr=get_reg(current.regmap,rs2[i-2]|64);
8129             if(hr>=0) current.regmap[hr]=-1;
8130           }
8131         }
8132       }
8133     }
8134     // If something jumps here with 64-bit values
8135     // then promote those registers to 64 bits
8136     if(bt[i])
8137     {
8138       uint64_t temp_is32=current.is32;
8139       for(j=i-1;j>=0;j--)
8140       {
8141         if(ba[j]==start+i*4) 
8142           temp_is32&=branch_regs[j].is32;
8143       }
8144       for(j=i;j<slen;j++)
8145       {
8146         if(ba[j]==start+i*4) 
8147           //temp_is32=1;
8148           temp_is32&=p32[j];
8149       }
8150       if(temp_is32!=current.is32) {
8151         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8152         #ifdef DESTRUCTIVE_WRITEBACK
8153         for(hr=0;hr<HOST_REGS;hr++)
8154         {
8155           int r=current.regmap[hr];
8156           if(r>0&&r<64)
8157           {
8158             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8159               temp_is32|=1LL<<r;
8160               //printf("restore %d\n",r);
8161             }
8162           }
8163         }
8164         #endif
8165         current.is32=temp_is32;
8166       }
8167     }
8168     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8169     regs[i].wasconst=current.isconst;
8170     regs[i].was32=current.is32;
8171     regs[i].wasdirty=current.dirty;
8172     #ifdef DESTRUCTIVE_WRITEBACK
8173     // To change a dirty register from 32 to 64 bits, we must write
8174     // it out during the previous cycle (for branches, 2 cycles)
8175     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8176     {
8177       uint64_t temp_is32=current.is32;
8178       for(j=i-1;j>=0;j--)
8179       {
8180         if(ba[j]==start+i*4+4) 
8181           temp_is32&=branch_regs[j].is32;
8182       }
8183       for(j=i;j<slen;j++)
8184       {
8185         if(ba[j]==start+i*4+4) 
8186           //temp_is32=1;
8187           temp_is32&=p32[j];
8188       }
8189       if(temp_is32!=current.is32) {
8190         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8191         for(hr=0;hr<HOST_REGS;hr++)
8192         {
8193           int r=current.regmap[hr];
8194           if(r>0)
8195           {
8196             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8197               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8198               {
8199                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8200                 {
8201                   //printf("dump %d/r%d\n",hr,r);
8202                   current.regmap[hr]=-1;
8203                   if(get_reg(current.regmap,r|64)>=0) 
8204                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8205                 }
8206               }
8207             }
8208           }
8209         }
8210       }
8211     }
8212     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8213     {
8214       uint64_t temp_is32=current.is32;
8215       for(j=i-1;j>=0;j--)
8216       {
8217         if(ba[j]==start+i*4+8) 
8218           temp_is32&=branch_regs[j].is32;
8219       }
8220       for(j=i;j<slen;j++)
8221       {
8222         if(ba[j]==start+i*4+8) 
8223           //temp_is32=1;
8224           temp_is32&=p32[j];
8225       }
8226       if(temp_is32!=current.is32) {
8227         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8228         for(hr=0;hr<HOST_REGS;hr++)
8229         {
8230           int r=current.regmap[hr];
8231           if(r>0)
8232           {
8233             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8234               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8235               {
8236                 //printf("dump %d/r%d\n",hr,r);
8237                 current.regmap[hr]=-1;
8238                 if(get_reg(current.regmap,r|64)>=0) 
8239                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8240               }
8241             }
8242           }
8243         }
8244       }
8245     }
8246     #endif
8247     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8248       if(i+1<slen) {
8249         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8250         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8251         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8252         current.u|=1;
8253         current.uu|=1;
8254       } else {
8255         current.u=1;
8256         current.uu=1;
8257       }
8258     } else {
8259       if(i+1<slen) {
8260         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8261         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8262         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8263         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8264         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8265         current.u|=1;
8266         current.uu|=1;
8267       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8268     }
8269     is_ds[i]=ds;
8270     if(ds) {
8271       ds=0; // Skip delay slot, already allocated as part of branch
8272       // ...but we need to alloc it in case something jumps here
8273       if(i+1<slen) {
8274         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8275         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8276       }else{
8277         current.u=branch_unneeded_reg[i-1];
8278         current.uu=branch_unneeded_reg_upper[i-1];
8279       }
8280       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8281       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8282       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8283       current.u|=1;
8284       current.uu|=1;
8285       struct regstat temp;
8286       memcpy(&temp,&current,sizeof(current));
8287       temp.wasdirty=temp.dirty;
8288       temp.was32=temp.is32;
8289       // TODO: Take into account unconditional branches, as below
8290       delayslot_alloc(&temp,i);
8291       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8292       regs[i].wasdirty=temp.wasdirty;
8293       regs[i].was32=temp.was32;
8294       regs[i].dirty=temp.dirty;
8295       regs[i].is32=temp.is32;
8296       regs[i].isconst=0;
8297       regs[i].wasconst=0;
8298       current.isconst=0;
8299       // Create entry (branch target) regmap
8300       for(hr=0;hr<HOST_REGS;hr++)
8301       {
8302         int r=temp.regmap[hr];
8303         if(r>=0) {
8304           if(r!=regmap_pre[i][hr]) {
8305             regs[i].regmap_entry[hr]=-1;
8306           }
8307           else
8308           {
8309             if(r<64){
8310               if((current.u>>r)&1) {
8311                 regs[i].regmap_entry[hr]=-1;
8312                 regs[i].regmap[hr]=-1;
8313                 //Don't clear regs in the delay slot as the branch might need them
8314                 //current.regmap[hr]=-1;
8315               }else
8316                 regs[i].regmap_entry[hr]=r;
8317             }
8318             else {
8319               if((current.uu>>(r&63))&1) {
8320                 regs[i].regmap_entry[hr]=-1;
8321                 regs[i].regmap[hr]=-1;
8322                 //Don't clear regs in the delay slot as the branch might need them
8323                 //current.regmap[hr]=-1;
8324               }else
8325                 regs[i].regmap_entry[hr]=r;
8326             }
8327           }
8328         } else {
8329           // First instruction expects CCREG to be allocated
8330           if(i==0&&hr==HOST_CCREG) 
8331             regs[i].regmap_entry[hr]=CCREG;
8332           else
8333             regs[i].regmap_entry[hr]=-1;
8334         }
8335       }
8336     }
8337     else { // Not delay slot
8338       switch(itype[i]) {
8339         case UJUMP:
8340           //current.isconst=0; // DEBUG
8341           //current.wasconst=0; // DEBUG
8342           //regs[i].wasconst=0; // DEBUG
8343           clear_const(&current,rt1[i]);
8344           alloc_cc(&current,i);
8345           dirty_reg(&current,CCREG);
8346           if (rt1[i]==31) {
8347             alloc_reg(&current,i,31);
8348             dirty_reg(&current,31);
8349             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8350             #ifdef REG_PREFETCH
8351             alloc_reg(&current,i,PTEMP);
8352             #endif
8353             //current.is32|=1LL<<rt1[i];
8354           }
8355           delayslot_alloc(&current,i+1);
8356           //current.isconst=0; // DEBUG
8357           ds=1;
8358           //printf("i=%d, isconst=%x\n",i,current.isconst);
8359           break;
8360         case RJUMP:
8361           //current.isconst=0;
8362           //current.wasconst=0;
8363           //regs[i].wasconst=0;
8364           clear_const(&current,rs1[i]);
8365           clear_const(&current,rt1[i]);
8366           alloc_cc(&current,i);
8367           dirty_reg(&current,CCREG);
8368           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8369             alloc_reg(&current,i,rs1[i]);
8370             if (rt1[i]==31) {
8371               alloc_reg(&current,i,31);
8372               dirty_reg(&current,31);
8373               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8374               #ifdef REG_PREFETCH
8375               alloc_reg(&current,i,PTEMP);
8376               #endif
8377             }
8378             #ifdef USE_MINI_HT
8379             if(rs1[i]==31) { // JALR
8380               alloc_reg(&current,i,RHASH);
8381               #ifndef HOST_IMM_ADDR32
8382               alloc_reg(&current,i,RHTBL);
8383               #endif
8384             }
8385             #endif
8386             delayslot_alloc(&current,i+1);
8387           } else {
8388             // The delay slot overwrites our source register,
8389             // allocate a temporary register to hold the old value.
8390             current.isconst=0;
8391             current.wasconst=0;
8392             regs[i].wasconst=0;
8393             delayslot_alloc(&current,i+1);
8394             current.isconst=0;
8395             alloc_reg(&current,i,RTEMP);
8396           }
8397           //current.isconst=0; // DEBUG
8398           ds=1;
8399           break;
8400         case CJUMP:
8401           //current.isconst=0;
8402           //current.wasconst=0;
8403           //regs[i].wasconst=0;
8404           clear_const(&current,rs1[i]);
8405           clear_const(&current,rs2[i]);
8406           if((opcode[i]&0x3E)==4) // BEQ/BNE
8407           {
8408             alloc_cc(&current,i);
8409             dirty_reg(&current,CCREG);
8410             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8411             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8412             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8413             {
8414               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8415               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8416             }
8417             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8418                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8419               // The delay slot overwrites one of our conditions.
8420               // Allocate the branch condition registers instead.
8421               // Note that such a sequence of instructions could
8422               // be considered a bug since the branch can not be
8423               // re-executed if an exception occurs.
8424               current.isconst=0;
8425               current.wasconst=0;
8426               regs[i].wasconst=0;
8427               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8428               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8429               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8430               {
8431                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8432                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8433               }
8434             }
8435             else delayslot_alloc(&current,i+1);
8436           }
8437           else
8438           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8439           {
8440             alloc_cc(&current,i);
8441             dirty_reg(&current,CCREG);
8442             alloc_reg(&current,i,rs1[i]);
8443             if(!(current.is32>>rs1[i]&1))
8444             {
8445               alloc_reg64(&current,i,rs1[i]);
8446             }
8447             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8448               // The delay slot overwrites one of our conditions.
8449               // Allocate the branch condition registers instead.
8450               // Note that such a sequence of instructions could
8451               // be considered a bug since the branch can not be
8452               // re-executed if an exception occurs.
8453               current.isconst=0;
8454               current.wasconst=0;
8455               regs[i].wasconst=0;
8456               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8457               if(!((current.is32>>rs1[i])&1))
8458               {
8459                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8460               }
8461             }
8462             else delayslot_alloc(&current,i+1);
8463           }
8464           else
8465           // Don't alloc the delay slot yet because we might not execute it
8466           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8467           {
8468             current.isconst=0;
8469             current.wasconst=0;
8470             regs[i].wasconst=0;
8471             alloc_cc(&current,i);
8472             dirty_reg(&current,CCREG);
8473             alloc_reg(&current,i,rs1[i]);
8474             alloc_reg(&current,i,rs2[i]);
8475             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8476             {
8477               alloc_reg64(&current,i,rs1[i]);
8478               alloc_reg64(&current,i,rs2[i]);
8479             }
8480           }
8481           else
8482           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8483           {
8484             current.isconst=0;
8485             current.wasconst=0;
8486             regs[i].wasconst=0;
8487             alloc_cc(&current,i);
8488             dirty_reg(&current,CCREG);
8489             alloc_reg(&current,i,rs1[i]);
8490             if(!(current.is32>>rs1[i]&1))
8491             {
8492               alloc_reg64(&current,i,rs1[i]);
8493             }
8494           }
8495           ds=1;
8496           //current.isconst=0;
8497           break;
8498         case SJUMP:
8499           //current.isconst=0;
8500           //current.wasconst=0;
8501           //regs[i].wasconst=0;
8502           clear_const(&current,rs1[i]);
8503           clear_const(&current,rt1[i]);
8504           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8505           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8506           {
8507             alloc_cc(&current,i);
8508             dirty_reg(&current,CCREG);
8509             alloc_reg(&current,i,rs1[i]);
8510             if(!(current.is32>>rs1[i]&1))
8511             {
8512               alloc_reg64(&current,i,rs1[i]);
8513             }
8514             if (rt1[i]==31) { // BLTZAL/BGEZAL
8515               alloc_reg(&current,i,31);
8516               dirty_reg(&current,31);
8517               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8518               //#ifdef REG_PREFETCH
8519               //alloc_reg(&current,i,PTEMP);
8520               //#endif
8521               //current.is32|=1LL<<rt1[i];
8522             }
8523             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8524               // The delay slot overwrites the branch condition.
8525               // Allocate the branch condition registers instead.
8526               // Note that such a sequence of instructions could
8527               // be considered a bug since the branch can not be
8528               // re-executed if an exception occurs.
8529               current.isconst=0;
8530               current.wasconst=0;
8531               regs[i].wasconst=0;
8532               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8533               if(!((current.is32>>rs1[i])&1))
8534               {
8535                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8536               }
8537             }
8538             else delayslot_alloc(&current,i+1);
8539           }
8540           else
8541           // Don't alloc the delay slot yet because we might not execute it
8542           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8543           {
8544             current.isconst=0;
8545             current.wasconst=0;
8546             regs[i].wasconst=0;
8547             alloc_cc(&current,i);
8548             dirty_reg(&current,CCREG);
8549             alloc_reg(&current,i,rs1[i]);
8550             if(!(current.is32>>rs1[i]&1))
8551             {
8552               alloc_reg64(&current,i,rs1[i]);
8553             }
8554           }
8555           ds=1;
8556           //current.isconst=0;
8557           break;
8558         case FJUMP:
8559           current.isconst=0;
8560           current.wasconst=0;
8561           regs[i].wasconst=0;
8562           if(likely[i]==0) // BC1F/BC1T
8563           {
8564             // TODO: Theoretically we can run out of registers here on x86.
8565             // The delay slot can allocate up to six, and we need to check
8566             // CSREG before executing the delay slot.  Possibly we can drop
8567             // the cycle count and then reload it after checking that the
8568             // FPU is in a usable state, or don't do out-of-order execution.
8569             alloc_cc(&current,i);
8570             dirty_reg(&current,CCREG);
8571             alloc_reg(&current,i,FSREG);
8572             alloc_reg(&current,i,CSREG);
8573             if(itype[i+1]==FCOMP) {
8574               // The delay slot overwrites the branch condition.
8575               // Allocate the branch condition registers instead.
8576               // Note that such a sequence of instructions could
8577               // be considered a bug since the branch can not be
8578               // re-executed if an exception occurs.
8579               alloc_cc(&current,i);
8580               dirty_reg(&current,CCREG);
8581               alloc_reg(&current,i,CSREG);
8582               alloc_reg(&current,i,FSREG);
8583             }
8584             else {
8585               delayslot_alloc(&current,i+1);
8586               alloc_reg(&current,i+1,CSREG);
8587             }
8588           }
8589           else
8590           // Don't alloc the delay slot yet because we might not execute it
8591           if(likely[i]) // BC1FL/BC1TL
8592           {
8593             alloc_cc(&current,i);
8594             dirty_reg(&current,CCREG);
8595             alloc_reg(&current,i,CSREG);
8596             alloc_reg(&current,i,FSREG);
8597           }
8598           ds=1;
8599           current.isconst=0;
8600           break;
8601         case IMM16:
8602           imm16_alloc(&current,i);
8603           break;
8604         case LOAD:
8605         case LOADLR:
8606           load_alloc(&current,i);
8607           break;
8608         case STORE:
8609         case STORELR:
8610           store_alloc(&current,i);
8611           break;
8612         case ALU:
8613           alu_alloc(&current,i);
8614           break;
8615         case SHIFT:
8616           shift_alloc(&current,i);
8617           break;
8618         case MULTDIV:
8619           multdiv_alloc(&current,i);
8620           break;
8621         case SHIFTIMM:
8622           shiftimm_alloc(&current,i);
8623           break;
8624         case MOV:
8625           mov_alloc(&current,i);
8626           break;
8627         case COP0:
8628           cop0_alloc(&current,i);
8629           break;
8630         case COP1:
8631           cop1_alloc(&current,i);
8632           break;
8633         case C1LS:
8634           c1ls_alloc(&current,i);
8635           break;
8636         case FCONV:
8637           fconv_alloc(&current,i);
8638           break;
8639         case FLOAT:
8640           float_alloc(&current,i);
8641           break;
8642         case FCOMP:
8643           fcomp_alloc(&current,i);
8644           break;
8645         case SYSCALL:
8646           syscall_alloc(&current,i);
8647           break;
8648         case SPAN:
8649           pagespan_alloc(&current,i);
8650           break;
8651       }
8652       
8653       // Drop the upper half of registers that have become 32-bit
8654       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8655       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8656         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8657         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8658         current.uu|=1;
8659       } else {
8660         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8661         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8662         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8663         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8664         current.uu|=1;
8665       }
8666
8667       // Create entry (branch target) regmap
8668       for(hr=0;hr<HOST_REGS;hr++)
8669       {
8670         int r,or,er;
8671         r=current.regmap[hr];
8672         if(r>=0) {
8673           if(r!=regmap_pre[i][hr]) {
8674             // TODO: delay slot (?)
8675             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8676             if(or<0||(r&63)>=TEMPREG){
8677               regs[i].regmap_entry[hr]=-1;
8678             }
8679             else
8680             {
8681               // Just move it to a different register
8682               regs[i].regmap_entry[hr]=r;
8683               // If it was dirty before, it's still dirty
8684               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8685             }
8686           }
8687           else
8688           {
8689             // Unneeded
8690             if(r==0){
8691               regs[i].regmap_entry[hr]=0;
8692             }
8693             else
8694             if(r<64){
8695               if((current.u>>r)&1) {
8696                 regs[i].regmap_entry[hr]=-1;
8697                 //regs[i].regmap[hr]=-1;
8698                 current.regmap[hr]=-1;
8699               }else
8700                 regs[i].regmap_entry[hr]=r;
8701             }
8702             else {
8703               if((current.uu>>(r&63))&1) {
8704                 regs[i].regmap_entry[hr]=-1;
8705                 //regs[i].regmap[hr]=-1;
8706                 current.regmap[hr]=-1;
8707               }else
8708                 regs[i].regmap_entry[hr]=r;
8709             }
8710           }
8711         } else {
8712           // Branches expect CCREG to be allocated at the target
8713           if(regmap_pre[i][hr]==CCREG) 
8714             regs[i].regmap_entry[hr]=CCREG;
8715           else
8716             regs[i].regmap_entry[hr]=-1;
8717         }
8718       }
8719       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8720     }
8721     /* Branch post-alloc */
8722     if(i>0)
8723     {
8724       current.was32=current.is32;
8725       current.wasdirty=current.dirty;
8726       switch(itype[i-1]) {
8727         case UJUMP:
8728           memcpy(&branch_regs[i-1],&current,sizeof(current));
8729           branch_regs[i-1].isconst=0;
8730           branch_regs[i-1].wasconst=0;
8731           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8732           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8733           alloc_cc(&branch_regs[i-1],i-1);
8734           dirty_reg(&branch_regs[i-1],CCREG);
8735           if(rt1[i-1]==31) { // JAL
8736             alloc_reg(&branch_regs[i-1],i-1,31);
8737             dirty_reg(&branch_regs[i-1],31);
8738             branch_regs[i-1].is32|=1LL<<31;
8739           }
8740           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8741           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8742           break;
8743         case RJUMP:
               // The instruction at i-1 is an RJUMP.  Same construction as the
               // UJUMP case, plus allocation of the register read by the jump.
8744           memcpy(&branch_regs[i-1],&current,sizeof(current));
8745           branch_regs[i-1].isconst=0;
8746           branch_regs[i-1].wasconst=0;
               // Unneeded masks: the branch's own masks, minus the registers
               // that instruction i-1 itself reads.
8747           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8748           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8749           alloc_cc(&branch_regs[i-1],i-1);
8750           dirty_reg(&branch_regs[i-1],CCREG);
               // Allocate the jump's source register rs1 in the branch state.
8751           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8752           if(rt1[i-1]==31) { // JALR
                 // r31 is written: allocate it, mark it dirty, flag it in is32.
8753             alloc_reg(&branch_regs[i-1],i-1,31);
8754             dirty_reg(&branch_regs[i-1],31);
8755             branch_regs[i-1].is32|=1LL<<31;
8756           }
8757           #ifdef USE_MINI_HT
8758           if(rs1[i-1]==31) { // jump *through* r31 (test is on rs1, not rt1; the old comment said JALR)
                 // Reserve RHASH (and RHTBL unless HOST_IMM_ADDR32 is defined).
                 // NOTE(review): presumably scratch registers for the mini
                 // hash-table lookup on returns - confirm against the emitter.
8759             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8760             #ifndef HOST_IMM_ADDR32
8761             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8762             #endif
8763           }
8764           #endif
               // The branch state's entry map is made identical to its final map.
8765           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
               // Instruction i inherits the constant map of instruction i-1.
8766           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8767           break;
8768         case CJUMP:
               // The instruction at i-1 is a CJUMP; i is its delay slot.  Four
               // opcode groups are handled.  The two non-"likely" groups modify
               // `current` and then copy it into branch_regs[i-1]; the two
               // "likely" groups allocate the delay slot only in branch_regs[i-1]
               // and just keep CCREG allocated/dirty in `current`.
8769           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8770           {
8771             alloc_cc(&current,i-1);
8772             dirty_reg(&current,CCREG);
                 // Does instruction i write (rt1/rt2) a non-zero register that
                 // the branch compares (rs1/rs2 of i-1)?
8773             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8774                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8775               // The delay slot writes one of the branch's condition registers
8776               // Delay slot goes after the test (in order)
                   // Unneeded masks are therefore derived from the delay slot's
                   // sources, and bit 0 is forced to "unneeded".
8777               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8778               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
                   // If the upper half of rt1[i] is still needed, so are the
                   // upper halves of dep1[i]/dep2[i].
8779               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8780               current.u|=1;
8781               current.uu|=1;
8782               delayslot_alloc(&current,i);
8783               current.isconst=0;
8784             }
8785             else
8786             {
                   // Unneeded masks derived from the branch's own sources.
8787               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8788               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8789               // Alloc the branch condition registers
8790               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8791               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
                   // Unless both operands are flagged in is32, also request the
                   // 64-bit allocation for them.
8792               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8793               {
8794                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8795                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8796               }
8797             }
                 // Record the result as the branch state, with constants cleared
                 // and entry map equal to the final map.
8798             memcpy(&branch_regs[i-1],&current,sizeof(current));
8799             branch_regs[i-1].isconst=0;
8800             branch_regs[i-1].wasconst=0;
8801             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8802             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8803           }
8804           else
8805           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8806           {
8807             alloc_cc(&current,i-1);
8808             dirty_reg(&current,CCREG);
                 // NOTE(review): unlike the BEQ/BNE test above, rs1[i-1] is not
                 // checked for being non-zero before the comparison.
8809             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8810               // The delay slot writes the branch's condition register
8811               // Delay slot goes after the test (in order)
8812               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8813               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8814               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8815               current.u|=1;
8816               current.uu|=1;
8817               delayslot_alloc(&current,i);
8818               current.isconst=0;
8819             }
8820             else
8821             {
                   // Single-operand branch: only rs1/us1 of i-1 stay needed.
8822               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8823               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8824               // Alloc the branch condition register
8825               alloc_reg(&current,i-1,rs1[i-1]);
                   // Request the 64-bit allocation when rs1 is not flagged in is32.
8826               if(!(current.is32>>rs1[i-1]&1))
8827               {
8828                 alloc_reg64(&current,i-1,rs1[i-1]);
8829               }
8830             }
8831             memcpy(&branch_regs[i-1],&current,sizeof(current));
8832             branch_regs[i-1].isconst=0;
8833             branch_regs[i-1].wasconst=0;
8834             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8835             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8836           }
8837           else
8838           // Alloc the delay slot in case the branch is taken
8839           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8840           {
                 // The delay slot is allocated in a copy (branch_regs[i-1]);
                 // `current` itself only gets CCREG for the not-taken path.
8841             memcpy(&branch_regs[i-1],&current,sizeof(current));
                 // Keep the delay slot's sources and targets needed; bit 0 is
                 // forced to "unneeded".
8842             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8843             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8844             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8845             alloc_cc(&branch_regs[i-1],i);
8846             dirty_reg(&branch_regs[i-1],CCREG);
8847             delayslot_alloc(&branch_regs[i-1],i);
8848             branch_regs[i-1].isconst=0;
8849             alloc_reg(&current,i,CCREG); // Not taken path
8850             dirty_reg(&current,CCREG);
8851             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8852           }
8853           else
8854           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8855           {
                 // Same sequence as the BEQL/BNEL group above.
8856             memcpy(&branch_regs[i-1],&current,sizeof(current));
8857             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8858             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8859             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8860             alloc_cc(&branch_regs[i-1],i);
8861             dirty_reg(&branch_regs[i-1],CCREG);
8862             delayslot_alloc(&branch_regs[i-1],i);
8863             branch_regs[i-1].isconst=0;
8864             alloc_reg(&current,i,CCREG); // Not taken path
8865             dirty_reg(&current,CCREG);
8866             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8867           }
8868           break;
8869         case SJUMP:
               // The instruction at i-1 is an SJUMP, selected by opcode2[i-1].
8870           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
               // The 0x0E mask ignores bit 4, so the and-link forms
               // (opcode2 & 0x10, handled at the end of this case) take this
               // path as well.
8871           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8872           {
8873             alloc_cc(&current,i-1);
8874             dirty_reg(&current,CCREG);
8875             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8876               // The delay slot writes the branch's condition register
8877               // Delay slot goes after the test (in order)
                   // Unneeded masks are derived from the delay slot's sources,
                   // and bit 0 is forced to "unneeded".
8878               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8879               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8880               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8881               current.u|=1;
8882               current.uu|=1;
8883               delayslot_alloc(&current,i);
8884               current.isconst=0;
8885             }
8886             else
8887             {
                   // Only rs1/us1 of the branch stay needed.
8888               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8889               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8890               // Alloc the branch condition register
8891               alloc_reg(&current,i-1,rs1[i-1]);
                   // Request the 64-bit allocation when rs1 is not flagged in is32.
8892               if(!(current.is32>>rs1[i-1]&1))
8893               {
8894                 alloc_reg64(&current,i-1,rs1[i-1]);
8895               }
8896             }
                 // Record the result as the branch state, with constants cleared
                 // and entry map equal to the final map.
8897             memcpy(&branch_regs[i-1],&current,sizeof(current));
8898             branch_regs[i-1].isconst=0;
8899             branch_regs[i-1].wasconst=0;
8900             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8901             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8902           }
8903           else
8904           // Alloc the delay slot in case the branch is taken
8905           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8906           {
                 // The delay slot is allocated in a copy (branch_regs[i-1]);
                 // `current` itself only gets CCREG for the not-taken path.
8907             memcpy(&branch_regs[i-1],&current,sizeof(current));
8908             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8909             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8910             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8911             alloc_cc(&branch_regs[i-1],i);
8912             dirty_reg(&branch_regs[i-1],CCREG);
8913             delayslot_alloc(&branch_regs[i-1],i);
8914             branch_regs[i-1].isconst=0;
8915             alloc_reg(&current,i,CCREG); // Not taken path
8916             dirty_reg(&current,CCREG);
8917             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8918           }
8919           // FIXME: BLTZAL/BGEZAL
               // This runs after branch_regs[i-1] has been filled in above, so
               // r31 is allocated/dirtied in the branch state only; `current`
               // and regmap_entry are not updated here.
8920           if(opcode2[i-1]&0x10) { // BxxZAL
8921             alloc_reg(&branch_regs[i-1],i-1,31);
8922             dirty_reg(&branch_regs[i-1],31);
8923             branch_regs[i-1].is32|=1LL<<31;
8924           }
8925           break;
8926         case FJUMP:
               // The instruction at i-1 is an FJUMP; likely[i-1] selects
               // between the plain and the "likely" forms.
8927           if(likely[i-1]==0) // BC1F/BC1T
8928           {
8929             alloc_cc(&current,i-1);
8930             dirty_reg(&current,CCREG);
8931             if(itype[i]==FCOMP) {
8932               // The delay slot is an FCOMP, i.e. it rewrites the branch condition
8933               // Delay slot goes after the test (in order)
                   // NOTE(review): current.u/current.uu are not recomputed on
                   // this path, unlike the matching paths of CJUMP/SJUMP.
8934               delayslot_alloc(&current,i);
8935               current.isconst=0;
8936             }
8937             else
8938             {
8939               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8940               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8941               // Alloc the branch condition register
8942               alloc_reg(&current,i-1,FSREG);
8943             }
                 // Record the result as the branch state; entry map equals the
                 // final map.  isconst/wasconst of the copy are left as copied.
8944             memcpy(&branch_regs[i-1],&current,sizeof(current));
8945             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8946           }
8947           else // BC1FL/BC1TL
8948           {
8949             // Alloc the delay slot in case the branch is taken
                 // The delay slot is allocated in a copy (branch_regs[i-1]);
                 // `current` itself only gets CCREG for the not-taken path.
8950             memcpy(&branch_regs[i-1],&current,sizeof(current));
8951             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8952             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8953             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8954             alloc_cc(&branch_regs[i-1],i);
8955             dirty_reg(&branch_regs[i-1],CCREG);
8956             delayslot_alloc(&branch_regs[i-1],i);
8957             branch_regs[i-1].isconst=0;
8958             alloc_reg(&current,i,CCREG); // Not taken path
8959             dirty_reg(&current,CCREG);
8960             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8961           }
8962           break;
8963       }
8964
8965       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8966       {
             // Instruction i-1 is a UJUMP, an RJUMP, or has 0x1000 in the upper
             // 16 bits of its encoding (NOTE(review): presumably the
             // always-taken "beq $0,$0" form - confirm).  `current` is rebuilt
             // from scratch here instead of being carried over.
8967         if(rt1[i-1]==31) // JAL/JALR
8968         {
8969           // Subroutine call will return here, don't alloc any registers
               // Only CCREG ends up allocated (and dirty); is32 is reset to
               // bit 0 only.
8970           current.is32=1;
8971           current.dirty=0;
8972           clear_all_regs(current.regmap);
8973           alloc_reg(&current,i,CCREG);
8974           dirty_reg(&current,CCREG);
8975         }
8976         else if(i+1<slen)
8977         {
8978           // Internal branch will jump here, match registers to caller
               // Default state when no matching branch is found: empty map plus
               // a dirty CCREG, with is32 bits 0..33 set.
8979           current.is32=0x3FFFFFFFFLL;
8980           current.dirty=0;
8981           clear_all_regs(current.regmap);
8982           alloc_reg(&current,i,CCREG);
8983           dirty_reg(&current,CCREG);
               // Scan backwards for the nearest earlier instruction whose branch
               // address is start+i*4+4 and adopt its branch register state.
8984           for(j=i-1;j>=0;j--)
8985           {
8986             if(ba[j]==start+i*4+4) {
8987               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8988               current.is32=branch_regs[j].is32;
8989               current.dirty=branch_regs[j].dirty;
8990               break;
8991             }
8992           }
               // Continue from that j (j is -1 if nothing matched, so the loop
               // is skipped) and intersect with every further branch to the
               // same address: a host register survives only if all of them
               // agree on its mapping; is32 and dirty are ANDed together.
8993           while(j>=0) {
8994             if(ba[j]==start+i*4+4) {
8995               for(hr=0;hr<HOST_REGS;hr++) {
8996                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8997                   current.regmap[hr]=-1;
8998                 }
                     // These two &= do not depend on hr; repeating them on
                     // each iteration gives the same result as doing them once.
8999                 current.is32&=branch_regs[j].is32;
9000                 current.dirty&=branch_regs[j].dirty;
9001               }
9002             }
9003             j--;
9004           }
9005         }
9006       }
9007     }
9008
9009     // Count cycles in between branches
         // ccadj[i] receives the running count as it stands before instruction i.
9010     ccadj[i]=cc;
         // The counter restarts when (for i>0) the previous instruction is any
         // of the five jump types or instruction i is a SYSCALL; the SYSCALL
         // test sits inside the i>0 parentheses, so it is not applied at i==0.
9011     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL))
9012     {
9013       cc=0;
9014     }
9015     else
9016     {
9017       cc++;
9018     }
9019
9020     flush_dirty_uppers(&current);
         // Unless instruction i is a delay slot, publish the running state
         // (is32, dirty, isconst and the constant map) as instruction i's state.
9021     if(!is_ds[i]) {
9022       regs[i].is32=current.is32;
9023       regs[i].dirty=current.dirty;
9024       regs[i].isconst=current.isconst;
9025       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9026     }
         // A host register whose mapping after instruction i differs from its
         // mapping before it (regmap_pre) loses its wasconst bit.
         // EXCLUDE_REG and unmapped registers are skipped.
9027     for(hr=0;hr<HOST_REGS;hr++) {
9028       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9029         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9030           regs[i].wasconst&=~(1<<hr);
9031         }
9032       }
9033     }
         // Release HOST_BTREG if it currently holds BTREG, so that mapping is
         // not carried into the next instruction.
9034     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9035   }
9036   
9037   /* Pass 4 - Cull unused host registers */
9038   
9039   uint64_t nr=0;
9040   
9041   for (i=slen-1;i>=0;i--)
9042   {
9043     int hr;
9044     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9045     {
           // Branch.  nr (bit hr set = host register hr is needed on entry to
           // instruction i) is rebuilt from the branch target, from the
           // fall-through instruction i+2 for conditional branches, and from
           // the delay slot i+1.
9046       if(ba[i]<start || ba[i]>=(start+slen*4))
9047       {
9048         // Branch out of this block, don't need anything
9049         nr=0;
9050       }
9051       else
9052       {
9053         // Internal branch
9054         // Need whatever matches the target
9055         nr=0;
9056         int t=(ba[i]-start)>>2;
9057         for(hr=0;hr<HOST_REGS;hr++)
9058         {
9059           if(regs[i].regmap_entry[hr]>=0) {
9060             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9061           }
9062         }
9063       }
9064       // Conditional branch may need registers for following instructions
9065       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9066       {
9067         if(i<slen-2) {
9068           nr|=needed_reg[i+2];
9069           for(hr=0;hr<HOST_REGS;hr++)
9070           {
                 // Drop a register whose pre-(i+2) mapping is no longer present
                 // anywhere in the entry map of i+2.
9071             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9072             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9073           }
9074         }
9075       }
           // FIX: two "don't need stuff which is overwritten" statements used
           // to sit here, indexing regs[i].regmap[hr] / regmap_pre[i][hr] and
           // shifting by hr *outside* any loop.  At this point hr is either
           // HOST_REGS (left over from the loops above: an out-of-bounds read
           // that could only touch bit HOST_REGS of nr, which no host register
           // uses) or was never assigned at all (out-of-block RJUMP/UJUMP: no
           // loop has run yet), which is undefined behaviour.  They never
           // operated on a valid host register, so they are removed; the
           // delay-slot merge below already clears registers that the delay
           // slot overwrites.
9079       // Merge in delay slot
9080       for(hr=0;hr<HOST_REGS;hr++)
9081       {
9082         if(!likely[i]) {
9083           // These are overwritten unless the branch is "likely"
9084           // and the delay slot is nullified if not taken
9085           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9086           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9087         }
             // Sources of the delay slot are needed, whether they are found in
             // the pre-instruction map or in the entry map of the branch.
9088         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9089         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9090         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9091         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9092         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9093         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9094         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9095         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
             // NOTE(review): the first test is gated on dep1 and the second on
             // dep2, yet each compares both dep1 and dep2 (against regmap_pre
             // and regmap_entry respectively).  Left exactly as found; confirm
             // that this pairing is intended.
9096         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9097           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9098           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9099         }
9100         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9101           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9102           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9103         }
             // A store-type delay slot (STORE, STORELR, or opcode 0x39/0x3D
             // after masking with 0x3b) keeps the register mapped to INVCP.
9104         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9105           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9106           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9107         }
9108       }
9109     }
9110     else if(itype[i]==SYSCALL)
9111     {
9112       // SYSCALL instruction (software interrupt)
9113       nr=0;
9114     }
9115     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9116     {
9117       // ERET instruction (return from interrupt)
9118       nr=0;
9119     }
9120     else // Non-branch
9121     {
           // nr is carried over from instruction i+1 (the loop runs backwards)
           // and pruned: a host register stays needed only if its mapping is
           // unchanged across instruction i and still present at i+1.
9122       if(i<slen-1) {
9123         for(hr=0;hr<HOST_REGS;hr++) {
9124           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9125           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9126           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9127           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9128         }
9129       }
9130     }
9131     for(hr=0;hr<HOST_REGS;hr++)
9132     {
           // Refine nr for instruction i itself, one host register at a time.
9133       // Overwritten registers are not needed
           // (targets rt1/rt2 of instruction i, and whatever is mapped to FTEMP)
9134       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9135       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9136       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9137       // Source registers are needed
           // (checked against both the pre-instruction map and the entry map;
           // us1/us2 are compared with the low 6 bits of the mapping)
9138       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9139       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9140       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9141       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9142       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9143       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9144       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9145       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
           // dep1/dep2 count as sources only while their bit in
           // unneeded_reg_upper[i] is clear.
9146       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9147         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9148         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9149       }
9150       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9151         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9152         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9153       }
           // Store-type instructions (STORE, STORELR, or opcode 0x39/0x3D after
           // masking with 0x3b) keep the register mapped to INVCP.
9154       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9155         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9156         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9157       }
9158       // Don't store a register immediately after writing it,
9159       // may prevent dual-issue.
9160       // But do so if this is a branch target, otherwise we
9161       // might have to load the register before the branch.
           // Concretely: when i is not a branch target and host register hr was
           // dirty on entry, keep it if it holds a still-needed register (lower
           // half: values 1..63 checked in unneeded_reg; upper half: values
           // above 64 checked in unneeded_reg_upper) that instruction i-1 wrote.
9162       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9163         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9164            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9165           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9166           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9167         }
9168         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9169            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9170           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9171           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9172         }
9173       }
9174     }
9175     // Cycle count is needed at branches.  Assume it is needed at the target too.
         // Applies at i==0, at branch targets, and at CJUMP/FJUMP/SPAN
         // instructions, provided HOST_CCREG actually holds CCREG.
9176     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9177       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9178       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9179     }
9180     // Save it
         // (nr also stays live in the local variable for the next, earlier, i)
9181     needed_reg[i]=nr;
9182     
9183     // Deallocate unneeded registers
         // For every host register whose bit is clear in nr, drop its mappings
         // around instruction i unless the instruction (or, for a branch, its
         // delay slot i+1) still uses the register it holds.
9184     for(hr=0;hr<HOST_REGS;hr++)
9185     {
9186       if(!((nr>>hr)&1)) {
             // The entry mapping is dropped unless it is the cycle counter.
9187         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
             // "Likely" conditional branches: a register unrelated to the
             // branch's own operands (and not PTEMP/CCREG) is dropped from the
             // branch's map and from the pre-map of the fall-through
             // instruction i+2.
9188         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9189            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9190            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9191         {
9192           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9193           {
9194             if(likely[i]) {
9195               regs[i].regmap[hr]=-1;
9196               regs[i].isconst&=~(1<<hr);
9197               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9198             }
9199           }
9200         }
9201         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9202         {
               // Branch: registers used by the delay slot (i+1) must survive too.
               // d1/d2: the delay slot's dep registers, considered only when the
               //        upper half (|64) of rt1[i+1] is mapped in either state.
               // map:   TLREG for memory-access delay slots when using_tlb is
               //        set, otherwise INVCP for store-type delay slots.
               // temp:  FTEMP for LOADLR/STORELR/C1LS delay slots.
               // NOTE(review): map and temp default to 0 here but to -1 in the
               // non-branch case below - confirm the difference is intended.
9203           int d1=0,d2=0,map=0,temp=0;
9204           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9205           {
9206             d1=dep1[i+1];
9207             d2=dep2[i+1];
9208           }
9209           if(using_tlb) {
9210             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9211                itype[i+1]==STORE || itype[i+1]==STORELR ||
9212                itype[i+1]==C1LS )
9213             map=TLREG;
9214           } else
9215           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9216             map=INVCP;
9217           }
9218           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9219              itype[i+1]==C1LS )
9220             temp=FTEMP;
               // The (x^64)!=... comparisons match the mapping whose value is
               // that register number with bit 6 set (assuming us1/us2/d1/d2
               // are below 64 - TODO confirm).
9221           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9222              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9223              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9224              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9225              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9226              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9227              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9228              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9229              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9230              regs[i].regmap[hr]!=map )
9231           {
9232             regs[i].regmap[hr]=-1;
9233             regs[i].isconst&=~(1<<hr);
                 // Apply the same test to the branch state; it is only reached
                 // when the pre-branch mapping above was already droppable.
9234             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9235                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9236                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9237                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9238                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9239                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9240                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9241                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9242                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9243                branch_regs[i].regmap[hr]!=map)
9244             {
9245               branch_regs[i].regmap[hr]=-1;
9246               branch_regs[i].regmap_entry[hr]=-1;
                   // For a non-likely conditional branch the fall-through
                   // instruction i+2 no longer inherits this register either.
9247               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9248               {
9249                 if(!likely[i]&&i<slen-2) {
9250                   regmap_pre[i+2][hr]=-1;
9251                 }
9252               }
9253             }
9254           }
9255         }
9256         else
9257         {
9258           // Non-branch
               // (instruction 0 is left untouched by this path)
9259           if(i>0)
9260           {
                 // Same idea as the branch case, using instruction i's own
                 // operands; d1/d2, map and temp are chosen as above but from
                 // instruction i, with map/temp defaulting to -1.
9261             int d1=0,d2=0,map=-1,temp=-1;
9262             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9263             {
9264               d1=dep1[i];
9265               d2=dep2[i];
9266             }
9267             if(using_tlb) {
9268               if(itype[i]==LOAD || itype[i]==LOADLR ||
9269                  itype[i]==STORE || itype[i]==STORELR ||
9270                  itype[i]==C1LS )
9271               map=TLREG;
9272             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9273               map=INVCP;
9274             }
9275             if(itype[i]==LOADLR || itype[i]==STORELR ||
9276                itype[i]==C1LS )
9277               temp=FTEMP;
                 // CCREG is additionally protected when instruction i is a SPAN.
9278             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9279                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9280                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9281                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9282                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9283                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9284             {
9285               if(i<slen-1&&!is_ds[i]) {
                     // Consistency check: the mapping after i must equal the
                     // pre-map of i+1.  A mismatch is reported and asserted,
                     // except for an upper-half mapping (value >= 64) of a
                     // register flagged in was32.
9286                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9287                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9288                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9289                 {
9290                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9291                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9292                 }
                     // The next instruction no longer inherits this register;
                     // a CCREG entry mapping at i+1 is dropped as well.
9293                 regmap_pre[i+1][hr]=-1;
9294                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9295               }
9296               regs[i].regmap[hr]=-1;
9297               regs[i].isconst&=~(1<<hr);
9298             }
9299           }
9300         }
9301       }
9302     }
9303   }
9304   
9305   /* Pass 5 - Pre-allocate registers */
9306   
9307   // If a register is allocated during a loop, try to allocate it for the
9308   // entire loop, if possible.  This avoids loading/storing registers
9309   // inside of the loop.
9310
9311   signed char f_regmap[HOST_REGS];
9312   clear_all_regs(f_regmap);
9313   for(i=0;i<slen-1;i++)
9314   {
9315     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9316     {
9317       if(ba[i]>=start && ba[i]<(start+i*4)) 
9318       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9319       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9320       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9321       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9322       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9323       {
9324         int t=(ba[i]-start)>>2;
9325         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9326         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9327         for(hr=0;hr<HOST_REGS;hr++)
9328         {
9329           if(regs[i].regmap[hr]>64) {
9330             if(!((regs[i].dirty>>hr)&1))
9331               f_regmap[hr]=regs[i].regmap[hr];
9332             else f_regmap[hr]=-1;
9333           }
9334           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9335           if(branch_regs[i].regmap[hr]>64) {
9336             if(!((branch_regs[i].dirty>>hr)&1))
9337               f_regmap[hr]=branch_regs[i].regmap[hr];
9338             else f_regmap[hr]=-1;
9339           }
9340           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9341           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9342           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9343           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9344           {
9345             // Test both in case the delay slot is ooo,
9346             // could be done better...
9347             if(count_free_regs(branch_regs[i].regmap)<2
9348              ||count_free_regs(regs[i].regmap)<2) 
9349               f_regmap[hr]=branch_regs[i].regmap[hr];
9350           }
9351           // Avoid dirty->clean transition
9352           // #ifdef DESTRUCTIVE_WRITEBACK here?
9353           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9354           if(f_regmap[hr]>0) {
9355             if(regs[t].regmap_entry[hr]<0) {
9356               int r=f_regmap[hr];
9357               for(j=t;j<=i;j++)
9358               {
9359                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9360                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9361                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9362                 if(r>63) {
9363                   // NB This can exclude the case where the upper-half
9364                   // register is lower numbered than the lower-half
9365                   // register.  Not sure if it's worth fixing...
9366                   if(get_reg(regs[j].regmap,r&63)<0) break;
9367                   if(regs[j].is32&(1LL<<(r&63))) break;
9368                 }
9369                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9370                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9371                   int k;
9372                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9373                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9374                     if(r>63) {
9375                       if(get_reg(regs[i].regmap,r&63)<0) break;
9376                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9377                     }
9378                     k=i;
9379                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9380                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9381                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9382                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9383                       ||itype[k-1]==FCOMP) {
9384                         if(count_free_regs(regs[k-1].regmap)<2) {
9385                           //printf("no free regs for store %x\n",start+(k-1)*4);
9386                           break;
9387                         }
9388                       }
9389                       else
9390                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9391                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9392                         //printf("no-match due to different register\n");
9393                         break;
9394                       }
9395                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9396                         //printf("no-match due to branch\n");
9397                         break;
9398                       }
9399                       // call/ret fast path assumes no registers allocated
9400                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9401                         break;
9402                       }
9403                       if(r>63) {
9404                         // NB This can exclude the case where the upper-half
9405                         // register is lower numbered than the lower-half
9406                         // register.  Not sure if it's worth fixing...
9407                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9408                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9409                       }
9410                       k--;
9411                     }
9412                     if(i<slen-1) {
9413                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9414                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9415                         //printf("bad match after branch\n");
9416                         break;
9417                       }
9418                     }
9419                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9420                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9421                       while(k<i) {
9422                         regs[k].regmap_entry[hr]=f_regmap[hr];
9423                         regs[k].regmap[hr]=f_regmap[hr];
9424                         regmap_pre[k+1][hr]=f_regmap[hr];
9425                         regs[k].wasdirty&=~(1<<hr);
9426                         regs[k].dirty&=~(1<<hr);
9427                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9428                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9429                         regs[k].wasconst&=~(1<<hr);
9430                         regs[k].isconst&=~(1<<hr);
9431                         k++;
9432                       }
9433                     }
9434                     else {
9435                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9436                       break;
9437                     }
9438                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9439                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9440                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9441                       regs[i].regmap_entry[hr]=f_regmap[hr];
9442                       regs[i].regmap[hr]=f_regmap[hr];
9443                       regs[i].wasdirty&=~(1<<hr);
9444                       regs[i].dirty&=~(1<<hr);
9445                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9446                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9447                       regs[i].wasconst&=~(1<<hr);
9448                       regs[i].isconst&=~(1<<hr);
9449                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9450                       branch_regs[i].wasdirty&=~(1<<hr);
9451                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9452                       branch_regs[i].regmap[hr]=f_regmap[hr];
9453                       branch_regs[i].dirty&=~(1<<hr);
9454                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9455                       branch_regs[i].wasconst&=~(1<<hr);
9456                       branch_regs[i].isconst&=~(1<<hr);
9457                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9458                         regmap_pre[i+2][hr]=f_regmap[hr];
9459                         regs[i+2].wasdirty&=~(1<<hr);
9460                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9461                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9462                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9463                       }
9464                     }
9465                   }
9466                   for(k=t;k<j;k++) {
9467                     regs[k].regmap_entry[hr]=f_regmap[hr];
9468                     regs[k].regmap[hr]=f_regmap[hr];
9469                     regmap_pre[k+1][hr]=f_regmap[hr];
9470                     regs[k+1].wasdirty&=~(1<<hr);
9471                     regs[k].dirty&=~(1<<hr);
9472                     regs[k].wasconst&=~(1<<hr);
9473                     regs[k].isconst&=~(1<<hr);
9474                   }
9475                   if(regs[j].regmap[hr]==f_regmap[hr])
9476                     regs[j].regmap_entry[hr]=f_regmap[hr];
9477                   break;
9478                 }
9479                 if(j==i) break;
9480                 if(regs[j].regmap[hr]>=0)
9481                   break;
9482                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9483                   //printf("no-match due to different register\n");
9484                   break;
9485                 }
9486                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9487                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9488                   break;
9489                 }
9490                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9491                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9492                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9493                   if(count_free_regs(regs[j].regmap)<2) {
9494                     //printf("No free regs for store %x\n",start+j*4);
9495                     break;
9496                   }
9497                 }
9498                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9499                 if(f_regmap[hr]>=64) {
9500                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9501                     break;
9502                   }
9503                   else
9504                   {
9505                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9506                       break;
9507                     }
9508                   }
9509                 }
9510               }
9511             }
9512           }
9513         }
9514       }
9515     }else{
9516       int count=0;
9517       for(hr=0;hr<HOST_REGS;hr++)
9518       {
9519         if(hr!=EXCLUDE_REG) {
9520           if(regs[i].regmap[hr]>64) {
9521             if(!((regs[i].dirty>>hr)&1))
9522               f_regmap[hr]=regs[i].regmap[hr];
9523           }
9524           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9525           else if(regs[i].regmap[hr]<0) count++;
9526         }
9527       }
9528       // Try to restore cycle count at branch targets
9529       if(bt[i]) {
9530         for(j=i;j<slen-1;j++) {
9531           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9532           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9533           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9534           ||itype[j]==FCOMP||itype[j]==FCONV) {
9535             if(count_free_regs(regs[j].regmap)<2) {
9536               //printf("no free regs for store %x\n",start+j*4);
9537               break;
9538             }
9539           }
9540           else
9541           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9542         }
9543         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9544           int k=i;
9545           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9546           while(k<j) {
9547             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9548             regs[k].regmap[HOST_CCREG]=CCREG;
9549             regmap_pre[k+1][HOST_CCREG]=CCREG;
9550             regs[k+1].wasdirty|=1<<HOST_CCREG;
9551             regs[k].dirty|=1<<HOST_CCREG;
9552             regs[k].wasconst&=~(1<<HOST_CCREG);
9553             regs[k].isconst&=~(1<<HOST_CCREG);
9554             k++;
9555           }
9556           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9557         }
9558         // Work backwards from the branch target
9559         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9560         {
9561           //printf("Extend backwards\n");
9562           int k;
9563           k=i;
9564           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9565             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9566             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9567             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9568               if(count_free_regs(regs[k-1].regmap)<2) {
9569                 //printf("no free regs for store %x\n",start+(k-1)*4);
9570                 break;
9571               }
9572             }
9573             else
9574             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9575             k--;
9576           }
9577           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9578             //printf("Extend CC, %x ->\n",start+k*4);
9579             while(k<=i) {
9580               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9581               regs[k].regmap[HOST_CCREG]=CCREG;
9582               regmap_pre[k+1][HOST_CCREG]=CCREG;
9583               regs[k+1].wasdirty|=1<<HOST_CCREG;
9584               regs[k].dirty|=1<<HOST_CCREG;
9585               regs[k].wasconst&=~(1<<HOST_CCREG);
9586               regs[k].isconst&=~(1<<HOST_CCREG);
9587               k++;
9588             }
9589           }
9590           else {
9591             //printf("Fail Extend CC, %x ->\n",start+k*4);
9592           }
9593         }
9594       }
9595       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9596          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9597          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9598          itype[i]!=FCONV&&itype[i]!=FCOMP)
9599       {
9600         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9601       }
9602     }
9603   }
9604   
9605   // This allocates registers (if possible) one instruction prior
9606   // to use, which can avoid a load-use penalty on certain CPUs.
9607   for(i=0;i<slen-1;i++)
9608   {
9609     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9610     {
9611       if(!bt[i+1])
9612       {
9613         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9614         {
9615           if(rs1[i+1]) {
9616             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9617             {
9618               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9619               {
9620                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9621                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9622                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9623                 regs[i].isconst&=~(1<<hr);
9624                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9625                 constmap[i][hr]=constmap[i+1][hr];
9626                 regs[i+1].wasdirty&=~(1<<hr);
9627                 regs[i].dirty&=~(1<<hr);
9628               }
9629             }
9630           }
9631           if(rs2[i+1]) {
9632             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9633             {
9634               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9635               {
9636                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9637                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9638                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9639                 regs[i].isconst&=~(1<<hr);
9640                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9641                 constmap[i][hr]=constmap[i+1][hr];
9642                 regs[i+1].wasdirty&=~(1<<hr);
9643                 regs[i].dirty&=~(1<<hr);
9644               }
9645             }
9646           }
9647           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9648             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9649             {
9650               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9651               {
9652                 regs[i].regmap[hr]=rs1[i+1];
9653                 regmap_pre[i+1][hr]=rs1[i+1];
9654                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9655                 regs[i].isconst&=~(1<<hr);
9656                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9657                 constmap[i][hr]=constmap[i+1][hr];
9658                 regs[i+1].wasdirty&=~(1<<hr);
9659                 regs[i].dirty&=~(1<<hr);
9660               }
9661             }
9662           }
9663           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9664             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9665             {
9666               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9667               {
9668                 regs[i].regmap[hr]=rs1[i+1];
9669                 regmap_pre[i+1][hr]=rs1[i+1];
9670                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9671                 regs[i].isconst&=~(1<<hr);
9672                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9673                 constmap[i][hr]=constmap[i+1][hr];
9674                 regs[i+1].wasdirty&=~(1<<hr);
9675                 regs[i].dirty&=~(1<<hr);
9676               }
9677             }
9678           }
9679           #ifndef HOST_IMM_ADDR32
9680           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9681             hr=get_reg(regs[i+1].regmap,TLREG);
9682             if(hr>=0) {
9683               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9684               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9685                 int nr;
9686                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9687                 {
9688                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9689                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9690                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9691                   regs[i].isconst&=~(1<<hr);
9692                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9693                   constmap[i][hr]=constmap[i+1][hr];
9694                   regs[i+1].wasdirty&=~(1<<hr);
9695                   regs[i].dirty&=~(1<<hr);
9696                 }
9697                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9698                 {
9699                   // move it to another register
9700                   regs[i+1].regmap[hr]=-1;
9701                   regmap_pre[i+2][hr]=-1;
9702                   regs[i+1].regmap[nr]=TLREG;
9703                   regmap_pre[i+2][nr]=TLREG;
9704                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9705                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9706                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9707                   regs[i].isconst&=~(1<<nr);
9708                   regs[i+1].isconst&=~(1<<nr);
9709                   regs[i].dirty&=~(1<<nr);
9710                   regs[i+1].wasdirty&=~(1<<nr);
9711                   regs[i+1].dirty&=~(1<<nr);
9712                   regs[i+2].wasdirty&=~(1<<nr);
9713                 }
9714               }
9715             }
9716           }
9717           #endif
9718           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9719             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9720               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9721               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9722               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9723               assert(hr>=0);
9724               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9725               {
9726                 regs[i].regmap[hr]=rs1[i+1];
9727                 regmap_pre[i+1][hr]=rs1[i+1];
9728                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9729                 regs[i].isconst&=~(1<<hr);
9730                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9731                 constmap[i][hr]=constmap[i+1][hr];
9732                 regs[i+1].wasdirty&=~(1<<hr);
9733                 regs[i].dirty&=~(1<<hr);
9734               }
9735             }
9736           }
9737           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9738             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9739               int nr;
9740               hr=get_reg(regs[i+1].regmap,FTEMP);
9741               assert(hr>=0);
9742               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9743               {
9744                 regs[i].regmap[hr]=rs1[i+1];
9745                 regmap_pre[i+1][hr]=rs1[i+1];
9746                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9747                 regs[i].isconst&=~(1<<hr);
9748                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9749                 constmap[i][hr]=constmap[i+1][hr];
9750                 regs[i+1].wasdirty&=~(1<<hr);
9751                 regs[i].dirty&=~(1<<hr);
9752               }
9753               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9754               {
9755                 // move it to another register
9756                 regs[i+1].regmap[hr]=-1;
9757                 regmap_pre[i+2][hr]=-1;
9758                 regs[i+1].regmap[nr]=FTEMP;
9759                 regmap_pre[i+2][nr]=FTEMP;
9760                 regs[i].regmap[nr]=rs1[i+1];
9761                 regmap_pre[i+1][nr]=rs1[i+1];
9762                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9763                 regs[i].isconst&=~(1<<nr);
9764                 regs[i+1].isconst&=~(1<<nr);
9765                 regs[i].dirty&=~(1<<nr);
9766                 regs[i+1].wasdirty&=~(1<<nr);
9767                 regs[i+1].dirty&=~(1<<nr);
9768                 regs[i+2].wasdirty&=~(1<<nr);
9769               }
9770             }
9771           }
9772           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9773             if(itype[i+1]==LOAD) 
9774               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9775             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9776               hr=get_reg(regs[i+1].regmap,FTEMP);
9777             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9778               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9779               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9780             }
9781             if(hr>=0&&regs[i].regmap[hr]<0) {
9782               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9783               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9784                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9785                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9786                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9787                 regs[i].isconst&=~(1<<hr);
9788                 regs[i+1].wasdirty&=~(1<<hr);
9789                 regs[i].dirty&=~(1<<hr);
9790               }
9791             }
9792           }
9793         }
9794       }
9795     }
9796   }
9797   
9798   /* Pass 6 - Optimize clean/dirty state */
9799   clean_registers(0,slen-1,1);
9800   
9801   /* Pass 7 - Identify 32-bit registers */
9802   
9803   provisional_r32();
9804
9805   u_int r32=0;
9806   
9807   for (i=slen-1;i>=0;i--)
9808   {
9809     int hr;
9810     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9811     {
9812       if(ba[i]<start || ba[i]>=(start+slen*4))
9813       {
9814         // Branch out of this block, don't need anything
9815         r32=0;
9816       }
9817       else
9818       {
9819         // Internal branch
9820         // Need whatever matches the target
9821         // (and doesn't get overwritten by the delay slot instruction)
9822         r32=0;
9823         int t=(ba[i]-start)>>2;
9824         if(ba[i]>start+i*4) {
9825           // Forward branch
9826           if(!(requires_32bit[t]&~regs[i].was32))
9827             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9828         }else{
9829           // Backward branch
9830           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9831           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9832           if(!(pr32[t]&~regs[i].was32))
9833             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9834         }
9835       }
9836       // Conditional branch may need registers for following instructions
9837       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9838       {
9839         if(i<slen-2) {
9840           r32|=requires_32bit[i+2];
9841           r32&=regs[i].was32;
9842           // Mark this address as a branch target since it may be called
9843           // upon return from interrupt
9844           bt[i+2]=1;
9845         }
9846       }
9847       // Merge in delay slot
9848       if(!likely[i]) {
9849         // These are overwritten unless the branch is "likely"
9850         // and the delay slot is nullified if not taken
9851         r32&=~(1LL<<rt1[i+1]);
9852         r32&=~(1LL<<rt2[i+1]);
9853       }
9854       // Assume these are needed (delay slot)
9855       if(us1[i+1]>0)
9856       {
9857         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9858       }
9859       if(us2[i+1]>0)
9860       {
9861         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9862       }
9863       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9864       {
9865         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9866       }
9867       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9868       {
9869         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9870       }
9871     }
9872     else if(itype[i]==SYSCALL)
9873     {
9874       // SYSCALL instruction (software interrupt)
9875       r32=0;
9876     }
9877     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9878     {
9879       // ERET instruction (return from interrupt)
9880       r32=0;
9881     }
9882     // Check 32 bits
9883     r32&=~(1LL<<rt1[i]);
9884     r32&=~(1LL<<rt2[i]);
9885     if(us1[i]>0)
9886     {
9887       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9888     }
9889     if(us2[i]>0)
9890     {
9891       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9892     }
9893     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9894     {
9895       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
9896     }
9897     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
9898     {
9899       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
9900     }
9901     requires_32bit[i]=r32;
9902     
9903     // Dirty registers which are 32-bit, require 32-bit input
9904     // as they will be written as 32-bit values
9905     for(hr=0;hr<HOST_REGS;hr++)
9906     {
9907       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
9908         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
9909           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
9910           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
9911         }
9912       }
9913     }
9914     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
9915   }
9916
9917   if(itype[slen-1]==SPAN) {
9918     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9919   }
9920   
9921   /* Debug/disassembly */
9922   if((void*)assem_debug==(void*)printf) 
9923   for(i=0;i<slen;i++)
9924   {
9925     printf("U:");
9926     int r;
9927     for(r=1;r<=CCREG;r++) {
9928       if((unneeded_reg[i]>>r)&1) {
9929         if(r==HIREG) printf(" HI");
9930         else if(r==LOREG) printf(" LO");
9931         else printf(" r%d",r);
9932       }
9933     }
9934     printf(" UU:");
9935     for(r=1;r<=CCREG;r++) {
9936       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
9937         if(r==HIREG) printf(" HI");
9938         else if(r==LOREG) printf(" LO");
9939         else printf(" r%d",r);
9940       }
9941     }
9942     printf(" 32:");
9943     for(r=0;r<=CCREG;r++) {
9944       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9945       if((regs[i].was32>>r)&1) {
9946         if(r==CCREG) printf(" CC");
9947         else if(r==HIREG) printf(" HI");
9948         else if(r==LOREG) printf(" LO");
9949         else printf(" r%d",r);
9950       }
9951     }
9952     printf("\n");
9953     #if defined(__i386__) || defined(__x86_64__)
9954     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9955     #endif
9956     #ifdef __arm__
9957     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9958     #endif
9959     printf("needs: ");
9960     if(needed_reg[i]&1) printf("eax ");
9961     if((needed_reg[i]>>1)&1) printf("ecx ");
9962     if((needed_reg[i]>>2)&1) printf("edx ");
9963     if((needed_reg[i]>>3)&1) printf("ebx ");
9964     if((needed_reg[i]>>5)&1) printf("ebp ");
9965     if((needed_reg[i]>>6)&1) printf("esi ");
9966     if((needed_reg[i]>>7)&1) printf("edi ");
9967     printf("r:");
9968     for(r=0;r<=CCREG;r++) {
9969       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9970       if((requires_32bit[i]>>r)&1) {
9971         if(r==CCREG) printf(" CC");
9972         else if(r==HIREG) printf(" HI");
9973         else if(r==LOREG) printf(" LO");
9974         else printf(" r%d",r);
9975       }
9976     }
9977     printf("\n");
9978     /*printf("pr:");
9979     for(r=0;r<=CCREG;r++) {
9980       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9981       if((pr32[i]>>r)&1) {
9982         if(r==CCREG) printf(" CC");
9983         else if(r==HIREG) printf(" HI");
9984         else if(r==LOREG) printf(" LO");
9985         else printf(" r%d",r);
9986       }
9987     }
9988     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
9989     printf("\n");*/
9990     #if defined(__i386__) || defined(__x86_64__)
9991     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9992     printf("dirty: ");
9993     if(regs[i].wasdirty&1) printf("eax ");
9994     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9995     if((regs[i].wasdirty>>2)&1) printf("edx ");
9996     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9997     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9998     if((regs[i].wasdirty>>6)&1) printf("esi ");
9999     if((regs[i].wasdirty>>7)&1) printf("edi ");
10000     #endif
10001     #ifdef __arm__
10002     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10003     printf("dirty: ");
10004     if(regs[i].wasdirty&1) printf("r0 ");
10005     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10006     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10007     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10008     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10009     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10010     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10011     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10012     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10013     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10014     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10015     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10016     #endif
10017     printf("\n");
10018     disassemble_inst(i);
10019     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10020     #if defined(__i386__) || defined(__x86_64__)
10021     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10022     if(regs[i].dirty&1) printf("eax ");
10023     if((regs[i].dirty>>1)&1) printf("ecx ");
10024     if((regs[i].dirty>>2)&1) printf("edx ");
10025     if((regs[i].dirty>>3)&1) printf("ebx ");
10026     if((regs[i].dirty>>5)&1) printf("ebp ");
10027     if((regs[i].dirty>>6)&1) printf("esi ");
10028     if((regs[i].dirty>>7)&1) printf("edi ");
10029     #endif
10030     #ifdef __arm__
10031     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10032     if(regs[i].dirty&1) printf("r0 ");
10033     if((regs[i].dirty>>1)&1) printf("r1 ");
10034     if((regs[i].dirty>>2)&1) printf("r2 ");
10035     if((regs[i].dirty>>3)&1) printf("r3 ");
10036     if((regs[i].dirty>>4)&1) printf("r4 ");
10037     if((regs[i].dirty>>5)&1) printf("r5 ");
10038     if((regs[i].dirty>>6)&1) printf("r6 ");
10039     if((regs[i].dirty>>7)&1) printf("r7 ");
10040     if((regs[i].dirty>>8)&1) printf("r8 ");
10041     if((regs[i].dirty>>9)&1) printf("r9 ");
10042     if((regs[i].dirty>>10)&1) printf("r10 ");
10043     if((regs[i].dirty>>12)&1) printf("r12 ");
10044     #endif
10045     printf("\n");
10046     if(regs[i].isconst) {
10047       printf("constants: ");
10048       #if defined(__i386__) || defined(__x86_64__)
10049       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10050       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10051       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10052       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10053       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10054       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10055       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10056       #endif
10057       #ifdef __arm__
10058       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10059       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10060       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10061       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10062       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10063       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10064       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10065       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10066       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10067       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10068       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10069       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10070       #endif
10071       printf("\n");
10072     }
10073     printf(" 32:");
10074     for(r=0;r<=CCREG;r++) {
10075       if((regs[i].is32>>r)&1) {
10076         if(r==CCREG) printf(" CC");
10077         else if(r==HIREG) printf(" HI");
10078         else if(r==LOREG) printf(" LO");
10079         else printf(" r%d",r);
10080       }
10081     }
10082     printf("\n");
10083     /*printf(" p32:");
10084     for(r=0;r<=CCREG;r++) {
10085       if((p32[i]>>r)&1) {
10086         if(r==CCREG) printf(" CC");
10087         else if(r==HIREG) printf(" HI");
10088         else if(r==LOREG) printf(" LO");
10089         else printf(" r%d",r);
10090       }
10091     }
10092     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10093     else printf("\n");*/
10094     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10095       #if defined(__i386__) || defined(__x86_64__)
10096       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10097       if(branch_regs[i].dirty&1) printf("eax ");
10098       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10099       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10100       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10101       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10102       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10103       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10104       #endif
10105       #ifdef __arm__
10106       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10107       if(branch_regs[i].dirty&1) printf("r0 ");
10108       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10109       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10110       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10111       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10112       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10113       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10114       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10115       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10116       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10117       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10118       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10119       #endif
10120       printf(" 32:");
10121       for(r=0;r<=CCREG;r++) {
10122         if((branch_regs[i].is32>>r)&1) {
10123           if(r==CCREG) printf(" CC");
10124           else if(r==HIREG) printf(" HI");
10125           else if(r==LOREG) printf(" LO");
10126           else printf(" r%d",r);
10127         }
10128       }
10129       printf("\n");
10130     }
10131   }
10132
10133   /* Pass 8 - Assembly */
10134   linkcount=0;stubcount=0;
10135   ds=0;is_delayslot=0;
10136   cop1_usable=0;
10137   uint64_t is32_pre=0;
10138   u_int dirty_pre=0;
10139   u_int beginning=(u_int)out;
10140   if((u_int)addr&1) {
10141     ds=1;
10142     pagespan_ds();
10143   }
10144   for(i=0;i<slen;i++)
10145   {
10146     //if(ds) printf("ds: ");
10147     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10148     if(ds) {
10149       ds=0; // Skip delay slot
10150       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10151       instr_addr[i]=0;
10152     } else {
10153       #ifndef DESTRUCTIVE_WRITEBACK
10154       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10155       {
10156         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10157               unneeded_reg[i],unneeded_reg_upper[i]);
10158         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10159               unneeded_reg[i],unneeded_reg_upper[i]);
10160       }
10161       is32_pre=regs[i].is32;
10162       dirty_pre=regs[i].dirty;
10163       #endif
10164       // write back
10165       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10166       {
10167         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10168                       unneeded_reg[i],unneeded_reg_upper[i]);
10169         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10170       }
10171       // branch target entry point
10172       instr_addr[i]=(u_int)out;
10173       assem_debug("<->\n");
10174       // load regs
10175       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10176         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10177       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10178       address_generation(i,&regs[i],regs[i].regmap_entry);
10179       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10180       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10181       {
10182         // Load the delay slot registers if necessary
10183         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10184           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10185         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10186           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10187         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10188           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10189       }
10190       else if(i+1<slen)
10191       {
10192         // Preload registers for following instruction
10193         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10194           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10195             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10196         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10197           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10198             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10199       }
10200       // TODO: if(is_ooo(i)) address_generation(i+1);
10201       if(itype[i]==CJUMP||itype[i]==FJUMP)
10202         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10203       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10204         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10205       if(bt[i]) cop1_usable=0;
10206       // assemble
10207       switch(itype[i]) {
10208         case ALU:
10209           alu_assemble(i,&regs[i]);break;
10210         case IMM16:
10211           imm16_assemble(i,&regs[i]);break;
10212         case SHIFT:
10213           shift_assemble(i,&regs[i]);break;
10214         case SHIFTIMM:
10215           shiftimm_assemble(i,&regs[i]);break;
10216         case LOAD:
10217           load_assemble(i,&regs[i]);break;
10218         case LOADLR:
10219           loadlr_assemble(i,&regs[i]);break;
10220         case STORE:
10221           store_assemble(i,&regs[i]);break;
10222         case STORELR:
10223           storelr_assemble(i,&regs[i]);break;
10224         case COP0:
10225           cop0_assemble(i,&regs[i]);break;
10226         case COP1:
10227           cop1_assemble(i,&regs[i]);break;
10228         case C1LS:
10229           c1ls_assemble(i,&regs[i]);break;
10230         case FCONV:
10231           fconv_assemble(i,&regs[i]);break;
10232         case FLOAT:
10233           float_assemble(i,&regs[i]);break;
10234         case FCOMP:
10235           fcomp_assemble(i,&regs[i]);break;
10236         case MULTDIV:
10237           multdiv_assemble(i,&regs[i]);break;
10238         case MOV:
10239           mov_assemble(i,&regs[i]);break;
10240         case SYSCALL:
10241           syscall_assemble(i,&regs[i]);break;
10242         case UJUMP:
10243           ujump_assemble(i,&regs[i]);ds=1;break;
10244         case RJUMP:
10245           rjump_assemble(i,&regs[i]);ds=1;break;
10246         case CJUMP:
10247           cjump_assemble(i,&regs[i]);ds=1;break;
10248         case SJUMP:
10249           sjump_assemble(i,&regs[i]);ds=1;break;
10250         case FJUMP:
10251           fjump_assemble(i,&regs[i]);ds=1;break;
10252         case SPAN:
10253           pagespan_assemble(i,&regs[i]);break;
10254       }
10255       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10256         literal_pool(1024);
10257       else
10258         literal_pool_jumpover(256);
10259     }
10260   }
10261   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10262   // If the block did not end with an unconditional branch,
10263   // add a jump to the next instruction.
10264   if(i>1) {
10265     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10266       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10267       assert(i==slen);
10268       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10269         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10270         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10271           emit_loadreg(CCREG,HOST_CCREG);
10272         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10273       }
10274       else if(!likely[i-2])
10275       {
10276         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10277         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10278       }
10279       else
10280       {
10281         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10282         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10283       }
10284       add_to_linker((int)out,start+i*4,0);
10285       emit_jmp(0);
10286     }
10287   }
10288   else
10289   {
10290     assert(i>0);
10291     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10292     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10293     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10294       emit_loadreg(CCREG,HOST_CCREG);
10295     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10296     add_to_linker((int)out,start+i*4,0);
10297     emit_jmp(0);
10298   }
10299
10300   // TODO: delay slot stubs?
10301   // Stubs
10302   for(i=0;i<stubcount;i++)
10303   {
10304     switch(stubs[i][0])
10305     {
10306       case LOADB_STUB:
10307       case LOADH_STUB:
10308       case LOADW_STUB:
10309       case LOADD_STUB:
10310       case LOADBU_STUB:
10311       case LOADHU_STUB:
10312         do_readstub(i);break;
10313       case STOREB_STUB:
10314       case STOREH_STUB:
10315       case STOREW_STUB:
10316       case STORED_STUB:
10317         do_writestub(i);break;
10318       case CC_STUB:
10319         do_ccstub(i);break;
10320       case INVCODE_STUB:
10321         do_invstub(i);break;
10322       case FP_STUB:
10323         do_cop1stub(i);break;
10324       case STORELR_STUB:
10325         do_unalignedwritestub(i);break;
10326     }
10327   }
10328
10329   /* Pass 9 - Linker */
10330   for(i=0;i<linkcount;i++)
10331   {
10332     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10333     literal_pool(64);
10334     if(!link_addr[i][2])
10335     {
10336       void *stub=out;
10337       void *addr=check_addr(link_addr[i][1]);
10338       emit_extjump(link_addr[i][0],link_addr[i][1]);
10339       if(addr) {
10340         set_jump_target(link_addr[i][0],(int)addr);
10341         add_link(link_addr[i][1],stub);
10342       }
10343       else set_jump_target(link_addr[i][0],(int)stub);
10344     }
10345     else
10346     {
10347       // Internal branch
10348       int target=(link_addr[i][1]-start)>>2;
10349       assert(target>=0&&target<slen);
10350       assert(instr_addr[target]);
10351       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10352       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10353       //#else
10354       set_jump_target(link_addr[i][0],instr_addr[target]);
10355       //#endif
10356     }
10357   }
10358   // External Branch Targets (jump_in)
10359   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10360   for(i=0;i<slen;i++)
10361   {
10362     if(bt[i]||i==0)
10363     {
10364       if(instr_addr[i]) // TODO - delay slots (=null)
10365       {
10366         u_int vaddr=start+i*4;
10367         u_int page=get_page(vaddr);
10368         u_int vpage=get_vpage(vaddr);
10369         literal_pool(256);
10370         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10371         if(!requires_32bit[i])
10372         {
10373           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10374           assem_debug("jump_in: %x\n",start+i*4);
10375           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10376           int entry_point=do_dirty_stub(i);
10377           ll_add(jump_in+page,vaddr,(void *)entry_point);
10378           // If there was an existing entry in the hash table,
10379           // replace it with the new address.
10380           // Don't add new entries.  We'll insert the
10381           // ones that actually get used in check_addr().
10382           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10383           if(ht_bin[0]==vaddr) {
10384             ht_bin[1]=entry_point;
10385           }
10386           if(ht_bin[2]==vaddr) {
10387             ht_bin[3]=entry_point;
10388           }
10389         }
10390         else
10391         {
10392           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10393           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10394           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10395           //int entry_point=(int)out;
10396           ////assem_debug("entry_point: %x\n",entry_point);
10397           //load_regs_entry(i);
10398           //if(entry_point==(int)out)
10399           //  entry_point=instr_addr[i];
10400           //else
10401           //  emit_jmp(instr_addr[i]);
10402           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10403           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10404           int entry_point=do_dirty_stub(i);
10405           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10406         }
10407       }
10408     }
10409   }
10410   // Write out the literal pool if necessary
10411   literal_pool(0);
10412   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10413   // Align code
10414   if(((u_int)out)&7) emit_addnop(13);
10415   #endif
10416   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10417   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10418   memcpy(copy,source,slen*4);
10419   copy+=slen*4;
10420   
10421   #ifdef __arm__
10422   __clear_cache((void *)beginning,out);
10423   #endif
10424   
10425   // If we're within 256K of the end of the buffer,
10426   // start over from the beginning. (Is 256K enough?)
10427   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10428   
10429   // Trap writes to any of the pages we compiled
10430   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10431     invalid_code[i]=0;
10432     memory_map[i]|=0x40000000;
10433     if((signed int)start>=(signed int)0xC0000000) {
10434       assert(using_tlb);
10435       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
10436       invalid_code[j]=0;
10437       memory_map[j]|=0x40000000;
10438       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
10439     }
10440   }
10441   
10442   /* Pass 10 - Free memory by expiring oldest blocks */
10443   
10444   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10445   while(expirep!=end)
10446   {
10447     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10448     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10449     inv_debug("EXP: Phase %d\n",expirep);
10450     switch((expirep>>11)&3)
10451     {
10452       case 0:
10453         // Clear jump_in and jump_dirty
10454         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10455         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10456         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10457         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10458         break;
10459       case 1:
10460         // Clear pointers
10461         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10462         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10463         break;
10464       case 2:
10465         // Clear hash table
10466         for(i=0;i<32;i++) {
10467           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10468           if((ht_bin[3]>>shift)==(base>>shift) ||
10469              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10470             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10471             ht_bin[2]=ht_bin[3]=-1;
10472           }
10473           if((ht_bin[1]>>shift)==(base>>shift) ||
10474              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10475             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10476             ht_bin[0]=ht_bin[2];
10477             ht_bin[1]=ht_bin[3];
10478             ht_bin[2]=ht_bin[3]=-1;
10479           }
10480         }
10481         break;
10482       case 3:
10483         // Clear jump_out
10484         #ifdef __arm__
10485         if((expirep&2047)==0)
10486           __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
10487         #endif
10488         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10489         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10490         break;
10491     }
10492     expirep=(expirep+1)&65535;
10493   }
10494   return 0;
10495 }