1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
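// MAXBLOCK bounds the number of decoded source instructions per compiled
// block (the per-instruction arrays below are sized by it); 4096 instructions
// is 16KB of MIPS code, i.e. at most four adjacent 4KB pages, which is what
// the MAXBLOCK<=4096 assertion in invalidate_block() relies on.
// MAX_OUTPUT_BLOCK_SIZE is the most translated code a single block is
// expected to emit, and is used below when judging how close a block sits
// to the end of the translation cache.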
42
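// regstat tracks the per-instruction register allocation state. A rough map
// of the fields, inferred from how they are used below:
//   regmap_entry/regmap - guest register held by each host register at the
//                         start of / after this instruction (-1 = free)
//   was32/is32          - bitmask (by guest reg) of values known to be
//                         32-bit sign-extended before / after
//   wasdirty/dirty      - bitmask (by host reg) of values that differ from
//                         the in-memory guest registers and need writeback
//   u/uu                - bitmask (by guest reg) of lower/upper halves whose
//                         value is not needed afterwards
//   wasconst/isconst    - bitmask (by host reg) of known-constant values,
//                         with the constants themselves kept in constmap[]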
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
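// One node of the per-page block lists declared further down: jump_in[]
// holds compiled entry points, jump_out[] the sites of outgoing links (so
// they can be severed on invalidation), and jump_dirty[] blocks whose
// source memory may have changed and must be re-verified before reuse.
// reg32 appears to record the block's 32-bit register assumptions
// (cf. requires_32bit[]); entries added with plain ll_add() leave it 0.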
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
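  // hash_table is a small 2-way cache keyed by ((vaddr>>16)^vaddr)&0xFFFF;
  // each bin holds two {vaddr,addr} pairs as {bin[0],bin[1],bin[2],bin[3]},
  // with the most recently used pair kept in slots 0/1 (see get_addr_ht).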
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
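// Note on the numbering above: 1-31 are the MIPS GPRs and 32+ are
// pseudo-registers; a regmap entry of 64+r apparently stands for the upper
// 32 bits of guest register r, hence the frequent "&63", "|64" and "^64"
// manipulations in the code below.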
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178
179   /* stubs */
180 #define CC_STUB 1
181 #define FP_STUB 2
182 #define LOADB_STUB 3
183 #define LOADH_STUB 4
184 #define LOADW_STUB 5
185 #define LOADD_STUB 6
186 #define LOADBU_STUB 7
187 #define LOADHU_STUB 8
188 #define STOREB_STUB 9
189 #define STOREH_STUB 10
190 #define STOREW_STUB 11
191 #define STORED_STUB 12
192 #define STORELR_STUB 13
193 #define INVCODE_STUB 14
194
195   /* branch codes */
196 #define TAKEN 1
197 #define NOTTAKEN 2
198 #define NULLDS 3
199
200 // asm linkage
201 int new_recompile_block(int addr);
202 void *get_addr_ht(u_int vaddr);
203 void invalidate_block(u_int block);
204 void invalidate_addr(u_int addr);
205 void remove_hash(int vaddr);
206 void jump_vaddr();
207 void dyna_linker();
208 void dyna_linker_ds();
209 void verify_code();
210 void verify_code_vm();
211 void verify_code_ds();
212 void cc_interrupt();
213 void fp_exception();
214 void fp_exception_ds();
215 void jump_syscall();
216 void jump_eret();
217
218 // TLB
219 void TLBWI_new();
220 void TLBWR_new();
221 void read_nomem_new();
222 void read_nomemb_new();
223 void read_nomemh_new();
224 void read_nomemd_new();
225 void write_nomem_new();
226 void write_nomemb_new();
227 void write_nomemh_new();
228 void write_nomemd_new();
229 void write_rdram_new();
230 void write_rdramb_new();
231 void write_rdramh_new();
232 void write_rdramd_new();
233 extern u_int memory_map[1048576];
234
235 // Needed by assembler
236 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
237 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
238 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
239 void load_all_regs(signed char i_regmap[]);
240 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
241 void load_regs_entry(int t);
242 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
243
244 int tracedebug=0;
245
246 //#define DEBUG_CYCLE_COUNT 1
247
248 void nullf() {}
249 //#define assem_debug printf
250 //#define inv_debug printf
251 #define assem_debug nullf
252 #define inv_debug nullf
253
254 static void tlb_hacks()
255 {
256 #ifndef DISABLE_TLB
257   // Goldeneye hack
258   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
259   {
260     u_int addr;
261     int n;
262     switch (ROM_HEADER->Country_code&0xFF) 
263     {
264       case 0x45: // U
265         addr=0x34b30;
266         break;                   
267       case 0x4A: // J 
268         addr=0x34b70;    
269         break;    
270       case 0x50: // E 
271         addr=0x329f0;
272         break;                        
273       default: 
274         // Unknown country code
275         addr=0;
276         break;
277     }
278     u_int rom_addr=(u_int)rom;
279     #ifdef ROM_COPY
280     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
281     // in the lower 4G of memory to use this hack.  Copy it if necessary.
282     if((void *)rom>(void *)0xffffffff) {
283       munmap(ROM_COPY, 67108864);
284       if(mmap(ROM_COPY, 12582912,
285               PROT_READ | PROT_WRITE,
286               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
287               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
288       memcpy(ROM_COPY,rom,12582912);
289       rom_addr=(u_int)ROM_COPY;
290     }
291     #endif
292     if(addr) {
293       for(n=0x7F000;n<0x80000;n++) {
294         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
295       }
296     }
297   }
298 #endif
299 }
300
301 static u_int get_page(u_int vaddr)
302 {
303   u_int page=(vaddr^0x80000000)>>12;
304 #ifndef DISABLE_TLB
305   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
306 #endif
307   if(page>2048) page=2048+(page&2047);
308   return page;
309 }
310
311 static u_int get_vpage(u_int vaddr)
312 {
313   u_int vpage=(vaddr^0x80000000)>>12;
314 #ifndef DISABLE_TLB
315   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
316 #endif
317   if(vpage>2048) vpage=2048+(vpage&2047);
318   return vpage;
319 }
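// Example of the mapping above (non-TLB case): the XOR with 0x80000000 makes
// KSEG0 addresses start at page 0, so 0x80001000 -> page 1 and 0x801ff000 ->
// page 511, while anything beyond the first 8MB folds into pages 2048-4095,
// e.g. 0xa0001000 -> raw page 0x20001 -> 2048+(0x20001&2047) = 2049.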
320
321 // Get address from virtual address
322 // This is called from the recompiled JR/JALR instructions
323 void *get_addr(u_int vaddr)
324 {
325   u_int page=get_page(vaddr);
326   u_int vpage=get_vpage(vaddr);
327   struct ll_entry *head;
328   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
329   head=jump_in[page];
330   while(head!=NULL) {
331     if(head->vaddr==vaddr&&head->reg32==0) {
332   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
333       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
334       ht_bin[3]=ht_bin[1];
335       ht_bin[2]=ht_bin[0];
336       ht_bin[1]=(int)head->addr;
337       ht_bin[0]=vaddr;
338       return head->addr;
339     }
340     head=head->next;
341   }
342   head=jump_dirty[vpage];
343   while(head!=NULL) {
344     if(head->vaddr==vaddr&&head->reg32==0) {
345       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
346       // Don't restore blocks which are about to expire from the cache
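      // (The translated code lives in a circular buffer of 1<<TARGET_SIZE_2
      // bytes; the shifted subtraction below is, roughly, the block's
      // distance ahead of the current output pointer modulo that size, so
      // anything that the next ~MAX_OUTPUT_BLOCK_SIZE of output could
      // overwrite is treated as already expired rather than revived.)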
347       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
348       if(verify_dirty(head->addr)) {
349         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
350         invalid_code[vaddr>>12]=0;
351         memory_map[vaddr>>12]|=0x40000000;
352         if(vpage<2048) {
353 #ifndef DISABLE_TLB
354           if(tlb_LUT_r[vaddr>>12]) {
355             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
356             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
357           }
358 #endif
359           restore_candidate[vpage>>3]|=1<<(vpage&7);
360         }
361         else restore_candidate[page>>3]|=1<<(page&7);
362         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
363         if(ht_bin[0]==vaddr) {
364           ht_bin[1]=(int)head->addr; // Replace existing entry
365         }
366         else
367         {
368           ht_bin[3]=ht_bin[1];
369           ht_bin[2]=ht_bin[0];
370           ht_bin[1]=(int)head->addr;
371           ht_bin[0]=vaddr;
372         }
373         return head->addr;
374       }
375     }
376     head=head->next;
377   }
378   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
379   int r=new_recompile_block(vaddr);
380   if(r==0) return get_addr(vaddr);
381   // Execute in unmapped page, generate pagefault exception
382   Status|=2;
383   Cause=(vaddr<<31)|0x8;
384   EPC=(vaddr&1)?vaddr-5:vaddr;
385   BadVAddr=(vaddr&~1);
386   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
387   EntryHi=BadVAddr&0xFFFFE000;
388   return get_addr_ht(0x80000000);
389 }
390 // Look up address in hash table first
391 void *get_addr_ht(u_int vaddr)
392 {
393   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
394   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
395   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
396   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
397   return get_addr(vaddr);
398 }
399
400 void *get_addr_32(u_int vaddr,u_int flags)
401 {
402   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
403   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
404   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
405   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
406   u_int page=get_page(vaddr);
407   u_int vpage=get_vpage(vaddr);
408   struct ll_entry *head;
409   head=jump_in[page];
410   while(head!=NULL) {
411     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
412       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
413       if(head->reg32==0) {
414         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
415         if(ht_bin[0]==-1) {
416           ht_bin[1]=(int)head->addr;
417           ht_bin[0]=vaddr;
418         }else if(ht_bin[2]==-1) {
419           ht_bin[3]=(int)head->addr;
420           ht_bin[2]=vaddr;
421         }
422         //ht_bin[3]=ht_bin[1];
423         //ht_bin[2]=ht_bin[0];
424         //ht_bin[1]=(int)head->addr;
425         //ht_bin[0]=vaddr;
426       }
427       return head->addr;
428     }
429     head=head->next;
430   }
431   head=jump_dirty[vpage];
432   while(head!=NULL) {
433     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
434       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
435       // Don't restore blocks which are about to expire from the cache
436       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
437       if(verify_dirty(head->addr)) {
438         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
439         invalid_code[vaddr>>12]=0;
440         memory_map[vaddr>>12]|=0x40000000;
441         if(vpage<2048) {
442 #ifndef DISABLE_TLB
443           if(tlb_LUT_r[vaddr>>12]) {
444             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
445             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
446           }
447 #endif
448           restore_candidate[vpage>>3]|=1<<(vpage&7);
449         }
450         else restore_candidate[page>>3]|=1<<(page&7);
451         if(head->reg32==0) {
452           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
453           if(ht_bin[0]==-1) {
454             ht_bin[1]=(int)head->addr;
455             ht_bin[0]=vaddr;
456           }else if(ht_bin[2]==-1) {
457             ht_bin[3]=(int)head->addr;
458             ht_bin[2]=vaddr;
459           }
460           //ht_bin[3]=ht_bin[1];
461           //ht_bin[2]=ht_bin[0];
462           //ht_bin[1]=(int)head->addr;
463           //ht_bin[0]=vaddr;
464         }
465         return head->addr;
466       }
467     }
468     head=head->next;
469   }
470   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
471   int r=new_recompile_block(vaddr);
472   if(r==0) return get_addr(vaddr);
473   // Execute in unmapped page, generate pagefault exception
474   Status|=2;
475   Cause=(vaddr<<31)|0x8;
476   EPC=(vaddr&1)?vaddr-5:vaddr;
477   BadVAddr=(vaddr&~1);
478   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
479   EntryHi=BadVAddr&0xFFFFE000;
480   return get_addr_ht(0x80000000);
481 }
482
483 void clear_all_regs(signed char regmap[])
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
487 }
488
489 signed char get_reg(signed char regmap[],int r)
490 {
491   int hr;
492   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
493   return -1;
494 }
495
496 // Find a register that is available for two consecutive cycles
497 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
498 {
499   int hr;
500   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
501   return -1;
502 }
503
504 int count_free_regs(signed char regmap[])
505 {
506   int count=0;
507   int hr;
508   for(hr=0;hr<HOST_REGS;hr++)
509   {
510     if(hr!=EXCLUDE_REG) {
511       if(regmap[hr]<0) count++;
512     }
513   }
514   return count;
515 }
516
517 void dirty_reg(struct regstat *cur,signed char reg)
518 {
519   int hr;
520   if(!reg) return;
521   for (hr=0;hr<HOST_REGS;hr++) {
522     if((cur->regmap[hr]&63)==reg) {
523       cur->dirty|=1<<hr;
524     }
525   }
526 }
527
528 // If we dirty the lower half of a 64-bit register which is now being
529 // sign-extended, we need to dump the upper half.
530 // Note: Do this only after completion of the instruction, because
531 // some instructions may need to read the full 64-bit value even if
532 // overwriting it (e.g. SLTI, DSRA32).
533 static void flush_dirty_uppers(struct regstat *cur)
534 {
535   int hr,reg;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if((cur->dirty>>hr)&1) {
538       reg=cur->regmap[hr];
539       if(reg>=64) 
540         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
541     }
542   }
543 }
544
545 void set_const(struct regstat *cur,signed char reg,uint64_t value)
546 {
547   int hr;
548   if(!reg) return;
549   for (hr=0;hr<HOST_REGS;hr++) {
550     if(cur->regmap[hr]==reg) {
551       cur->isconst|=1<<hr;
552       cur->constmap[hr]=value;
553     }
554     else if((cur->regmap[hr]^64)==reg) {
555       cur->isconst|=1<<hr;
556       cur->constmap[hr]=value>>32;
557     }
558   }
559 }
560
561 void clear_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(!reg) return;
565   for (hr=0;hr<HOST_REGS;hr++) {
566     if((cur->regmap[hr]&63)==reg) {
567       cur->isconst&=~(1<<hr);
568     }
569   }
570 }
571
572 int is_const(struct regstat *cur,signed char reg)
573 {
574   int hr;
575   if(!reg) return 1;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if((cur->regmap[hr]&63)==reg) {
578       return (cur->isconst>>hr)&1;
579     }
580   }
581   return 0;
582 }
583 uint64_t get_const(struct regstat *cur,signed char reg)
584 {
585   int hr;
586   if(!reg) return 0;
587   for (hr=0;hr<HOST_REGS;hr++) {
588     if(cur->regmap[hr]==reg) {
589       return cur->constmap[hr];
590     }
591   }
592   printf("Unknown constant in r%d\n",reg);
593   exit(1);
594 }
595
596 // Least soon needed registers
597 // Look at the next ten instructions and see which registers
598 // will be used.  Try not to reallocate these.
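// On return, hsn[r] is (roughly) how many instructions ahead of i guest
// register r is next read or written; small values mean "needed soon, avoid
// evicting it".  Registers never seen keep whatever the caller initialized
// the array to (needed_again() presets every entry to 10).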
599 void lsn(u_char hsn[], int i, int *preferred_reg)
600 {
601   int j;
602   int b=-1;
603   for(j=0;j<9;j++)
604   {
605     if(i+j>=slen) {
606       j=slen-i-1;
607       break;
608     }
609     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
610     {
611       // Don't go past an unconditional jump
612       j++;
613       break;
614     }
615   }
616   for(;j>=0;j--)
617   {
618     if(rs1[i+j]) hsn[rs1[i+j]]=j;
619     if(rs2[i+j]) hsn[rs2[i+j]]=j;
620     if(rt1[i+j]) hsn[rt1[i+j]]=j;
621     if(rt2[i+j]) hsn[rt2[i+j]]=j;
622     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
623       // Stores can allocate zero
624       hsn[rs1[i+j]]=j;
625       hsn[rs2[i+j]]=j;
626     }
627     // On some architectures stores need invc_ptr
628     #if defined(HOST_IMM8)
629     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
630       hsn[INVCP]=j;
631     }
632     #endif
633     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
634     {
635       hsn[CCREG]=j;
636       b=j;
637     }
638   }
639   if(b>=0)
640   {
641     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
642     {
643       // Follow first branch
644       int t=(ba[i+b]-start)>>2;
645       j=7-b;if(t+j>=slen) j=slen-t-1;
646       for(;j>=0;j--)
647       {
648         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
649         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
650         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
651         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
652       }
653     }
654     // TODO: preferred register based on backward branch
655   }
656   // Delay slot should preferably not overwrite branch conditions or cycle count
657   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
658     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
659     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
660     hsn[CCREG]=1;
661     // ...or hash tables
662     hsn[RHASH]=1;
663     hsn[RHTBL]=1;
664   }
665   // Coprocessor load/store needs FTEMP, even if not declared
666   if(itype[i]==C1LS) {
667     hsn[FTEMP]=0;
668   }
669   // Load L/R also uses FTEMP as a temporary register
670   if(itype[i]==LOADLR) {
671     hsn[FTEMP]=0;
672   }
673   // Also 64-bit SDL/SDR
674   if(opcode[i]==0x2c||opcode[i]==0x2d) {
675     hsn[FTEMP]=0;
676   }
677   // Don't remove the TLB registers either
678   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
679     hsn[TLREG]=0;
680   }
681   // Don't remove the miniht registers
682   if(itype[i]==UJUMP||itype[i]==RJUMP)
683   {
684     hsn[RHASH]=0;
685     hsn[RHTBL]=0;
686   }
687 }
688
689 // We only want to allocate registers if we're going to use them again soon
690 int needed_again(int r, int i)
691 {
692   int j;
693   int b=-1;
694   int rn=10;
695   int hr;
696   u_char hsn[MAXREG+1];
697   int preferred_reg;
698   
699   memset(hsn,10,sizeof(hsn));
700   lsn(hsn,i,&preferred_reg);
701   
702   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
703   {
704     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
705       return 0; // Don't need any registers if exiting the block
706   }
707   for(j=0;j<9;j++)
708   {
709     if(i+j>=slen) {
710       j=slen-i-1;
711       break;
712     }
713     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
714     {
715       // Don't go past an unconditional jump
716       j++;
717       break;
718     }
719     if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d))
720     {
721       break;
722     }
723   }
724   for(;j>=1;j--)
725   {
726     if(rs1[i+j]==r) rn=j;
727     if(rs2[i+j]==r) rn=j;
728     if((unneeded_reg[i+j]>>r)&1) rn=10;
729     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
730     {
731       b=j;
732     }
733   }
734   /*
735   if(b>=0)
736   {
737     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
738     {
739       // Follow first branch
740       int o=rn;
741       int t=(ba[i+b]-start)>>2;
742       j=7-b;if(t+j>=slen) j=slen-t-1;
743       for(;j>=0;j--)
744       {
745         if(!((unneeded_reg[t+j]>>r)&1)) {
746           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
747           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
748         }
749         else rn=o;
750       }
751     }
752   }*/
753   for(hr=0;hr<HOST_REGS;hr++) {
754     if(hr!=EXCLUDE_REG) {
755       if(rn<hsn[hr]) return 1;
756     }
757   }
758   return 0;
759 }
760
761 // Try to match register allocations at the end of a loop with those
762 // at the beginning
763 int loop_reg(int i, int r, int hr)
764 {
765   int j,k;
766   for(j=0;j<9;j++)
767   {
768     if(i+j>=slen) {
769       j=slen-i-1;
770       break;
771     }
772     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
773     {
774       // Don't go past an unconditional jump
775       j++;
776       break;
777     }
778   }
779   k=0;
780   if(i>0){
781     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
782       k--;
783   }
784   for(;k<j;k++)
785   {
786     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
787     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
788     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
789     {
790       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
791       {
792         int t=(ba[i+k]-start)>>2;
793         int reg=get_reg(regs[t].regmap_entry,r);
794         if(reg>=0) return reg;
795         //reg=get_reg(regs[t+1].regmap_entry,r);
796         //if(reg>=0) return reg;
797       }
798     }
799   }
800   return hr;
801 }
802
803
804 // Allocate every register, preserving source/target regs
805 void alloc_all(struct regstat *cur,int i)
806 {
807   int hr;
808   
809   for(hr=0;hr<HOST_REGS;hr++) {
810     if(hr!=EXCLUDE_REG) {
811       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
812          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
813       {
814         cur->regmap[hr]=-1;
815         cur->dirty&=~(1<<hr);
816       }
817       // Don't need zeros
818       if((cur->regmap[hr]&63)==0)
819       {
820         cur->regmap[hr]=-1;
821         cur->dirty&=~(1<<hr);
822       }
823     }
824   }
825 }
826
827
828 void div64(int64_t dividend,int64_t divisor)
829 {
830   lo=dividend/divisor;
831   hi=dividend%divisor;
832   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
833   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
834 }
835 void divu64(uint64_t dividend,uint64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842
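// mult64/multu64 compute the full 128-bit product from four 32x32 partial
// products, essentially
//   (a_hi*2^32 + a_lo)*(b_hi*2^32 + b_lo)
//     = a_hi*b_hi*2^64 + (a_hi*b_lo + a_lo*b_hi)*2^32 + a_lo*b_lo
// with the carries between 32-bit limbs accumulated by hand; mult64 strips
// the signs first and negates the 128-bit result at the end when exactly one
// operand was negative.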
843 void mult64(int64_t m1,int64_t m2) // operands are signed so the m1<0 / m2<0 checks below take effect
844 {
845    unsigned long long int op1, op2, op3, op4;
846    unsigned long long int result1, result2, result3, result4;
847    unsigned long long int temp1, temp2, temp3, temp4;
848    int sign = 0;
849    
850    if (m1 < 0)
851      {
852     op2 = -m1;
853     sign = 1 - sign;
854      }
855    else op2 = m1;
856    if (m2 < 0)
857      {
858     op4 = -m2;
859     sign = 1 - sign;
860      }
861    else op4 = m2;
862    
863    op1 = op2 & 0xFFFFFFFF;
864    op2 = (op2 >> 32) & 0xFFFFFFFF;
865    op3 = op4 & 0xFFFFFFFF;
866    op4 = (op4 >> 32) & 0xFFFFFFFF;
867    
868    temp1 = op1 * op3;
869    temp2 = (temp1 >> 32) + op1 * op4;
870    temp3 = op2 * op3;
871    temp4 = (temp3 >> 32) + op2 * op4;
872    
873    result1 = temp1 & 0xFFFFFFFF;
874    result2 = temp2 + (temp3 & 0xFFFFFFFF);
875    result3 = (result2 >> 32) + temp4;
876    result4 = (result3 >> 32);
877    
878    lo = result1 | (result2 << 32);
879    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
880    if (sign)
881      {
882     hi = ~hi;
883     if (!lo) hi++;
884     else lo = ~lo + 1;
885      }
886 }
887
888 void multu64(uint64_t m1,uint64_t m2)
889 {
890    unsigned long long int op1, op2, op3, op4;
891    unsigned long long int result1, result2, result3, result4;
892    unsigned long long int temp1, temp2, temp3, temp4;
893    
894    op1 = m1 & 0xFFFFFFFF;
895    op2 = (m1 >> 32) & 0xFFFFFFFF;
896    op3 = m2 & 0xFFFFFFFF;
897    op4 = (m2 >> 32) & 0xFFFFFFFF;
898    
899    temp1 = op1 * op3;
900    temp2 = (temp1 >> 32) + op1 * op4;
901    temp3 = op2 * op3;
902    temp4 = (temp3 >> 32) + op2 * op4;
903    
904    result1 = temp1 & 0xFFFFFFFF;
905    result2 = temp2 + (temp3 & 0xFFFFFFFF);
906    result3 = (result2 >> 32) + temp4;
907    result4 = (result3 >> 32);
908    
909    lo = result1 | (result2 << 32);
910    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
911    
912   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
913   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
914 }
915
916 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
917 {
918   if(bits) {
919     original<<=64-bits;
920     original>>=64-bits;
921     loaded<<=bits;
922     original|=loaded;
923   }
924   else original=loaded;
925   return original;
926 }
927 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
928 {
929   if(bits^56) {
930     original>>=64-(bits^56);
931     original<<=64-(bits^56);
932     loaded>>=bits^56;
933     original|=loaded;
934   }
935   else original=loaded;
936   return original;
937 }
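// Merge helpers for the unaligned 64-bit loads: with bits=16, for example,
// ldl_merge returns (loaded<<16)|(original&0xffff), so the low 16 bits of
// the old register survive and the loaded data fills the upper 48 bits.
// ldr_merge is the mirror image, keyed on bits^56: it keeps the high bits of
// the original and fills the low bits from loaded>>(bits^56).  The generated
// code presumably derives "bits" from the address alignment.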
938
939 #ifdef __i386__
940 #include "assem_x86.c"
941 #endif
942 #ifdef __x86_64__
943 #include "assem_x64.c"
944 #endif
945 #ifdef __arm__
946 #include "assem_arm.c"
947 #endif
948
949 // Add virtual address mapping to linked list
950 void ll_add(struct ll_entry **head,int vaddr,void *addr)
951 {
952   struct ll_entry *new_entry;
953   new_entry=malloc(sizeof(struct ll_entry));
954   assert(new_entry!=NULL);
955   new_entry->vaddr=vaddr;
956   new_entry->reg32=0;
957   new_entry->addr=addr;
958   new_entry->next=*head;
959   *head=new_entry;
960 }
961
962 // Add virtual address mapping for 32-bit compiled block
963 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
964 {
965   struct ll_entry *new_entry;
966   new_entry=malloc(sizeof(struct ll_entry));
967   assert(new_entry!=NULL);
968   new_entry->vaddr=vaddr;
969   new_entry->reg32=reg32;
970   new_entry->addr=addr;
971   new_entry->next=*head;
972   *head=new_entry;
973 }
974
975 // Check if an address is already compiled
976 // but don't return addresses which are about to expire from the cache
977 void *check_addr(u_int vaddr)
978 {
979   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
980   if(ht_bin[0]==vaddr) {
981     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
982       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
983   }
984   if(ht_bin[2]==vaddr) {
985     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
986       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
987   }
988   u_int page=get_page(vaddr);
989   struct ll_entry *head;
990   head=jump_in[page];
991   while(head!=NULL) {
992     if(head->vaddr==vaddr&&head->reg32==0) {
993       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
994         // Update existing entry with current address
995         if(ht_bin[0]==vaddr) {
996           ht_bin[1]=(int)head->addr;
997           return head->addr;
998         }
999         if(ht_bin[2]==vaddr) {
1000           ht_bin[3]=(int)head->addr;
1001           return head->addr;
1002         }
1003         // Insert into hash table with low priority.
1004         // Don't evict existing entries, as they are probably
1005         // addresses that are being accessed frequently.
1006         if(ht_bin[0]==-1) {
1007           ht_bin[1]=(int)head->addr;
1008           ht_bin[0]=vaddr;
1009         }else if(ht_bin[2]==-1) {
1010           ht_bin[3]=(int)head->addr;
1011           ht_bin[2]=vaddr;
1012         }
1013         return head->addr;
1014       }
1015     }
1016     head=head->next;
1017   }
1018   return 0;
1019 }
1020
1021 void remove_hash(int vaddr)
1022 {
1023   //printf("remove hash: %x\n",vaddr);
1024   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1025   if(ht_bin[2]==vaddr) {
1026     ht_bin[2]=ht_bin[3]=-1;
1027   }
1028   if(ht_bin[0]==vaddr) {
1029     ht_bin[0]=ht_bin[2];
1030     ht_bin[1]=ht_bin[3];
1031     ht_bin[2]=ht_bin[3]=-1;
1032   }
1033 }
1034
1035 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1036 {
1037   struct ll_entry *next;
1038   while(*head) {
1039     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1040        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1041     {
1042       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1043       remove_hash((*head)->vaddr);
1044       next=(*head)->next;
1045       free(*head);
1046       *head=next;
1047     }
1048     else
1049     {
1050       head=&((*head)->next);
1051     }
1052   }
1053 }
1054
1055 // Remove all entries from linked list
1056 void ll_clear(struct ll_entry **head)
1057 {
1058   struct ll_entry *cur;
1059   struct ll_entry *next;
1060   if((cur=*head)!=NULL) {
1061     *head=0;
1062     while(cur) {
1063       next=cur->next;
1064       free(cur);
1065       cur=next;
1066     }
1067   }
1068 }
1069
1070 // Dereference the pointers and remove them if they match
1071 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1072 {
1073   while(head) {
1074     int ptr=get_pointer(head->addr);
1075     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1076     if(((ptr>>shift)==(addr>>shift)) ||
1077        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1078     {
1079       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1080       kill_pointer(head->addr);
1081     }
1082     head=head->next;
1083   }
1084 }
1085
1086 // This is called when we write to a compiled block (see do_invstub)
1087 int invalidate_page(u_int page)
1088 {
1089   int modified=0;
1090   struct ll_entry *head;
1091   struct ll_entry *next;
1092   head=jump_in[page];
1093   jump_in[page]=0;
1094   while(head!=NULL) {
1095     inv_debug("INVALIDATE: %x\n",head->vaddr);
1096     remove_hash(head->vaddr);
1097     next=head->next;
1098     free(head);
1099     head=next;
1100   }
1101   head=jump_out[page];
1102   jump_out[page]=0;
1103   while(head!=NULL) {
1104     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1105     kill_pointer(head->addr);
1106     modified=1;
1107     next=head->next;
1108     free(head);
1109     head=next;
1110   }
1111   return modified;
1112 }
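// Invalidate the compiled blocks covering a given 4KB page. A block may span
// neighbouring pages (up to MAXBLOCK instructions), so the dirty list is
// walked first to widen the [first,last] page range before the adjacent
// pages are invalidated as well.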
1113 void invalidate_block(u_int block)
1114 {
1115   int modified;
1116   u_int page=get_page(block<<12);
1117   u_int vpage=get_vpage(block<<12);
1118   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1119   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1120   u_int first,last;
1121   first=last=page;
1122   struct ll_entry *head;
1123   head=jump_dirty[vpage];
1124   //printf("page=%d vpage=%d\n",page,vpage);
1125   while(head!=NULL) {
1126     u_int start,end;
1127     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1128       get_bounds((int)head->addr,&start,&end);
1129       //printf("start: %x end: %x\n",start,end);
1130       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1131         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1132           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1133           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1134         }
1135       }
1136 #ifndef DISABLE_TLB
1137       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1138         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1139           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1140           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1141         }
1142       }
1143 #endif
1144     }
1145     head=head->next;
1146   }
1147   //printf("first=%d last=%d\n",first,last);
1148   modified=invalidate_page(page);
1149   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1150   assert(last<page+5);
1151   // Invalidate the adjacent pages if a block crosses a 4K boundary
1152   while(first<page) {
1153     invalidate_page(first);
1154     first++;
1155   }
1156   for(first=page+1;first<last;first++) {
1157     invalidate_page(first);
1158   }
1159   
1160   // Don't trap writes
1161   invalid_code[block]=1;
1162 #ifndef DISABLE_TLB
1163   // If there is a valid TLB entry for this page, remove write protect
1164   if(tlb_LUT_w[block]) {
1165     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1166     // CHECK: Is this right?
1167     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1168     u_int real_block=tlb_LUT_w[block]>>12;
1169     invalid_code[real_block]=1;
1170     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1171   }
1172   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1173 #endif
1174   #ifdef __arm__
1175   if(modified)
1176     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1177   #endif
1178   #ifdef USE_MINI_HT
1179   memset(mini_ht,-1,sizeof(mini_ht));
1180   #endif
1181 }
1182 void invalidate_addr(u_int addr)
1183 {
1184   invalidate_block(addr>>12);
1185 }
1186 void invalidate_all_pages()
1187 {
1188   u_int page,n;
1189   for(page=0;page<4096;page++)
1190     invalidate_page(page);
1191   for(page=0;page<1048576;page++)
1192     if(!invalid_code[page]) {
1193       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1194       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1195     }
1196   #ifdef __arm__
1197   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1198   #endif
1199   #ifdef USE_MINI_HT
1200   memset(mini_ht,-1,sizeof(mini_ht));
1201   #endif
1202   #ifndef DISABLE_TLB
1203   // TLB
1204   for(page=0;page<0x100000;page++) {
1205     if(tlb_LUT_r[page]) {
1206       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1207       if(!tlb_LUT_w[page]||!invalid_code[page])
1208         memory_map[page]|=0x40000000; // Write protect
1209     }
1210     else memory_map[page]=-1;
1211     if(page==0x80000) page=0xC0000;
1212   }
1213   tlb_hacks();
1214   #endif
1215 }
1216
1217 // Add an entry to jump_out after making a link
1218 void add_link(u_int vaddr,void *src)
1219 {
1220   u_int page=get_page(vaddr);
1221   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1222   ll_add(jump_out+page,vaddr,src);
1223   //int ptr=get_pointer(src);
1224   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1225 }
1226
1227 // If a code block was found to be unmodified (bit was set in
1228 // restore_candidate) and it remains unmodified (bit is clear
1229 // in invalid_code) then move the entries for that 4K page from
1230 // the dirty list to the clean list.
1231 void clean_blocks(u_int page)
1232 {
1233   struct ll_entry *head;
1234   inv_debug("INV: clean_blocks page=%d\n",page);
1235   head=jump_dirty[page];
1236   while(head!=NULL) {
1237     if(!invalid_code[head->vaddr>>12]) {
1238       // Don't restore blocks which are about to expire from the cache
1239       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1240         u_int start,end;
1241         if(verify_dirty((int)head->addr)) {
1242           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1243           u_int i;
1244           u_int inv=0;
1245           get_bounds((int)head->addr,&start,&end);
1246           if(start-(u_int)rdram<0x800000) {
1247             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1248               inv|=invalid_code[i];
1249             }
1250           }
1251           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1252             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1253             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1254             if(addr<start||addr>=end) inv=1;
1255           }
1256           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1257             inv=1;
1258           }
1259           if(!inv) {
1260             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1261             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1262               u_int ppage=page;
1263 #ifndef DISABLE_TLB
1264               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1265 #endif
1266               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1267               //printf("page=%x, addr=%x\n",page,head->vaddr);
1268               //assert(head->vaddr>>12==(page|0x80000));
1269               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1270               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1271               if(!head->reg32) {
1272                 if(ht_bin[0]==head->vaddr) {
1273                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1274                 }
1275                 if(ht_bin[2]==head->vaddr) {
1276                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1277                 }
1278               }
1279             }
1280           }
1281         }
1282       }
1283     }
1284     head=head->next;
1285   }
1286 }
1287
1288
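// The *_alloc routines below form the per-instruction register allocation
// pass: each one requests host registers for the guest registers that
// instruction i reads and writes (alloc_reg for 32-bit values, alloc_reg64
// when the upper half is live), updates the is32/uu tracking to match,
// clears or propagates constants, and marks written registers dirty.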
1289 void mov_alloc(struct regstat *current,int i)
1290 {
1291   // Note: Don't need to actually alloc the source registers
1292   if((~current->is32>>rs1[i])&1) {
1293     //alloc_reg64(current,i,rs1[i]);
1294     alloc_reg64(current,i,rt1[i]);
1295     current->is32&=~(1LL<<rt1[i]);
1296   } else {
1297     //alloc_reg(current,i,rs1[i]);
1298     alloc_reg(current,i,rt1[i]);
1299     current->is32|=(1LL<<rt1[i]);
1300   }
1301   clear_const(current,rs1[i]);
1302   clear_const(current,rt1[i]);
1303   dirty_reg(current,rt1[i]);
1304 }
1305
1306 void shiftimm_alloc(struct regstat *current,int i)
1307 {
1308   clear_const(current,rs1[i]);
1309   clear_const(current,rt1[i]);
1310   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1311   {
1312     if(rt1[i]) {
1313       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1314       else lt1[i]=rs1[i];
1315       alloc_reg(current,i,rt1[i]);
1316       current->is32|=1LL<<rt1[i];
1317       dirty_reg(current,rt1[i]);
1318     }
1319   }
1320   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1321   {
1322     if(rt1[i]) {
1323       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1324       alloc_reg64(current,i,rt1[i]);
1325       current->is32&=~(1LL<<rt1[i]);
1326       dirty_reg(current,rt1[i]);
1327     }
1328   }
1329   if(opcode2[i]==0x3c) // DSLL32
1330   {
1331     if(rt1[i]) {
1332       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1333       alloc_reg64(current,i,rt1[i]);
1334       current->is32&=~(1LL<<rt1[i]);
1335       dirty_reg(current,rt1[i]);
1336     }
1337   }
1338   if(opcode2[i]==0x3e) // DSRL32
1339   {
1340     if(rt1[i]) {
1341       alloc_reg64(current,i,rs1[i]);
1342       if(imm[i]==32) {
1343         alloc_reg64(current,i,rt1[i]);
1344         current->is32&=~(1LL<<rt1[i]);
1345       } else {
1346         alloc_reg(current,i,rt1[i]);
1347         current->is32|=1LL<<rt1[i];
1348       }
1349       dirty_reg(current,rt1[i]);
1350     }
1351   }
1352   if(opcode2[i]==0x3f) // DSRA32
1353   {
1354     if(rt1[i]) {
1355       alloc_reg64(current,i,rs1[i]);
1356       alloc_reg(current,i,rt1[i]);
1357       current->is32|=1LL<<rt1[i];
1358       dirty_reg(current,rt1[i]);
1359     }
1360   }
1361 }
1362
1363 void shift_alloc(struct regstat *current,int i)
1364 {
1365   if(rt1[i]) {
1366     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1367     {
1368       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1369       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1370       alloc_reg(current,i,rt1[i]);
1371       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1372       current->is32|=1LL<<rt1[i];
1373     } else { // DSLLV/DSRLV/DSRAV
1374       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1375       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1376       alloc_reg64(current,i,rt1[i]);
1377       current->is32&=~(1LL<<rt1[i]);
1378       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1379         alloc_reg_temp(current,i,-1);
1380     }
1381     clear_const(current,rs1[i]);
1382     clear_const(current,rs2[i]);
1383     clear_const(current,rt1[i]);
1384     dirty_reg(current,rt1[i]);
1385   }
1386 }
1387
1388 void alu_alloc(struct regstat *current,int i)
1389 {
1390   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1391     if(rt1[i]) {
1392       if(rs1[i]&&rs2[i]) {
1393         alloc_reg(current,i,rs1[i]);
1394         alloc_reg(current,i,rs2[i]);
1395       }
1396       else {
1397         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1398         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1399       }
1400       alloc_reg(current,i,rt1[i]);
1401     }
1402     current->is32|=1LL<<rt1[i];
1403   }
1404   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1405     if(rt1[i]) {
1406       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1407       {
1408         alloc_reg64(current,i,rs1[i]);
1409         alloc_reg64(current,i,rs2[i]);
1410         alloc_reg(current,i,rt1[i]);
1411       } else {
1412         alloc_reg(current,i,rs1[i]);
1413         alloc_reg(current,i,rs2[i]);
1414         alloc_reg(current,i,rt1[i]);
1415       }
1416     }
1417     current->is32|=1LL<<rt1[i];
1418   }
1419   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1420     if(rt1[i]) {
1421       if(rs1[i]&&rs2[i]) {
1422         alloc_reg(current,i,rs1[i]);
1423         alloc_reg(current,i,rs2[i]);
1424       }
1425       else
1426       {
1427         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1428         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1429       }
1430       alloc_reg(current,i,rt1[i]);
1431       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1432       {
1433         if(!((current->uu>>rt1[i])&1)) {
1434           alloc_reg64(current,i,rt1[i]);
1435         }
1436         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1437           if(rs1[i]&&rs2[i]) {
1438             alloc_reg64(current,i,rs1[i]);
1439             alloc_reg64(current,i,rs2[i]);
1440           }
1441           else
1442           {
1443             // Is it really worth it to keep 64-bit values in registers?
1444             #ifdef NATIVE_64BIT
1445             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1446             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1447             #endif
1448           }
1449         }
1450         current->is32&=~(1LL<<rt1[i]);
1451       } else {
1452         current->is32|=1LL<<rt1[i];
1453       }
1454     }
1455   }
1456   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1457     if(rt1[i]) {
1458       if(rs1[i]&&rs2[i]) {
1459         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1460           alloc_reg64(current,i,rs1[i]);
1461           alloc_reg64(current,i,rs2[i]);
1462           alloc_reg64(current,i,rt1[i]);
1463         } else {
1464           alloc_reg(current,i,rs1[i]);
1465           alloc_reg(current,i,rs2[i]);
1466           alloc_reg(current,i,rt1[i]);
1467         }
1468       }
1469       else {
1470         alloc_reg(current,i,rt1[i]);
1471         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1472           // DADD used as move, or zeroing
1473           // If we have a 64-bit source, then make the target 64 bits too
1474           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1475             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1476             alloc_reg64(current,i,rt1[i]);
1477           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1478             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1479             alloc_reg64(current,i,rt1[i]);
1480           }
1481           if(opcode2[i]>=0x2e&&rs2[i]) {
1482             // DSUB used as negation - 64-bit result
1483             // If we have a 32-bit register, extend it to 64 bits
1484             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1485             alloc_reg64(current,i,rt1[i]);
1486           }
1487         }
1488       }
1489       if(rs1[i]&&rs2[i]) {
1490         current->is32&=~(1LL<<rt1[i]);
1491       } else if(rs1[i]) {
1492         current->is32&=~(1LL<<rt1[i]);
1493         if((current->is32>>rs1[i])&1)
1494           current->is32|=1LL<<rt1[i];
1495       } else if(rs2[i]) {
1496         current->is32&=~(1LL<<rt1[i]);
1497         if((current->is32>>rs2[i])&1)
1498           current->is32|=1LL<<rt1[i];
1499       } else {
1500         current->is32|=1LL<<rt1[i];
1501       }
1502     }
1503   }
1504   clear_const(current,rs1[i]);
1505   clear_const(current,rs2[i]);
1506   clear_const(current,rt1[i]);
1507   dirty_reg(current,rt1[i]);
1508 }
1509
1510 void imm16_alloc(struct regstat *current,int i)
1511 {
1512   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1513   else lt1[i]=rs1[i];
1514   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1515   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1516     current->is32&=~(1LL<<rt1[i]);
1517     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1518       // TODO: Could preserve the 32-bit flag if the immediate is zero
1519       alloc_reg64(current,i,rt1[i]);
1520       alloc_reg64(current,i,rs1[i]);
1521     }
1522     clear_const(current,rs1[i]);
1523     clear_const(current,rt1[i]);
1524   }
1525   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1526     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1527     current->is32|=1LL<<rt1[i];
1528     clear_const(current,rs1[i]);
1529     clear_const(current,rt1[i]);
1530   }
1531   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1532     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1533       if(rs1[i]!=rt1[i]) {
1534         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1535         alloc_reg64(current,i,rt1[i]);
1536         current->is32&=~(1LL<<rt1[i]);
1537       }
1538     }
1539     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1540     if(is_const(current,rs1[i])) {
1541       int v=get_const(current,rs1[i]);
1542       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1543       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1544       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1545     }
1546     else clear_const(current,rt1[i]);
1547   }
1548   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1549     if(is_const(current,rs1[i])) {
1550       int v=get_const(current,rs1[i]);
1551       set_const(current,rt1[i],v+imm[i]);
1552     }
1553     else clear_const(current,rt1[i]);
1554     current->is32|=1LL<<rt1[i];
1555   }
1556   else {
1557     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1558     current->is32|=1LL<<rt1[i];
1559   }
1560   dirty_reg(current,rt1[i]);
1561 }
1562
1563 void load_alloc(struct regstat *current,int i)
1564 {
1565   clear_const(current,rt1[i]);
1566   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1567   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1568   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1569   if(rt1[i]) {
1570     alloc_reg(current,i,rt1[i]);
1571     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1572     {
1573       current->is32&=~(1LL<<rt1[i]);
1574       alloc_reg64(current,i,rt1[i]);
1575     }
1576     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1577     {
1578       current->is32&=~(1LL<<rt1[i]);
1579       alloc_reg64(current,i,rt1[i]);
1580       alloc_all(current,i);
1581       alloc_reg64(current,i,FTEMP);
1582     }
1583     else current->is32|=1LL<<rt1[i];
1584     dirty_reg(current,rt1[i]);
1585     // If using TLB, need a register for pointer to the mapping table
1586     if(using_tlb) alloc_reg(current,i,TLREG);
1587     // LWL/LWR need a temporary register for the old value
1588     if(opcode[i]==0x22||opcode[i]==0x26)
1589     {
1590       alloc_reg(current,i,FTEMP);
1591       alloc_reg_temp(current,i,-1);
1592     }
1593   }
1594   else
1595   {
1596     // Load to r0 (dummy load)
1597     // but we still need a register to calculate the address
1598     alloc_reg_temp(current,i,-1);
1599   }
1600 }
1601
1602 void store_alloc(struct regstat *current,int i)
1603 {
1604   clear_const(current,rs2[i]);
1605   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1606   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1607   alloc_reg(current,i,rs2[i]);
1608   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1609     alloc_reg64(current,i,rs2[i]);
1610     if(rs2[i]) alloc_reg(current,i,FTEMP);
1611   }
1612   // If using TLB, need a register for pointer to the mapping table
1613   if(using_tlb) alloc_reg(current,i,TLREG);
1614   #if defined(HOST_IMM8)
1615   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1616   else alloc_reg(current,i,INVCP);
1617   #endif
1618   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1619     alloc_reg(current,i,FTEMP);
1620   }
1621   // We need a temporary register for address generation
1622   alloc_reg_temp(current,i,-1);
1623 }
1624
1625 void c1ls_alloc(struct regstat *current,int i)
1626 {
1627   //clear_const(current,rs1[i]); // FIXME
1628   clear_const(current,rt1[i]);
1629   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1630   alloc_reg(current,i,CSREG); // Status
1631   alloc_reg(current,i,FTEMP);
1632   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1633     alloc_reg64(current,i,FTEMP);
1634   }
1635   // If using TLB, need a register for pointer to the mapping table
1636   if(using_tlb) alloc_reg(current,i,TLREG);
1637   #if defined(HOST_IMM8)
1638   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1639   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1640     alloc_reg(current,i,INVCP);
1641   #endif
1642   // We need a temporary register for address generation
1643   alloc_reg_temp(current,i,-1);
1644 }
1645
1646 #ifndef multdiv_alloc
1647 void multdiv_alloc(struct regstat *current,int i)
1648 {
1649   //  case 0x18: MULT
1650   //  case 0x19: MULTU
1651   //  case 0x1A: DIV
1652   //  case 0x1B: DIVU
1653   //  case 0x1C: DMULT
1654   //  case 0x1D: DMULTU
1655   //  case 0x1E: DDIV
1656   //  case 0x1F: DDIVU
1657   clear_const(current,rs1[i]);
1658   clear_const(current,rs2[i]);
1659   if(rs1[i]&&rs2[i])
1660   {
1661     if((opcode2[i]&4)==0) // 32-bit
1662     {
1663       current->u&=~(1LL<<HIREG);
1664       current->u&=~(1LL<<LOREG);
1665       alloc_reg(current,i,HIREG);
1666       alloc_reg(current,i,LOREG);
1667       alloc_reg(current,i,rs1[i]);
1668       alloc_reg(current,i,rs2[i]);
1669       current->is32|=1LL<<HIREG;
1670       current->is32|=1LL<<LOREG;
1671       dirty_reg(current,HIREG);
1672       dirty_reg(current,LOREG);
1673     }
1674     else // 64-bit
1675     {
1676       current->u&=~(1LL<<HIREG);
1677       current->u&=~(1LL<<LOREG);
1678       current->uu&=~(1LL<<HIREG);
1679       current->uu&=~(1LL<<LOREG);
1680       alloc_reg64(current,i,HIREG);
1681       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1682       alloc_reg64(current,i,rs1[i]);
1683       alloc_reg64(current,i,rs2[i]);
1684       alloc_all(current,i);
1685       current->is32&=~(1LL<<HIREG);
1686       current->is32&=~(1LL<<LOREG);
1687       dirty_reg(current,HIREG);
1688       dirty_reg(current,LOREG);
1689     }
1690   }
1691   else
1692   {
1693     // Multiply by zero is zero.
1694     // MIPS does not have a divide by zero exception.
1695     // The result is undefined, so we return zero.
1696     alloc_reg(current,i,HIREG);
1697     alloc_reg(current,i,LOREG);
1698     current->is32|=1LL<<HIREG;
1699     current->is32|=1LL<<LOREG;
1700     dirty_reg(current,HIREG);
1701     dirty_reg(current,LOREG);
1702   }
1703 }
1704 #endif
1705
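// Register allocation for COP0: MFC0 writes rt, MTC0 reads rs, and all cases
// (including the TLBR/TLBWI/TLBWR/TLBP/ERET group) go through alloc_all.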
1706 void cop0_alloc(struct regstat *current,int i)
1707 {
1708   if(opcode2[i]==0) // MFC0
1709   {
1710     if(rt1[i]) {
1711       clear_const(current,rt1[i]);
1712       alloc_all(current,i);
1713       alloc_reg(current,i,rt1[i]);
1714       current->is32|=1LL<<rt1[i];
1715       dirty_reg(current,rt1[i]);
1716     }
1717   }
1718   else if(opcode2[i]==4) // MTC0
1719   {
1720     if(rs1[i]){
1721       clear_const(current,rs1[i]);
1722       alloc_reg(current,i,rs1[i]);
1723       alloc_all(current,i);
1724     }
1725     else {
1726       alloc_all(current,i); // FIXME: Keep r0
1727       current->u&=~1LL;
1728       alloc_reg(current,i,0);
1729     }
1730   }
1731   else
1732   {
1733     // TLBR/TLBWI/TLBWR/TLBP/ERET
1734     assert(opcode2[i]==0x10);
1735     alloc_all(current,i);
1736   }
1737 }
1738
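// Register allocation for COP1 moves: MFC1/DMFC1/CFC1 write rt (64-bit for DMFC1),
// MTC1/DMTC1/CTC1 read rs; CSREG is loaded so the coprocessor-usable bit can be tested.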
1739 void cop1_alloc(struct regstat *current,int i)
1740 {
1741   alloc_reg(current,i,CSREG); // Load status
1742   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1743   {
1744     assert(rt1[i]);
1745     clear_const(current,rt1[i]);
1746     if(opcode2[i]==1) {
1747       alloc_reg64(current,i,rt1[i]); // DMFC1
1748       current->is32&=~(1LL<<rt1[i]);
1749     }else{
1750       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1751       current->is32|=1LL<<rt1[i];
1752     }
1753     dirty_reg(current,rt1[i]);
1754     alloc_reg_temp(current,i,-1);
1755   }
1756   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1757   {
1758     if(rs1[i]){
1759       clear_const(current,rs1[i]);
1760       if(opcode2[i]==5)
1761         alloc_reg64(current,i,rs1[i]); // DMTC1
1762       else
1763         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1764       alloc_reg_temp(current,i,-1);
1765     }
1766     else {
1767       current->u&=~1LL;
1768       alloc_reg(current,i,0);
1769       alloc_reg_temp(current,i,-1);
1770     }
1771   }
1772 }
1773 void fconv_alloc(struct regstat *current,int i)
1774 {
1775   alloc_reg(current,i,CSREG); // Load status
1776   alloc_reg_temp(current,i,-1);
1777 }
1778 void float_alloc(struct regstat *current,int i)
1779 {
1780   alloc_reg(current,i,CSREG); // Load status
1781   alloc_reg_temp(current,i,-1);
1782 }
1783 void fcomp_alloc(struct regstat *current,int i)
1784 {
1785   alloc_reg(current,i,CSREG); // Load status
1786   alloc_reg(current,i,FSREG); // Load flags
1787   dirty_reg(current,FSREG); // Flag will be modified
1788   alloc_reg_temp(current,i,-1);
1789 }
1790
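// SYSCALL needs the cycle count (CCREG) and a full register flush via alloc_all.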
1791 void syscall_alloc(struct regstat *current,int i)
1792 {
1793   alloc_cc(current,i);
1794   dirty_reg(current,CCREG);
1795   alloc_all(current,i);
1796   current->isconst=0;
1797 }
1798
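// Allocate registers for the instruction in a branch delay slot, dispatching on its type.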
1799 void delayslot_alloc(struct regstat *current,int i)
1800 {
1801   switch(itype[i]) {
1802     case UJUMP:
1803     case CJUMP:
1804     case SJUMP:
1805     case RJUMP:
1806     case FJUMP:
1807     case SYSCALL:
1808     case SPAN:
1809       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1810       printf("Disabled speculative precompilation\n");
1811       stop_after_jal=1;
1812       break;
1813     case IMM16:
1814       imm16_alloc(current,i);
1815       break;
1816     case LOAD:
1817     case LOADLR:
1818       load_alloc(current,i);
1819       break;
1820     case STORE:
1821     case STORELR:
1822       store_alloc(current,i);
1823       break;
1824     case ALU:
1825       alu_alloc(current,i);
1826       break;
1827     case SHIFT:
1828       shift_alloc(current,i);
1829       break;
1830     case MULTDIV:
1831       multdiv_alloc(current,i);
1832       break;
1833     case SHIFTIMM:
1834       shiftimm_alloc(current,i);
1835       break;
1836     case MOV:
1837       mov_alloc(current,i);
1838       break;
1839     case COP0:
1840       cop0_alloc(current,i);
1841       break;
1842     case COP1:
1843       cop1_alloc(current,i);
1844       break;
1845     case C1LS:
1846       c1ls_alloc(current,i);
1847       break;
1848     case FCONV:
1849       fconv_alloc(current,i);
1850       break;
1851     case FLOAT:
1852       float_alloc(current,i);
1853       break;
1854     case FCOMP:
1855       fcomp_alloc(current,i);
1856       break;
1857   }
1858 }
1859
1860 // Special case where a branch and delay slot span two pages in virtual memory
1861 static void pagespan_alloc(struct regstat *current,int i)
1862 {
1863   current->isconst=0;
1864   current->wasconst=0;
1865   regs[i].wasconst=0;
1866   alloc_all(current,i);
1867   alloc_cc(current,i);
1868   dirty_reg(current,CCREG);
1869   if(opcode[i]==3) // JAL
1870   {
1871     alloc_reg(current,i,31);
1872     dirty_reg(current,31);
1873   }
1874   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1875   {
1876     alloc_reg(current,i,rs1[i]);
1877     if (rt1[i]==31) {
1878       alloc_reg(current,i,31);
1879       dirty_reg(current,31);
1880     }
1881   }
1882   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1883   {
1884     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1885     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1886     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1887     {
1888       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1889       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1890     }
1891   }
1892   else
1893   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1894   {
1895     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1896     if(!((current->is32>>rs1[i])&1))
1897     {
1898       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1899     }
1900   }
1901   else
1902   if(opcode[i]==0x11) // BC1
1903   {
1904     alloc_reg(current,i,FSREG);
1905     alloc_reg(current,i,CSREG);
1906   }
1907   //else ...
1908 }
1909
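// Queue an out-of-line stub: record its type, patch address, return address and
// arguments in the stubs[] table for later code generation.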
1910 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1911 {
1912   stubs[stubcount][0]=type;
1913   stubs[stubcount][1]=addr;
1914   stubs[stubcount][2]=retaddr;
1915   stubs[stubcount][3]=a;
1916   stubs[stubcount][4]=b;
1917   stubs[stubcount][5]=c;
1918   stubs[stubcount][6]=d;
1919   stubs[stubcount][7]=e;
1920   stubcount++;
1921 }
1922
1923 // Write out a single register
1924 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1925 {
1926   int hr;
1927   for(hr=0;hr<HOST_REGS;hr++) {
1928     if(hr!=EXCLUDE_REG) {
1929       if((regmap[hr]&63)==r) {
1930         if((dirty>>hr)&1) {
1931           if(regmap[hr]<64) {
1932             emit_storereg(r,hr);
1933 #ifndef FORCE32
1934             if((is32>>regmap[hr])&1) {
1935               emit_sarimm(hr,31,hr);
1936               emit_storereg(r|64,hr);
1937             }
1938 #endif
1939           }else{
1940             emit_storereg(r|64,hr);
1941           }
1942         }
1943       }
1944     }
1945   }
1946 }
1947
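// Debug/tracing helpers: checksum RDRAM (mchecksum) and the register file
// (rchecksum), and dump all GPRs/FPRs (rlist).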
1948 int mchecksum()
1949 {
1950   //if(!tracedebug) return 0;
1951   int i;
1952   int sum=0;
1953   for(i=0;i<2097152;i++) {
1954     unsigned int temp=sum;
1955     sum<<=1;
1956     sum|=(~temp)>>31;
1957     sum^=((u_int *)rdram)[i];
1958   }
1959   return sum;
1960 }
1961 int rchecksum()
1962 {
1963   int i;
1964   int sum=0;
1965   for(i=0;i<64;i++)
1966     sum^=((u_int *)reg)[i];
1967   return sum;
1968 }
1969 void rlist()
1970 {
1971   int i;
1972   printf("TRACE: ");
1973   for(i=0;i<32;i++)
1974     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1975   printf("\n");
1976 #ifndef DISABLE_COP1
1977   printf("TRACE: ");
1978   for(i=0;i<32;i++)
1979     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1980   printf("\n");
1981 #endif
1982 }
1983
1984 void enabletrace()
1985 {
1986   tracedebug=1;
1987 }
1988
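// Trace hook that generated code can call around memory accesses (see the
// commented-out emit_call sites in load_assemble/store_assemble).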
1989 void memdebug(int i)
1990 {
1991   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1992   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1993   //rlist();
1994   //if(tracedebug) {
1995   //if(Count>=-2084597794) {
1996   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1997   //if(0) {
1998     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1999     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2000     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2001     rlist();
2002     #ifdef __i386__
2003     printf("TRACE: %x\n",(&i)[-1]);
2004     #endif
2005     #ifdef __arm__
2006     int j;
2007     printf("TRACE: %x \n",(&j)[10]);
2008     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2009     #endif
2010     //fflush(stdout);
2011   }
2012   //printf("TRACE: %x\n",(&i)[-1]);
2013 }
2014
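// Print diagnostics when a TLB exception is raised.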
2015 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2016 {
2017   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2018 }
2019
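// Emit code for three-register ALU ops: ADD/ADDU/SUB/SUBU, the 64-bit
// DADD/DADDU/DSUB/DSUBU pairs, SLT/SLTU and AND/OR/XOR/NOR.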
2020 void alu_assemble(int i,struct regstat *i_regs)
2021 {
2022   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2023     if(rt1[i]) {
2024       signed char s1,s2,t;
2025       t=get_reg(i_regs->regmap,rt1[i]);
2026       if(t>=0) {
2027         s1=get_reg(i_regs->regmap,rs1[i]);
2028         s2=get_reg(i_regs->regmap,rs2[i]);
2029         if(rs1[i]&&rs2[i]) {
2030           assert(s1>=0);
2031           assert(s2>=0);
2032           if(opcode2[i]&2) emit_sub(s1,s2,t);
2033           else emit_add(s1,s2,t);
2034         }
2035         else if(rs1[i]) {
2036           if(s1>=0) emit_mov(s1,t);
2037           else emit_loadreg(rs1[i],t);
2038         }
2039         else if(rs2[i]) {
2040           if(s2>=0) {
2041             if(opcode2[i]&2) emit_neg(s2,t);
2042             else emit_mov(s2,t);
2043           }
2044           else {
2045             emit_loadreg(rs2[i],t);
2046             if(opcode2[i]&2) emit_neg(t,t);
2047           }
2048         }
2049         else emit_zeroreg(t);
2050       }
2051     }
2052   }
2053   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2054     if(rt1[i]) {
2055       signed char s1l,s2l,s1h,s2h,tl,th;
2056       tl=get_reg(i_regs->regmap,rt1[i]);
2057       th=get_reg(i_regs->regmap,rt1[i]|64);
2058       if(tl>=0) {
2059         s1l=get_reg(i_regs->regmap,rs1[i]);
2060         s2l=get_reg(i_regs->regmap,rs2[i]);
2061         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2062         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2063         if(rs1[i]&&rs2[i]) {
2064           assert(s1l>=0);
2065           assert(s2l>=0);
2066           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2067           else emit_adds(s1l,s2l,tl);
2068           if(th>=0) {
2069             #ifdef INVERTED_CARRY
2070             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2071             #else
2072             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2073             #endif
2074             else emit_add(s1h,s2h,th);
2075           }
2076         }
2077         else if(rs1[i]) {
2078           if(s1l>=0) emit_mov(s1l,tl);
2079           else emit_loadreg(rs1[i],tl);
2080           if(th>=0) {
2081             if(s1h>=0) emit_mov(s1h,th);
2082             else emit_loadreg(rs1[i]|64,th);
2083           }
2084         }
2085         else if(rs2[i]) {
2086           if(s2l>=0) {
2087             if(opcode2[i]&2) emit_negs(s2l,tl);
2088             else emit_mov(s2l,tl);
2089           }
2090           else {
2091             emit_loadreg(rs2[i],tl);
2092             if(opcode2[i]&2) emit_negs(tl,tl);
2093           }
2094           if(th>=0) {
2095             #ifdef INVERTED_CARRY
2096             if(s2h>=0) emit_mov(s2h,th);
2097             else emit_loadreg(rs2[i]|64,th);
2098             if(opcode2[i]&2) {
2099               emit_adcimm(-1,th); // x86 has inverted carry flag
2100               emit_not(th,th);
2101             }
2102             #else
2103             if(opcode2[i]&2) {
2104               if(s2h>=0) emit_rscimm(s2h,0,th);
2105               else {
2106                 emit_loadreg(rs2[i]|64,th);
2107                 emit_rscimm(th,0,th);
2108               }
2109             }else{
2110               if(s2h>=0) emit_mov(s2h,th);
2111               else emit_loadreg(rs2[i]|64,th);
2112             }
2113             #endif
2114           }
2115         }
2116         else {
2117           emit_zeroreg(tl);
2118           if(th>=0) emit_zeroreg(th);
2119         }
2120       }
2121     }
2122   }
2123   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2124     if(rt1[i]) {
2125       signed char s1l,s1h,s2l,s2h,t;
2126       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2127       {
2128         t=get_reg(i_regs->regmap,rt1[i]);
2129         //assert(t>=0);
2130         if(t>=0) {
2131           s1l=get_reg(i_regs->regmap,rs1[i]);
2132           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2133           s2l=get_reg(i_regs->regmap,rs2[i]);
2134           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2135           if(rs2[i]==0) // rx<r0
2136           {
2137             assert(s1h>=0);
2138             if(opcode2[i]==0x2a) // SLT
2139               emit_shrimm(s1h,31,t);
2140             else // SLTU (unsigned cannot be less than zero)
2141               emit_zeroreg(t);
2142           }
2143           else if(rs1[i]==0) // r0<rx
2144           {
2145             assert(s2h>=0);
2146             if(opcode2[i]==0x2a) // SLT
2147               emit_set_gz64_32(s2h,s2l,t);
2148             else // SLTU (set if not zero)
2149               emit_set_nz64_32(s2h,s2l,t);
2150           }
2151           else {
2152             assert(s1l>=0);assert(s1h>=0);
2153             assert(s2l>=0);assert(s2h>=0);
2154             if(opcode2[i]==0x2a) // SLT
2155               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2156             else // SLTU
2157               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2158           }
2159         }
2160       } else {
2161         t=get_reg(i_regs->regmap,rt1[i]);
2162         //assert(t>=0);
2163         if(t>=0) {
2164           s1l=get_reg(i_regs->regmap,rs1[i]);
2165           s2l=get_reg(i_regs->regmap,rs2[i]);
2166           if(rs2[i]==0) // rx<r0
2167           {
2168             assert(s1l>=0);
2169             if(opcode2[i]==0x2a) // SLT
2170               emit_shrimm(s1l,31,t);
2171             else // SLTU (unsigned cannot be less than zero)
2172               emit_zeroreg(t);
2173           }
2174           else if(rs1[i]==0) // r0<rx
2175           {
2176             assert(s2l>=0);
2177             if(opcode2[i]==0x2a) // SLT
2178               emit_set_gz32(s2l,t);
2179             else // SLTU (set if not zero)
2180               emit_set_nz32(s2l,t);
2181           }
2182           else{
2183             assert(s1l>=0);assert(s2l>=0);
2184             if(opcode2[i]==0x2a) // SLT
2185               emit_set_if_less32(s1l,s2l,t);
2186             else // SLTU
2187               emit_set_if_carry32(s1l,s2l,t);
2188           }
2189         }
2190       }
2191     }
2192   }
2193   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2194     if(rt1[i]) {
2195       signed char s1l,s1h,s2l,s2h,th,tl;
2196       tl=get_reg(i_regs->regmap,rt1[i]);
2197       th=get_reg(i_regs->regmap,rt1[i]|64);
2198       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2199       {
2200         assert(tl>=0);
2201         if(tl>=0) {
2202           s1l=get_reg(i_regs->regmap,rs1[i]);
2203           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2204           s2l=get_reg(i_regs->regmap,rs2[i]);
2205           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2206           if(rs1[i]&&rs2[i]) {
2207             assert(s1l>=0);assert(s1h>=0);
2208             assert(s2l>=0);assert(s2h>=0);
2209             if(opcode2[i]==0x24) { // AND
2210               emit_and(s1l,s2l,tl);
2211               emit_and(s1h,s2h,th);
2212             } else
2213             if(opcode2[i]==0x25) { // OR
2214               emit_or(s1l,s2l,tl);
2215               emit_or(s1h,s2h,th);
2216             } else
2217             if(opcode2[i]==0x26) { // XOR
2218               emit_xor(s1l,s2l,tl);
2219               emit_xor(s1h,s2h,th);
2220             } else
2221             if(opcode2[i]==0x27) { // NOR
2222               emit_or(s1l,s2l,tl);
2223               emit_or(s1h,s2h,th);
2224               emit_not(tl,tl);
2225               emit_not(th,th);
2226             }
2227           }
2228           else
2229           {
2230             if(opcode2[i]==0x24) { // AND
2231               emit_zeroreg(tl);
2232               emit_zeroreg(th);
2233             } else
2234             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2235               if(rs1[i]){
2236                 if(s1l>=0) emit_mov(s1l,tl);
2237                 else emit_loadreg(rs1[i],tl);
2238                 if(s1h>=0) emit_mov(s1h,th);
2239                 else emit_loadreg(rs1[i]|64,th);
2240               }
2241               else
2242               if(rs2[i]){
2243                 if(s2l>=0) emit_mov(s2l,tl);
2244                 else emit_loadreg(rs2[i],tl);
2245                 if(s2h>=0) emit_mov(s2h,th);
2246                 else emit_loadreg(rs2[i]|64,th);
2247               }
2248               else{
2249                 emit_zeroreg(tl);
2250                 emit_zeroreg(th);
2251               }
2252             } else
2253             if(opcode2[i]==0x27) { // NOR
2254               if(rs1[i]){
2255                 if(s1l>=0) emit_not(s1l,tl);
2256                 else{
2257                   emit_loadreg(rs1[i],tl);
2258                   emit_not(tl,tl);
2259                 }
2260                 if(s1h>=0) emit_not(s1h,th);
2261                 else{
2262                   emit_loadreg(rs1[i]|64,th);
2263                   emit_not(th,th);
2264                 }
2265               }
2266               else
2267               if(rs2[i]){
2268                 if(s2l>=0) emit_not(s2l,tl);
2269                 else{
2270                   emit_loadreg(rs2[i],tl);
2271                   emit_not(tl,tl);
2272                 }
2273                 if(s2h>=0) emit_not(s2h,th);
2274                 else{
2275                   emit_loadreg(rs2[i]|64,th);
2276                   emit_not(th,th);
2277                 }
2278               }
2279               else {
2280                 emit_movimm(-1,tl);
2281                 emit_movimm(-1,th);
2282               }
2283             }
2284           }
2285         }
2286       }
2287       else
2288       {
2289         // 32 bit
2290         if(tl>=0) {
2291           s1l=get_reg(i_regs->regmap,rs1[i]);
2292           s2l=get_reg(i_regs->regmap,rs2[i]);
2293           if(rs1[i]&&rs2[i]) {
2294             assert(s1l>=0);
2295             assert(s2l>=0);
2296             if(opcode2[i]==0x24) { // AND
2297               emit_and(s1l,s2l,tl);
2298             } else
2299             if(opcode2[i]==0x25) { // OR
2300               emit_or(s1l,s2l,tl);
2301             } else
2302             if(opcode2[i]==0x26) { // XOR
2303               emit_xor(s1l,s2l,tl);
2304             } else
2305             if(opcode2[i]==0x27) { // NOR
2306               emit_or(s1l,s2l,tl);
2307               emit_not(tl,tl);
2308             }
2309           }
2310           else
2311           {
2312             if(opcode2[i]==0x24) { // AND
2313               emit_zeroreg(tl);
2314             } else
2315             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2316               if(rs1[i]){
2317                 if(s1l>=0) emit_mov(s1l,tl);
2318                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2319               }
2320               else
2321               if(rs2[i]){
2322                 if(s2l>=0) emit_mov(s2l,tl);
2323                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2324               }
2325               else emit_zeroreg(tl);
2326             } else
2327             if(opcode2[i]==0x27) { // NOR
2328               if(rs1[i]){
2329                 if(s1l>=0) emit_not(s1l,tl);
2330                 else {
2331                   emit_loadreg(rs1[i],tl);
2332                   emit_not(tl,tl);
2333                 }
2334               }
2335               else
2336               if(rs2[i]){
2337                 if(s2l>=0) emit_not(s2l,tl);
2338                 else {
2339                   emit_loadreg(rs2[i],tl);
2340                   emit_not(tl,tl);
2341                 }
2342               }
2343               else emit_movimm(-1,tl);
2344             }
2345           }
2346         }
2347       }
2348     }
2349   }
2350 }
2351
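// Emit code for immediate ops (LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU,
// ANDI/ORI/XORI); known constants from constmap are folded at assembly time.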
2352 void imm16_assemble(int i,struct regstat *i_regs)
2353 {
2354   if (opcode[i]==0x0f) { // LUI
2355     if(rt1[i]) {
2356       signed char t;
2357       t=get_reg(i_regs->regmap,rt1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(!((i_regs->isconst>>t)&1))
2361           emit_movimm(imm[i]<<16,t);
2362       }
2363     }
2364   }
2365   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2366     if(rt1[i]) {
2367       signed char s,t;
2368       t=get_reg(i_regs->regmap,rt1[i]);
2369       s=get_reg(i_regs->regmap,rs1[i]);
2370       if(rs1[i]) {
2371         //assert(t>=0);
2372         //assert(s>=0);
2373         if(t>=0) {
2374           if(!((i_regs->isconst>>t)&1)) {
2375             if(s<0) {
2376               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2377               emit_addimm(t,imm[i],t);
2378             }else{
2379               if(!((i_regs->wasconst>>s)&1))
2380                 emit_addimm(s,imm[i],t);
2381               else
2382                 emit_movimm(constmap[i][s]+imm[i],t);
2383             }
2384           }
2385         }
2386       } else {
2387         if(t>=0) {
2388           if(!((i_regs->isconst>>t)&1))
2389             emit_movimm(imm[i],t);
2390         }
2391       }
2392     }
2393   }
2394   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2395     if(rt1[i]) {
2396       signed char sh,sl,th,tl;
2397       th=get_reg(i_regs->regmap,rt1[i]|64);
2398       tl=get_reg(i_regs->regmap,rt1[i]);
2399       sh=get_reg(i_regs->regmap,rs1[i]|64);
2400       sl=get_reg(i_regs->regmap,rs1[i]);
2401       if(tl>=0) {
2402         if(rs1[i]) {
2403           assert(sh>=0);
2404           assert(sl>=0);
2405           if(th>=0) {
2406             emit_addimm64_32(sh,sl,imm[i],th,tl);
2407           }
2408           else {
2409             emit_addimm(sl,imm[i],tl);
2410           }
2411         } else {
2412           emit_movimm(imm[i],tl);
2413           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2414         }
2415       }
2416     }
2417   }
2418   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2419     if(rt1[i]) {
2420       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2421       signed char sh,sl,t;
2422       t=get_reg(i_regs->regmap,rt1[i]);
2423       sh=get_reg(i_regs->regmap,rs1[i]|64);
2424       sl=get_reg(i_regs->regmap,rs1[i]);
2425       //assert(t>=0);
2426       if(t>=0) {
2427         if(rs1[i]>0) {
2428           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2429           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2430             if(opcode[i]==0x0a) { // SLTI
2431               if(sl<0) {
2432                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2433                 emit_slti32(t,imm[i],t);
2434               }else{
2435                 emit_slti32(sl,imm[i],t);
2436               }
2437             }
2438             else { // SLTIU
2439               if(sl<0) {
2440                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2441                 emit_sltiu32(t,imm[i],t);
2442               }else{
2443                 emit_sltiu32(sl,imm[i],t);
2444               }
2445             }
2446           }else{ // 64-bit
2447             assert(sl>=0);
2448             if(opcode[i]==0x0a) // SLTI
2449               emit_slti64_32(sh,sl,imm[i],t);
2450             else // SLTIU
2451               emit_sltiu64_32(sh,sl,imm[i],t);
2452           }
2453         }else{
2454           // SLTI(U) with r0 as the source is pointless,
2455           // but such instructions do appear in real code
2456           if(opcode[i]==0x0a) { // SLTI
2457             if(imm[i]>0) emit_movimm(1,t);
2458             else emit_zeroreg(t);
2459           } else // SLTIU
2460           {
2461             if(imm[i]) emit_movimm(1,t);
2462             else emit_zeroreg(t);
2463           }
2464         }
2465       }
2466     }
2467   }
2468   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2469     if(rt1[i]) {
2470       signed char sh,sl,th,tl;
2471       th=get_reg(i_regs->regmap,rt1[i]|64);
2472       tl=get_reg(i_regs->regmap,rt1[i]);
2473       sh=get_reg(i_regs->regmap,rs1[i]|64);
2474       sl=get_reg(i_regs->regmap,rs1[i]);
2475       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2476         if(opcode[i]==0x0c) //ANDI
2477         {
2478           if(rs1[i]) {
2479             if(sl<0) {
2480               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2481               emit_andimm(tl,imm[i],tl);
2482             }else{
2483               if(!((i_regs->wasconst>>sl)&1))
2484                 emit_andimm(sl,imm[i],tl);
2485               else
2486                 emit_movimm(constmap[i][sl]&imm[i],tl);
2487             }
2488           }
2489           else
2490             emit_zeroreg(tl);
2491           if(th>=0) emit_zeroreg(th);
2492         }
2493         else
2494         {
2495           if(rs1[i]) {
2496             if(sl<0) {
2497               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2498             }
2499             if(th>=0) {
2500               if(sh<0) {
2501                 emit_loadreg(rs1[i]|64,th);
2502               }else{
2503                 emit_mov(sh,th);
2504               }
2505             }
2506             if(opcode[i]==0x0d) //ORI
2507             if(sl<0) {
2508               emit_orimm(tl,imm[i],tl);
2509             }else{
2510               if(!((i_regs->wasconst>>sl)&1))
2511                 emit_orimm(sl,imm[i],tl);
2512               else
2513                 emit_movimm(constmap[i][sl]|imm[i],tl);
2514             }
2515             if(opcode[i]==0x0e) //XORI
2516             if(sl<0) {
2517               emit_xorimm(tl,imm[i],tl);
2518             }else{
2519               if(!((i_regs->wasconst>>sl)&1))
2520                 emit_xorimm(sl,imm[i],tl);
2521               else
2522                 emit_movimm(constmap[i][sl]^imm[i],tl);
2523             }
2524           }
2525           else {
2526             emit_movimm(imm[i],tl);
2527             if(th>=0) emit_zeroreg(th);
2528           }
2529         }
2530       }
2531     }
2532   }
2533 }
2534
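// Emit code for shift-by-immediate ops: SLL/SRL/SRA plus the doubleword
// DSLL/DSRL/DSRA and their DSLL32/DSRL32/DSRA32 variants.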
2535 void shiftimm_assemble(int i,struct regstat *i_regs)
2536 {
2537   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2538   {
2539     if(rt1[i]) {
2540       signed char s,t;
2541       t=get_reg(i_regs->regmap,rt1[i]);
2542       s=get_reg(i_regs->regmap,rs1[i]);
2543       //assert(t>=0);
2544       if(t>=0){
2545         if(rs1[i]==0)
2546         {
2547           emit_zeroreg(t);
2548         }
2549         else
2550         {
2551           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2552           if(imm[i]) {
2553             if(opcode2[i]==0) // SLL
2554             {
2555               emit_shlimm(s<0?t:s,imm[i],t);
2556             }
2557             if(opcode2[i]==2) // SRL
2558             {
2559               emit_shrimm(s<0?t:s,imm[i],t);
2560             }
2561             if(opcode2[i]==3) // SRA
2562             {
2563               emit_sarimm(s<0?t:s,imm[i],t);
2564             }
2565           }else{
2566             // Shift by zero
2567             if(s>=0 && s!=t) emit_mov(s,t);
2568           }
2569         }
2570       }
2571       //emit_storereg(rt1[i],t); //DEBUG
2572     }
2573   }
2574   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2575   {
2576     if(rt1[i]) {
2577       signed char sh,sl,th,tl;
2578       th=get_reg(i_regs->regmap,rt1[i]|64);
2579       tl=get_reg(i_regs->regmap,rt1[i]);
2580       sh=get_reg(i_regs->regmap,rs1[i]|64);
2581       sl=get_reg(i_regs->regmap,rs1[i]);
2582       if(tl>=0) {
2583         if(rs1[i]==0)
2584         {
2585           emit_zeroreg(tl);
2586           if(th>=0) emit_zeroreg(th);
2587         }
2588         else
2589         {
2590           assert(sl>=0);
2591           assert(sh>=0);
2592           if(imm[i]) {
2593             if(opcode2[i]==0x38) // DSLL
2594             {
2595               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2596               emit_shlimm(sl,imm[i],tl);
2597             }
2598             if(opcode2[i]==0x3a) // DSRL
2599             {
2600               emit_shrdimm(sl,sh,imm[i],tl);
2601               if(th>=0) emit_shrimm(sh,imm[i],th);
2602             }
2603             if(opcode2[i]==0x3b) // DSRA
2604             {
2605               emit_shrdimm(sl,sh,imm[i],tl);
2606               if(th>=0) emit_sarimm(sh,imm[i],th);
2607             }
2608           }else{
2609             // Shift by zero
2610             if(sl!=tl) emit_mov(sl,tl);
2611             if(th>=0&&sh!=th) emit_mov(sh,th);
2612           }
2613         }
2614       }
2615     }
2616   }
2617   if(opcode2[i]==0x3c) // DSLL32
2618   {
2619     if(rt1[i]) {
2620       signed char sl,tl,th;
2621       tl=get_reg(i_regs->regmap,rt1[i]);
2622       th=get_reg(i_regs->regmap,rt1[i]|64);
2623       sl=get_reg(i_regs->regmap,rs1[i]);
2624       if(th>=0||tl>=0){
2625         assert(tl>=0);
2626         assert(th>=0);
2627         assert(sl>=0);
2628         emit_mov(sl,th);
2629         emit_zeroreg(tl);
2630         if(imm[i]>32)
2631         {
2632           emit_shlimm(th,imm[i]&31,th);
2633         }
2634       }
2635     }
2636   }
2637   if(opcode2[i]==0x3e) // DSRL32
2638   {
2639     if(rt1[i]) {
2640       signed char sh,tl,th;
2641       tl=get_reg(i_regs->regmap,rt1[i]);
2642       th=get_reg(i_regs->regmap,rt1[i]|64);
2643       sh=get_reg(i_regs->regmap,rs1[i]|64);
2644       if(tl>=0){
2645         assert(sh>=0);
2646         emit_mov(sh,tl);
2647         if(th>=0) emit_zeroreg(th);
2648         if(imm[i]>32)
2649         {
2650           emit_shrimm(tl,imm[i]&31,tl);
2651         }
2652       }
2653     }
2654   }
2655   if(opcode2[i]==0x3f) // DSRA32
2656   {
2657     if(rt1[i]) {
2658       signed char sh,tl;
2659       tl=get_reg(i_regs->regmap,rt1[i]);
2660       sh=get_reg(i_regs->regmap,rs1[i]|64);
2661       if(tl>=0){
2662         assert(sh>=0);
2663         emit_mov(sh,tl);
2664         if(imm[i]>32)
2665         {
2666           emit_sarimm(tl,imm[i]&31,tl);
2667         }
2668       }
2669     }
2670   }
2671 }
2672
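// Variable shifts are emitted by architecture-specific code; this placeholder only aborts.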
2673 #ifndef shift_assemble
2674 void shift_assemble(int i,struct regstat *i_regs)
2675 {
2676   printf("Need shift_assemble for this architecture.\n");
2677   exit(1);
2678 }
2679 #endif
2680
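// Emit code for loads (LB/LBU/LH/LHU/LW/LWU/LD): range-check or TLB-translate the
// address, do the access inline and record a stub for the slow path.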
2681 void load_assemble(int i,struct regstat *i_regs)
2682 {
2683   int s,th,tl,addr,map=-1;
2684   int offset;
2685   int jaddr=0;
2686   int memtarget=0,c=0;
2687   u_int hr,reglist=0;
2688   th=get_reg(i_regs->regmap,rt1[i]|64);
2689   tl=get_reg(i_regs->regmap,rt1[i]);
2690   s=get_reg(i_regs->regmap,rs1[i]);
2691   offset=imm[i];
2692   for(hr=0;hr<HOST_REGS;hr++) {
2693     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2694   }
2695   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2696   if(s>=0) {
2697     c=(i_regs->wasconst>>s)&1;
2698     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2699     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2700   }
2701   if(offset||s<0||c) addr=tl;
2702   else addr=s;
2703   //printf("load_assemble: c=%d\n",c);
2704   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2705   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2706   if(tl>=0) {
2707     //assert(tl>=0);
2708     //assert(rt1[i]);
2709     reglist&=~(1<<tl);
2710     if(th>=0) reglist&=~(1<<th);
2711     if(!using_tlb) {
2712       if(!c) {
2713 //#define R29_HACK 1
2714         #ifdef R29_HACK
2715         // Strmnnrmn's speed hack
2716         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2717         #endif
2718         {
2719           emit_cmpimm(addr,0x800000);
2720           jaddr=(int)out;
2721           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2722           // Hint to branch predictor that the branch is unlikely to be taken
2723           if(rs1[i]>=28)
2724             emit_jno_unlikely(0);
2725           else
2726           #endif
2727           emit_jno(0);
2728         }
2729       }
2730     }else{ // using tlb
2731       int x=0;
2732       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2733       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2734       map=get_reg(i_regs->regmap,TLREG);
2735       assert(map>=0);
2736       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2737       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2738     }
2739     if (opcode[i]==0x20) { // LB
2740       if(!c||memtarget) {
2741         #ifdef HOST_IMM_ADDR32
2742         if(c)
2743           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2744         else
2745         #endif
2746         {
2747           //emit_xorimm(addr,3,tl);
2748           //gen_tlb_addr_r(tl,map);
2749           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2750           int x=0;
2751           if(!c) emit_xorimm(addr,3,tl);
2752           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2753           emit_movsbl_indexed_tlb(x,tl,map,tl);
2754         }
2755         if(jaddr)
2756           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2757       }
2758       else
2759         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760     }
2761     if (opcode[i]==0x21) { // LH
2762       if(!c||memtarget) {
2763         #ifdef HOST_IMM_ADDR32
2764         if(c)
2765           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2766         else
2767         #endif
2768         {
2769           int x=0;
2770           if(!c) emit_xorimm(addr,2,tl);
2771           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2772           //#ifdef
2773           //emit_movswl_indexed_tlb(x,tl,map,tl);
2774           //else
2775           if(map>=0) {
2776             gen_tlb_addr_r(tl,map);
2777             emit_movswl_indexed(x,tl,tl);
2778           }else
2779             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2780         }
2781         if(jaddr)
2782           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2783       }
2784       else
2785         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2786     }
2787     if (opcode[i]==0x23) { // LW
2788       if(!c||memtarget) {
2789         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2790         #ifdef HOST_IMM_ADDR32
2791         if(c)
2792           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2793         else
2794         #endif
2795         emit_readword_indexed_tlb(0,addr,map,tl);
2796         if(jaddr)
2797           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2798       }
2799       else
2800         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2801     }
2802     if (opcode[i]==0x24) { // LBU
2803       if(!c||memtarget) {
2804         #ifdef HOST_IMM_ADDR32
2805         if(c)
2806           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2807         else
2808         #endif
2809         {
2810           //emit_xorimm(addr,3,tl);
2811           //gen_tlb_addr_r(tl,map);
2812           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2813           int x=0;
2814           if(!c) emit_xorimm(addr,3,tl);
2815           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2816           emit_movzbl_indexed_tlb(x,tl,map,tl);
2817         }
2818         if(jaddr)
2819           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2820       }
2821       else
2822         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2823     }
2824     if (opcode[i]==0x25) { // LHU
2825       if(!c||memtarget) {
2826         #ifdef HOST_IMM_ADDR32
2827         if(c)
2828           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2829         else
2830         #endif
2831         {
2832           int x=0;
2833           if(!c) emit_xorimm(addr,2,tl);
2834           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2835           //#ifdef
2836           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2837           //#else
2838           if(map>=0) {
2839             gen_tlb_addr_r(tl,map);
2840             emit_movzwl_indexed(x,tl,tl);
2841           }else
2842             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2843           if(jaddr)
2844             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2845         }
2846       }
2847       else
2848         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2849     }
2850     if (opcode[i]==0x27) { // LWU
2851       assert(th>=0);
2852       if(!c||memtarget) {
2853         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2854         #ifdef HOST_IMM_ADDR32
2855         if(c)
2856           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2857         else
2858         #endif
2859         emit_readword_indexed_tlb(0,addr,map,tl);
2860         if(jaddr)
2861           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2862       }
2863       else {
2864         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2865       }
2866       emit_zeroreg(th);
2867     }
2868     if (opcode[i]==0x37) { // LD
2869       if(!c||memtarget) {
2870         //gen_tlb_addr_r(tl,map);
2871         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2872         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2873         #ifdef HOST_IMM_ADDR32
2874         if(c)
2875           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2876         else
2877         #endif
2878         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2879         if(jaddr)
2880           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2881       }
2882       else
2883         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2884     }
2885     //emit_storereg(rt1[i],tl); // DEBUG
2886   }
2887   //if(opcode[i]==0x23)
2888   //if(opcode[i]==0x24)
2889   //if(opcode[i]==0x23||opcode[i]==0x24)
2890   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2891   {
2892     //emit_pusha();
2893     save_regs(0x100f);
2894         emit_readword((int)&last_count,ECX);
2895         #ifdef __i386__
2896         if(get_reg(i_regs->regmap,CCREG)<0)
2897           emit_loadreg(CCREG,HOST_CCREG);
2898         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2899         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2900         emit_writeword(HOST_CCREG,(int)&Count);
2901         #endif
2902         #ifdef __arm__
2903         if(get_reg(i_regs->regmap,CCREG)<0)
2904           emit_loadreg(CCREG,0);
2905         else
2906           emit_mov(HOST_CCREG,0);
2907         emit_add(0,ECX,0);
2908         emit_addimm(0,2*ccadj[i],0);
2909         emit_writeword(0,(int)&Count);
2910         #endif
2911     emit_call((int)memdebug);
2912     //emit_popa();
2913     restore_regs(0x100f);
2914   }/**/
2915 }
2916
2917 #ifndef loadlr_assemble
2918 void loadlr_assemble(int i,struct regstat *i_regs)
2919 {
2920   printf("Need loadlr_assemble for this architecture.\n");
2921   exit(1);
2922 }
2923 #endif
2924
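// Emit code for stores (SB/SH/SW/SD): same address handling as loads, plus a
// check against invalid_code to catch writes to pages holding translated code.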
2925 void store_assemble(int i,struct regstat *i_regs)
2926 {
2927   int s,th,tl,map=-1;
2928   int addr,temp;
2929   int offset;
2930   int jaddr=0,jaddr2,type;
2931   int memtarget=0,c=0;
2932   int agr=AGEN1+(i&1);
2933   u_int hr,reglist=0;
2934   th=get_reg(i_regs->regmap,rs2[i]|64);
2935   tl=get_reg(i_regs->regmap,rs2[i]);
2936   s=get_reg(i_regs->regmap,rs1[i]);
2937   temp=get_reg(i_regs->regmap,agr);
2938   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2939   offset=imm[i];
2940   if(s>=0) {
2941     c=(i_regs->wasconst>>s)&1;
2942     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2943     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2944   }
2945   assert(tl>=0);
2946   assert(temp>=0);
2947   for(hr=0;hr<HOST_REGS;hr++) {
2948     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2949   }
2950   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2951   if(offset||s<0||c) addr=temp;
2952   else addr=s;
2953   if(!using_tlb) {
2954     if(!c) {
2955       #ifdef R29_HACK
2956       // Strmnnrmn's speed hack
2957       memtarget=1;
2958       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2959       #endif
2960       emit_cmpimm(addr,0x800000);
2961       #ifdef DESTRUCTIVE_SHIFT
2962       if(s==addr) emit_mov(s,temp);
2963       #endif
2964       #ifdef R29_HACK
2965       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2966       #endif
2967       {
2968         jaddr=(int)out;
2969         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2970         // Hint to branch predictor that the branch is unlikely to be taken
2971         if(rs1[i]>=28)
2972           emit_jno_unlikely(0);
2973         else
2974         #endif
2975         emit_jno(0);
2976       }
2977     }
2978   }else{ // using tlb
2979     int x=0;
2980     if (opcode[i]==0x28) x=3; // SB
2981     if (opcode[i]==0x29) x=2; // SH
2982     map=get_reg(i_regs->regmap,TLREG);
2983     assert(map>=0);
2984     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
2985     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
2986   }
2987
2988   if (opcode[i]==0x28) { // SB
2989     if(!c||memtarget) {
2990       int x=0;
2991       if(!c) emit_xorimm(addr,3,temp);
2992       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2993       //gen_tlb_addr_w(temp,map);
2994       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2995       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
2996     }
2997     type=STOREB_STUB;
2998   }
2999   if (opcode[i]==0x29) { // SH
3000     if(!c||memtarget) {
3001       int x=0;
3002       if(!c) emit_xorimm(addr,2,temp);
3003       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3004       //#ifdef
3005       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3006       //#else
3007       if(map>=0) {
3008         gen_tlb_addr_w(temp,map);
3009         emit_writehword_indexed(tl,x,temp);
3010       }else
3011         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3012     }
3013     type=STOREH_STUB;
3014   }
3015   if (opcode[i]==0x2B) { // SW
3016     if(!c||memtarget)
3017       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3018       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3019     type=STOREW_STUB;
3020   }
3021   if (opcode[i]==0x3F) { // SD
3022     if(!c||memtarget) {
3023       if(rs2[i]) {
3024         assert(th>=0);
3025         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3026         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3027         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3028       }else{
3029         // Store zero
3030         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3031         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3032         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3033       }
3034     }
3035     type=STORED_STUB;
3036   }
3037   if(jaddr) {
3038     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3039   } else if(!memtarget) {
3040     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3041   }
3042   if(!using_tlb) {
3043     if(!c||memtarget) {
3044       #ifdef DESTRUCTIVE_SHIFT
3045       // The x86 shift operation is 'destructive'; it overwrites the
3046       // source register, so we need to make a copy first and use that.
3047       addr=temp;
3048       #endif
3049       #if defined(HOST_IMM8)
3050       int ir=get_reg(i_regs->regmap,INVCP);
3051       assert(ir>=0);
3052       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3053       #else
3054       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3055       #endif
3056       jaddr2=(int)out;
3057       emit_jne(0);
3058       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3059     }
3060   }
3061   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3062   //if(opcode[i]==0x2B || opcode[i]==0x28)
3063   //if(opcode[i]==0x2B || opcode[i]==0x29)
3064   //if(opcode[i]==0x2B)
3065   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3066   {
3067     //emit_pusha();
3068     save_regs(0x100f);
3069         emit_readword((int)&last_count,ECX);
3070         #ifdef __i386__
3071         if(get_reg(i_regs->regmap,CCREG)<0)
3072           emit_loadreg(CCREG,HOST_CCREG);
3073         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3074         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3075         emit_writeword(HOST_CCREG,(int)&Count);
3076         #endif
3077         #ifdef __arm__
3078         if(get_reg(i_regs->regmap,CCREG)<0)
3079           emit_loadreg(CCREG,0);
3080         else
3081           emit_mov(HOST_CCREG,0);
3082         emit_add(0,ECX,0);
3083         emit_addimm(0,2*ccadj[i],0);
3084         emit_writeword(0,(int)&Count);
3085         #endif
3086     emit_call((int)memdebug);
3087     //emit_popa();
3088     restore_regs(0x100f);
3089   }/**/
3090 }
3091
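// Emit code for unaligned stores (SWL/SWR/SDL/SDR): branch on the low two address
// bits and write the partial word/doubleword for each alignment case.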
3092 void storelr_assemble(int i,struct regstat *i_regs)
3093 {
3094   int s,th,tl;
3095   int temp;
3096   int temp2;
3097   int offset;
3098   int jaddr=0,jaddr2;
3099   int case1,case2,case3;
3100   int done0,done1,done2;
3101   int memtarget,c=0;
3102   u_int hr,reglist=0;
3103   th=get_reg(i_regs->regmap,rs2[i]|64);
3104   tl=get_reg(i_regs->regmap,rs2[i]);
3105   s=get_reg(i_regs->regmap,rs1[i]);
3106   temp=get_reg(i_regs->regmap,-1);
3107   offset=imm[i];
3108   if(s>=0) {
3109     c=(i_regs->isconst>>s)&1;
3110     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3111     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3112   }
3113   assert(tl>=0);
3114   for(hr=0;hr<HOST_REGS;hr++) {
3115     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3116   }
3117   if(tl>=0) {
3118     assert(temp>=0);
3119     if(!using_tlb) {
3120       if(!c) {
3121         emit_cmpimm(s<0||offset?temp:s,0x800000);
3122         if(!offset&&s!=temp) emit_mov(s,temp);
3123         jaddr=(int)out;
3124         emit_jno(0);
3125       }
3126       else
3127       {
3128         if(!memtarget||!rs1[i]) {
3129           jaddr=(int)out;
3130           emit_jmp(0);
3131         }
3132       }
3133       if((u_int)rdram!=0x80000000) 
3134         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3135     }else{ // using tlb
3136       int map=get_reg(i_regs->regmap,TLREG);
3137       assert(map>=0);
3138       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3139       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3140       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3141       if(!jaddr&&!memtarget) {
3142         jaddr=(int)out;
3143         emit_jmp(0);
3144       }
3145       gen_tlb_addr_w(temp,map);
3146     }
3147
3148     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3149       temp2=get_reg(i_regs->regmap,FTEMP);
3150       if(!rs2[i]) temp2=th=tl;
3151     }
3152
3153     emit_testimm(temp,2);
3154     case2=(int)out;
3155     emit_jne(0);
3156     emit_testimm(temp,1);
3157     case1=(int)out;
3158     emit_jne(0);
3159     // 0
3160     if (opcode[i]==0x2A) { // SWL
3161       emit_writeword_indexed(tl,0,temp);
3162     }
3163     if (opcode[i]==0x2E) { // SWR
3164       emit_writebyte_indexed(tl,3,temp);
3165     }
3166     if (opcode[i]==0x2C) { // SDL
3167       emit_writeword_indexed(th,0,temp);
3168       if(rs2[i]) emit_mov(tl,temp2);
3169     }
3170     if (opcode[i]==0x2D) { // SDR
3171       emit_writebyte_indexed(tl,3,temp);
3172       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3173     }
3174     done0=(int)out;
3175     emit_jmp(0);
3176     // 1
3177     set_jump_target(case1,(int)out);
3178     if (opcode[i]==0x2A) { // SWL
3179       // Write 3 msb into three least significant bytes
3180       if(rs2[i]) emit_rorimm(tl,8,tl);
3181       emit_writehword_indexed(tl,-1,temp);
3182       if(rs2[i]) emit_rorimm(tl,16,tl);
3183       emit_writebyte_indexed(tl,1,temp);
3184       if(rs2[i]) emit_rorimm(tl,8,tl);
3185     }
3186     if (opcode[i]==0x2E) { // SWR
3187       // Write two lsb into two most significant bytes
3188       emit_writehword_indexed(tl,1,temp);
3189     }
3190     if (opcode[i]==0x2C) { // SDL
3191       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3192       // Write 3 msb into three least significant bytes
3193       if(rs2[i]) emit_rorimm(th,8,th);
3194       emit_writehword_indexed(th,-1,temp);
3195       if(rs2[i]) emit_rorimm(th,16,th);
3196       emit_writebyte_indexed(th,1,temp);
3197       if(rs2[i]) emit_rorimm(th,8,th);
3198     }
3199     if (opcode[i]==0x2D) { // SDR
3200       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3201       // Write two lsb into two most significant bytes
3202       emit_writehword_indexed(tl,1,temp);
3203     }
3204     done1=(int)out;
3205     emit_jmp(0);
3206     // 2
3207     set_jump_target(case2,(int)out);
3208     emit_testimm(temp,1);
3209     case3=(int)out;
3210     emit_jne(0);
3211     if (opcode[i]==0x2A) { // SWL
3212       // Write two msb into two least significant bytes
3213       if(rs2[i]) emit_rorimm(tl,16,tl);
3214       emit_writehword_indexed(tl,-2,temp);
3215       if(rs2[i]) emit_rorimm(tl,16,tl);
3216     }
3217     if (opcode[i]==0x2E) { // SWR
3218       // Write 3 lsb into three most significant bytes
3219       emit_writebyte_indexed(tl,-1,temp);
3220       if(rs2[i]) emit_rorimm(tl,8,tl);
3221       emit_writehword_indexed(tl,0,temp);
3222       if(rs2[i]) emit_rorimm(tl,24,tl);
3223     }
3224     if (opcode[i]==0x2C) { // SDL
3225       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3226       // Write two msb into two least significant bytes
3227       if(rs2[i]) emit_rorimm(th,16,th);
3228       emit_writehword_indexed(th,-2,temp);
3229       if(rs2[i]) emit_rorimm(th,16,th);
3230     }
3231     if (opcode[i]==0x2D) { // SDR
3232       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3233       // Write 3 lsb into three most significant bytes
3234       emit_writebyte_indexed(tl,-1,temp);
3235       if(rs2[i]) emit_rorimm(tl,8,tl);
3236       emit_writehword_indexed(tl,0,temp);
3237       if(rs2[i]) emit_rorimm(tl,24,tl);
3238     }
3239     done2=(int)out;
3240     emit_jmp(0);
3241     // 3
3242     set_jump_target(case3,(int)out);
3243     if (opcode[i]==0x2A) { // SWL
3244       // Write msb into least significant byte
3245       if(rs2[i]) emit_rorimm(tl,24,tl);
3246       emit_writebyte_indexed(tl,-3,temp);
3247       if(rs2[i]) emit_rorimm(tl,8,tl);
3248     }
3249     if (opcode[i]==0x2E) { // SWR
3250       // Write entire word
3251       emit_writeword_indexed(tl,-3,temp);
3252     }
3253     if (opcode[i]==0x2C) { // SDL
3254       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3255       // Write msb into least significant byte
3256       if(rs2[i]) emit_rorimm(th,24,th);
3257       emit_writebyte_indexed(th,-3,temp);
3258       if(rs2[i]) emit_rorimm(th,8,th);
3259     }
3260     if (opcode[i]==0x2D) { // SDR
3261       if(rs2[i]) emit_mov(th,temp2);
3262       // Write entire word
3263       emit_writeword_indexed(tl,-3,temp);
3264     }
3265     set_jump_target(done0,(int)out);
3266     set_jump_target(done1,(int)out);
3267     set_jump_target(done2,(int)out);
3268     if (opcode[i]==0x2C) { // SDL
3269       emit_testimm(temp,4);
3270       done0=(int)out;
3271       emit_jne(0);
3272       emit_andimm(temp,~3,temp);
3273       emit_writeword_indexed(temp2,4,temp);
3274       set_jump_target(done0,(int)out);
3275     }
3276     if (opcode[i]==0x2D) { // SDR
3277       emit_testimm(temp,4);
3278       done0=(int)out;
3279       emit_jeq(0);
3280       emit_andimm(temp,~3,temp);
3281       emit_writeword_indexed(temp2,-4,temp);
3282       set_jump_target(done0,(int)out);
3283     }
3284     if(!c||!memtarget)
3285       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3286   }
3287   if(!using_tlb) {
3288     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3289     #if defined(HOST_IMM8)
3290     int ir=get_reg(i_regs->regmap,INVCP);
3291     assert(ir>=0);
3292     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3293     #else
3294     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3295     #endif
3296     jaddr2=(int)out;
3297     emit_jne(0);
3298     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3299   }
3300   /*
3301     emit_pusha();
3302     //save_regs(0x100f);
3303         emit_readword((int)&last_count,ECX);
3304         if(get_reg(i_regs->regmap,CCREG)<0)
3305           emit_loadreg(CCREG,HOST_CCREG);
3306         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3307         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3308         emit_writeword(HOST_CCREG,(int)&Count);
3309     emit_call((int)memdebug);
3310     emit_popa();
3311     //restore_regs(0x100f);
3312   /**/
3313 }
3314
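// Emit code for COP1 loads/stores (LWC1/LDC1/SWC1/SDC1), including the
// coprocessor-usable check against the Status register (CSREG).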
3315 void c1ls_assemble(int i,struct regstat *i_regs)
3316 {
3317 #ifndef DISABLE_COP1
3318   int s,th,tl;
3319   int temp,ar;
3320   int map=-1;
3321   int offset;
3322   int c=0;
3323   int jaddr,jaddr2=0,jaddr3,type;
3324   int agr=AGEN1+(i&1);
3325   u_int hr,reglist=0;
3326   th=get_reg(i_regs->regmap,FTEMP|64);
3327   tl=get_reg(i_regs->regmap,FTEMP);
3328   s=get_reg(i_regs->regmap,rs1[i]);
3329   temp=get_reg(i_regs->regmap,agr);
3330   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3331   offset=imm[i];
3332   assert(tl>=0);
3333   assert(rs1[i]>0);
3334   assert(temp>=0);
3335   for(hr=0;hr<HOST_REGS;hr++) {
3336     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3337   }
3338   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3339   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3340   {
3341     // Loads use a temporary register which we need to save
3342     reglist|=1<<temp;
3343   }
3344   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3345     ar=temp;
3346   else // LWC1/LDC1
3347     ar=tl;
3348   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3349   //else c=(i_regs->wasconst>>s)&1;
3350   if(s>=0) c=(i_regs->wasconst>>s)&1;
3351   // Check cop1 unusable
3352   if(!cop1_usable) {
3353     signed char rs=get_reg(i_regs->regmap,CSREG);
3354     assert(rs>=0);
3355     emit_testimm(rs,0x20000000);
3356     jaddr=(int)out;
3357     emit_jeq(0);
3358     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3359     cop1_usable=1;
3360   }
3361   if (opcode[i]==0x39) { // SWC1 (get float address)
3362     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3363   }
3364   if (opcode[i]==0x3D) { // SDC1 (get double address)
3365     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3366   }
3367   // Generate address + offset
3368   if(!using_tlb) {
3369     if(!c)
3370       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3371   }
3372   else
3373   {
3374     map=get_reg(i_regs->regmap,TLREG);
3375     assert(map>=0);
3376     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3377       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3378     }
3379     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3380       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3381     }
3382   }
3383   if (opcode[i]==0x39) { // SWC1 (read float)
3384     emit_readword_indexed(0,tl,tl);
3385   }
3386   if (opcode[i]==0x3D) { // SDC1 (read double)
3387     emit_readword_indexed(4,tl,th);
3388     emit_readword_indexed(0,tl,tl);
3389   }
3390   if (opcode[i]==0x31) { // LWC1 (get target address)
3391     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3392   }
3393   if (opcode[i]==0x35) { // LDC1 (get target address)
3394     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3395   }
3396   if(!using_tlb) {
3397     if(!c) {
3398       jaddr2=(int)out;
3399       emit_jno(0);
3400     }
3401     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3402       jaddr2=(int)out;
3403       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3404     }
3405     #ifdef DESTRUCTIVE_SHIFT
3406     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3407       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3408     }
3409     #endif
3410   }else{
3411     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3412       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3413     }
3414     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3415       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3416     }
3417   }
3418   if (opcode[i]==0x31) { // LWC1
3419     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3420     //gen_tlb_addr_r(ar,map);
3421     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3422     #ifdef HOST_IMM_ADDR32
3423     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3424     else
3425     #endif
3426     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3427     type=LOADW_STUB;
3428   }
3429   if (opcode[i]==0x35) { // LDC1
3430     assert(th>=0);
3431     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3432     //gen_tlb_addr_r(ar,map);
3433     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3434     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3435     #ifdef HOST_IMM_ADDR32
3436     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3437     else
3438     #endif
3439     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3440     type=LOADD_STUB;
3441   }
3442   if (opcode[i]==0x39) { // SWC1
3443     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3444     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3445     type=STOREW_STUB;
3446   }
3447   if (opcode[i]==0x3D) { // SDC1
3448     assert(th>=0);
3449     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3450     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3451     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3452     type=STORED_STUB;
3453   }
3454   if(!using_tlb) {
3455     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3456       #ifndef DESTRUCTIVE_SHIFT
3457       temp=offset||c||s<0?ar:s;
3458       #endif
3459       #if defined(HOST_IMM8)
3460       int ir=get_reg(i_regs->regmap,INVCP);
3461       assert(ir>=0);
3462       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3463       #else
3464       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3465       #endif
3466       jaddr3=(int)out;
3467       emit_jne(0);
3468       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3469     }
3470   }
3471   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3472   if (opcode[i]==0x31) { // LWC1 (write float)
3473     emit_writeword_indexed(tl,0,temp);
3474   }
3475   if (opcode[i]==0x35) { // LDC1 (write double)
3476     emit_writeword_indexed(th,4,temp);
3477     emit_writeword_indexed(tl,0,temp);
3478   }
3479   //if(opcode[i]==0x39)
3480   /*if(opcode[i]==0x39||opcode[i]==0x31)
3481   {
3482     emit_pusha();
3483         emit_readword((int)&last_count,ECX);
3484         if(get_reg(i_regs->regmap,CCREG)<0)
3485           emit_loadreg(CCREG,HOST_CCREG);
3486         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3487         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3488         emit_writeword(HOST_CCREG,(int)&Count);
3489     emit_call((int)memdebug);
3490     emit_popa();
3491   }/**/
3492 #else
3493   cop1_unusable(i, i_regs);
3494 #endif
3495 }
3496
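// Fallback for ports that do not supply an architecture-specific
// implementation (assem_x86/x64/arm provide their own multdiv_assemble).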
3497 #ifndef multdiv_assemble
3498 void multdiv_assemble(int i,struct regstat *i_regs)
3499 {
3500   printf("Need multdiv_assemble for this architecture.\n");
3501   exit(1);
3502 }
3503 #endif
3504
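// MFHI/MFLO/MTHI/MTLO: copy the source register pair (rs1) into the
// destination pair (rt1), loading halves from memory when they are not
// cached in host registers.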
3505 void mov_assemble(int i,struct regstat *i_regs)
3506 {
3507   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3508   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3509   assert(rt1[i]>0);
3510   if(rt1[i]) {
3511     signed char sh,sl,th,tl;
3512     th=get_reg(i_regs->regmap,rt1[i]|64);
3513     tl=get_reg(i_regs->regmap,rt1[i]);
3514     //assert(tl>=0);
3515     if(tl>=0) {
3516       sh=get_reg(i_regs->regmap,rs1[i]|64);
3517       sl=get_reg(i_regs->regmap,rs1[i]);
3518       if(sl>=0) emit_mov(sl,tl);
3519       else emit_loadreg(rs1[i],tl);
3520       if(th>=0) {
3521         if(sh>=0) emit_mov(sh,th);
3522         else emit_loadreg(rs1[i]|64,th);
3523       }
3524     }
3525   }
3526 }
3527
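// Fallback for ports that do not supply an architecture-specific
// implementation of fconv_assemble.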
3528 #ifndef fconv_assemble
3529 void fconv_assemble(int i,struct regstat *i_regs)
3530 {
3531   printf("Need fconv_assemble for this architecture.\n");
3532   exit(1);
3533 }
3534 #endif
3535
3536 #if 0
3537 void float_assemble(int i,struct regstat *i_regs)
3538 {
3539   printf("Need float_assemble for this architecture.\n");
3540   exit(1);
3541 }
3542 #endif
3543
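// SYSCALL: load the PC of the faulting instruction into EAX, apply the cycle
// adjustment, and jump to the syscall exception handler.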
3544 void syscall_assemble(int i,struct regstat *i_regs)
3545 {
3546   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3547   assert(ccreg==HOST_CCREG);
3548   assert(!is_delayslot);
3549   emit_movimm(start+i*4,EAX); // Get PC
3550   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3551   emit_jmp((int)jump_syscall);
3552 }
3553
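// Assemble the instruction in a branch delay slot (sets is_delayslot around
// the normal per-type assemblers; branches themselves are not valid here).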
3554 void ds_assemble(int i,struct regstat *i_regs)
3555 {
3556   is_delayslot=1;
3557   switch(itype[i]) {
3558     case ALU:
3559       alu_assemble(i,i_regs);break;
3560     case IMM16:
3561       imm16_assemble(i,i_regs);break;
3562     case SHIFT:
3563       shift_assemble(i,i_regs);break;
3564     case SHIFTIMM:
3565       shiftimm_assemble(i,i_regs);break;
3566     case LOAD:
3567       load_assemble(i,i_regs);break;
3568     case LOADLR:
3569       loadlr_assemble(i,i_regs);break;
3570     case STORE:
3571       store_assemble(i,i_regs);break;
3572     case STORELR:
3573       storelr_assemble(i,i_regs);break;
3574     case COP0:
3575       cop0_assemble(i,i_regs);break;
3576     case COP1:
3577       cop1_assemble(i,i_regs);break;
3578     case C1LS:
3579       c1ls_assemble(i,i_regs);break;
3580     case FCONV:
3581       fconv_assemble(i,i_regs);break;
3582     case FLOAT:
3583       float_assemble(i,i_regs);break;
3584     case FCOMP:
3585       fcomp_assemble(i,i_regs);break;
3586     case MULTDIV:
3587       multdiv_assemble(i,i_regs);break;
3588     case MOV:
3589       mov_assemble(i,i_regs);break;
3590     case SYSCALL:
3591     case SPAN:
3592     case UJUMP:
3593     case RJUMP:
3594     case CJUMP:
3595     case SJUMP:
3596     case FJUMP:
3597       printf("Jump in the delay slot.  This is probably a bug.\n");
3598   }
3599   is_delayslot=0;
3600 }
3601
3602 // Is the branch target a valid internal jump?
3603 int internal_branch(uint64_t i_is32,int addr)
3604 {
3605   if(addr&1) return 0; // Indirect (register) jump
3606   if(addr>=start && addr<start+slen*4-4)
3607   {
3608     int t=(addr-start)>>2;
3609     // Delay slots are not valid branch targets
3610     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3611     // 64 -> 32 bit transition requires a recompile
3612     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3613     {
3614       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3615       else printf("optimizable: yes\n");
3616     }*/
3617     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3618     if(requires_32bit[t]&~i_is32) return 0;
3619     else return 1;
3620   }
3621   return 0;
3622 }
3623
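// When the register mapping changes between pre[] and entry[], write back any
// dirty value that would otherwise be lost, then move registers that merely
// change host register.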
3624 #ifndef wb_invalidate
3625 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3626   uint64_t u,uint64_t uu)
3627 {
3628   int hr;
3629   for(hr=0;hr<HOST_REGS;hr++) {
3630     if(hr!=EXCLUDE_REG) {
3631       if(pre[hr]!=entry[hr]) {
3632         if(pre[hr]>=0) {
3633           if((dirty>>hr)&1) {
3634             if(get_reg(entry,pre[hr])<0) {
3635               if(pre[hr]<64) {
3636                 if(!((u>>pre[hr])&1)) {
3637                   emit_storereg(pre[hr],hr);
3638                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3639                     emit_sarimm(hr,31,hr);
3640                     emit_storereg(pre[hr]|64,hr);
3641                   }
3642                 }
3643               }else{
3644                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3645                   emit_storereg(pre[hr],hr);
3646                 }
3647               }
3648             }
3649           }
3650         }
3651       }
3652     }
3653   }
3654   // Move from one register to another (no writeback)
3655   for(hr=0;hr<HOST_REGS;hr++) {
3656     if(hr!=EXCLUDE_REG) {
3657       if(pre[hr]!=entry[hr]) {
3658         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3659           int nr;
3660           if((nr=get_reg(entry,pre[hr]))>=0) {
3661             emit_mov(hr,nr);
3662           }
3663         }
3664       }
3665     }
3666   }
3667 }
3668 #endif
3669
3670 // Load the specified registers
3671 // This only loads the registers given as arguments because
3672 // we don't want to load things that will be overwritten
3673 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3674 {
3675   int hr;
3676   // Load 32-bit regs
3677   for(hr=0;hr<HOST_REGS;hr++) {
3678     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3679       if(entry[hr]!=regmap[hr]) {
3680         if(regmap[hr]==rs1||regmap[hr]==rs2)
3681         {
3682           if(regmap[hr]==0) {
3683             emit_zeroreg(hr);
3684           }
3685           else
3686           {
3687             emit_loadreg(regmap[hr],hr);
3688           }
3689         }
3690       }
3691     }
3692   }
3693   // Load 64-bit regs
3694   for(hr=0;hr<HOST_REGS;hr++) {
3695     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3696       if(entry[hr]!=regmap[hr]) {
3697         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3698         {
3699           assert(regmap[hr]!=64);
3700           if((is32>>(regmap[hr]&63))&1) {
3701             int lr=get_reg(regmap,regmap[hr]-64);
3702             if(lr>=0)
3703               emit_sarimm(lr,31,hr);
3704             else
3705               emit_loadreg(regmap[hr],hr);
3706           }
3707           else
3708           {
3709             emit_loadreg(regmap[hr],hr);
3710           }
3711         }
3712       }
3713     }
3714   }
3715 }
3716
3717 // Load registers prior to the start of a loop
3718 // so that they are not loaded within the loop
3719 static void loop_preload(signed char pre[],signed char entry[])
3720 {
3721   int hr;
3722   for(hr=0;hr<HOST_REGS;hr++) {
3723     if(hr!=EXCLUDE_REG) {
3724       if(pre[hr]!=entry[hr]) {
3725         if(entry[hr]>=0) {
3726           if(get_reg(pre,entry[hr])<0) {
3727             assem_debug("loop preload:\n");
3728             //printf("loop preload: %d\n",hr);
3729             if(entry[hr]==0) {
3730               emit_zeroreg(hr);
3731             }
3732             else if(entry[hr]<TEMPREG)
3733             {
3734               emit_loadreg(entry[hr],hr);
3735             }
3736             else if(entry[hr]-64<TEMPREG)
3737             {
3738               emit_loadreg(entry[hr],hr);
3739             }
3740           }
3741         }
3742       }
3743     }
3744   }
3745 }
3746
3747 // Generate address for load/store instruction
3748 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3749 {
3750   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3751     int ra;
3752     int agr=AGEN1+(i&1);
3753     int mgr=MGEN1+(i&1);
3754     if(itype[i]==LOAD) {
3755       ra=get_reg(i_regs->regmap,rt1[i]);
3756       //if(rt1[i]) assert(ra>=0);
3757     }
3758     if(itype[i]==LOADLR) {
3759       ra=get_reg(i_regs->regmap,FTEMP);
3760     }
3761     if(itype[i]==STORE||itype[i]==STORELR) {
3762       ra=get_reg(i_regs->regmap,agr);
3763       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3764     }
3765     if(itype[i]==C1LS) {
3766       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3767         ra=get_reg(i_regs->regmap,FTEMP);
3768       else { // SWC1/SDC1
3769         ra=get_reg(i_regs->regmap,agr);
3770         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3771       }
3772     }
3773     int rs=get_reg(i_regs->regmap,rs1[i]);
3774     int rm=get_reg(i_regs->regmap,TLREG);
3775     if(ra>=0) {
3776       int offset=imm[i];
3777       int c=(i_regs->wasconst>>rs)&1;
3778       if(rs1[i]==0) {
3779         // Using r0 as a base address
3780         /*if(rm>=0) {
3781           if(!entry||entry[rm]!=mgr) {
3782             generate_map_const(offset,rm);
3783           } // else did it in the previous cycle
3784         }*/
3785         if(!entry||entry[ra]!=agr) {
3786           if (opcode[i]==0x22||opcode[i]==0x26) {
3787             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3788           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3789             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3790           }else{
3791             emit_movimm(offset,ra);
3792           }
3793         } // else did it in the previous cycle
3794       }
3795       else if(rs<0) {
3796         if(!entry||entry[ra]!=rs1[i])
3797           emit_loadreg(rs1[i],ra);
3798         //if(!entry||entry[ra]!=rs1[i])
3799         //  printf("poor load scheduling!\n");
3800       }
3801       else if(c) {
3802         if(rm>=0) {
3803           if(!entry||entry[rm]!=mgr) {
3804             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3805               // Stores to memory go through the mapper to detect self-modifying
3806               // code, loads don't.
3807               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3808                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3809                 generate_map_const(constmap[i][rs]+offset,rm);
3810             }else{
3811               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3812                 generate_map_const(constmap[i][rs]+offset,rm);
3813             }
3814           }
3815         }
3816         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3817           if(!entry||entry[ra]!=agr) {
3818             if (opcode[i]==0x22||opcode[i]==0x26) {
3819               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3820             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3821               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3822             }else{
3823               #ifdef HOST_IMM_ADDR32
3824               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3825                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3826               #endif
3827               emit_movimm(constmap[i][rs]+offset,ra);
3828             }
3829           } // else did it in the previous cycle
3830         } // else load_consts already did it
3831       }
3832       if(offset&&!c&&rs1[i]) {
3833         if(rs>=0) {
3834           emit_addimm(rs,offset,ra);
3835         }else{
3836           emit_addimm(ra,offset,ra);
3837         }
3838       }
3839     }
3840   }
3841   // Preload constants for next instruction
3842   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3843     int agr,ra;
3844     #ifndef HOST_IMM_ADDR32
3845     // Mapper entry
3846     agr=MGEN1+((i+1)&1);
3847     ra=get_reg(i_regs->regmap,agr);
3848     if(ra>=0) {
3849       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3850       int offset=imm[i+1];
3851       int c=(regs[i+1].wasconst>>rs)&1;
3852       if(c) {
3853         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3854           // Stores to memory go through the mapper to detect self-modifying
3855           // code, loads don't.
3856           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3857              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3858             generate_map_const(constmap[i+1][rs]+offset,ra);
3859         }else{
3860           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3861             generate_map_const(constmap[i+1][rs]+offset,ra);
3862         }
3863       }
3864       /*else if(rs1[i]==0) {
3865         generate_map_const(offset,ra);
3866       }*/
3867     }
3868     #endif
3869     // Actual address
3870     agr=AGEN1+((i+1)&1);
3871     ra=get_reg(i_regs->regmap,agr);
3872     if(ra>=0) {
3873       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3874       int offset=imm[i+1];
3875       int c=(regs[i+1].wasconst>>rs)&1;
3876       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3877         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3878           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3879         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3880           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3881         }else{
3882           #ifdef HOST_IMM_ADDR32
3883           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3884              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3885           #endif
3886           emit_movimm(constmap[i+1][rs]+offset,ra);
3887         }
3888       }
3889       else if(rs1[i+1]==0) {
3890         // Using r0 as a base address
3891         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3892           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3893         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3894           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3895         }else{
3896           emit_movimm(offset,ra);
3897         }
3898       }
3899     }
3900   }
3901 }
3902
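// Determine the value that the constant in host register hr should finally be
// loaded with, scanning ahead while the mapping and constness hold (folding a
// following load's offset into the address when possible).  Returns 0 when the
// value does not need to be loaded.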
3903 int get_final_value(int hr, int i, int *value)
3904 {
3905   int reg=regs[i].regmap[hr];
3906   while(i<slen-1) {
3907     if(regs[i+1].regmap[hr]!=reg) break;
3908     if(!((regs[i+1].isconst>>hr)&1)) break;
3909     if(bt[i+1]) break;
3910     i++;
3911   }
3912   if(i<slen-1) {
3913     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3914       *value=constmap[i][hr];
3915       return 1;
3916     }
3917     if(!bt[i+1]) {
3918       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3919         // Load in delay slot, out-of-order execution
3920         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3921         {
3922           #ifdef HOST_IMM_ADDR32
3923           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3924           #endif
3925           // Precompute load address
3926           *value=constmap[i][hr]+imm[i+2];
3927           return 1;
3928         }
3929       }
3930       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3931       {
3932         #ifdef HOST_IMM_ADDR32
3933         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3934         #endif
3935         // Precompute load address
3936         *value=constmap[i][hr]+imm[i+1];
3937         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3938         return 1;
3939       }
3940     }
3941   }
3942   *value=constmap[i][hr];
3943   //printf("c=%x\n",(int)constmap[i][hr]);
3944   if(i==slen-1) return 1;
3945   if(reg<64) {
3946     return !((unneeded_reg[i+1]>>reg)&1);
3947   }else{
3948     return !((unneeded_reg_upper[i+1]>>reg)&1);
3949   }
3950 }
3951
3952 // Load registers with known constants
3953 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3954 {
3955   int hr;
3956   // Load 32-bit regs
3957   for(hr=0;hr<HOST_REGS;hr++) {
3958     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3959       //if(entry[hr]!=regmap[hr]) {
3960       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3961         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3962           int value;
3963           if(get_final_value(hr,i,&value)) {
3964             if(value==0) {
3965               emit_zeroreg(hr);
3966             }
3967             else {
3968               emit_movimm(value,hr);
3969             }
3970           }
3971         }
3972       }
3973     }
3974   }
3975   // Load 64-bit regs
3976   for(hr=0;hr<HOST_REGS;hr++) {
3977     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3978       //if(entry[hr]!=regmap[hr]) {
3979       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3980         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3981           if((is32>>(regmap[hr]&63))&1) {
3982             int lr=get_reg(regmap,regmap[hr]-64);
3983             assert(lr>=0);
3984             emit_sarimm(lr,31,hr);
3985           }
3986           else
3987           {
3988             int value;
3989             if(get_final_value(hr,i,&value)) {
3990               if(value==0) {
3991                 emit_zeroreg(hr);
3992               }
3993               else {
3994                 emit_movimm(value,hr);
3995               }
3996             }
3997           }
3998         }
3999       }
4000     }
4001   }
4002 }
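// Load constants into every dirty mapped register for this instruction
// (32-bit values as well as the upper halves of 64-bit mappings).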
4003 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4004 {
4005   int hr;
4006   // Load 32-bit regs
4007   for(hr=0;hr<HOST_REGS;hr++) {
4008     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4009       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4010         int value=constmap[i][hr];
4011         if(value==0) {
4012           emit_zeroreg(hr);
4013         }
4014         else {
4015           emit_movimm(value,hr);
4016         }
4017       }
4018     }
4019   }
4020   // Load 64-bit regs
4021   for(hr=0;hr<HOST_REGS;hr++) {
4022     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4023       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4024         if((is32>>(regmap[hr]&63))&1) {
4025           int lr=get_reg(regmap,regmap[hr]-64);
4026           assert(lr>=0);
4027           emit_sarimm(lr,31,hr);
4028         }
4029         else
4030         {
4031           int value=constmap[i][hr];
4032           if(value==0) {
4033             emit_zeroreg(hr);
4034           }
4035           else {
4036             emit_movimm(value,hr);
4037           }
4038         }
4039       }
4040     }
4041   }
4042 }
4043
4044 // Write out all dirty registers (except cycle count)
4045 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4046 {
4047   int hr;
4048   for(hr=0;hr<HOST_REGS;hr++) {
4049     if(hr!=EXCLUDE_REG) {
4050       if(i_regmap[hr]>0) {
4051         if(i_regmap[hr]!=CCREG) {
4052           if((i_dirty>>hr)&1) {
4053             if(i_regmap[hr]<64) {
4054               emit_storereg(i_regmap[hr],hr);
4055 #ifndef FORCE32
4056               if( ((i_is32>>i_regmap[hr])&1) ) {
4057                 #ifdef DESTRUCTIVE_WRITEBACK
4058                 emit_sarimm(hr,31,hr);
4059                 emit_storereg(i_regmap[hr]|64,hr);
4060                 #else
4061                 emit_sarimm(hr,31,HOST_TEMPREG);
4062                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4063                 #endif
4064               }
4065 #endif
4066             }else{
4067               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4068                 emit_storereg(i_regmap[hr],hr);
4069               }
4070             }
4071           }
4072         }
4073       }
4074     }
4075   }
4076 }
4077 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4078 // This writes the registers not written by store_regs_bt
4079 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4080 {
4081   int hr;
4082   int t=(addr-start)>>2;
4083   for(hr=0;hr<HOST_REGS;hr++) {
4084     if(hr!=EXCLUDE_REG) {
4085       if(i_regmap[hr]>0) {
4086         if(i_regmap[hr]!=CCREG) {
4087           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4088             if((i_dirty>>hr)&1) {
4089               if(i_regmap[hr]<64) {
4090                 emit_storereg(i_regmap[hr],hr);
4091 #ifndef FORCE32
4092                 if( ((i_is32>>i_regmap[hr])&1) ) {
4093                   #ifdef DESTRUCTIVE_WRITEBACK
4094                   emit_sarimm(hr,31,hr);
4095                   emit_storereg(i_regmap[hr]|64,hr);
4096                   #else
4097                   emit_sarimm(hr,31,HOST_TEMPREG);
4098                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4099                   #endif
4100                 }
4101 #endif
4102               }else{
4103                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4104                   emit_storereg(i_regmap[hr],hr);
4105                 }
4106               }
4107             }
4108           }
4109         }
4110       }
4111     }
4112   }
4113 }
4114
4115 // Load all registers (except cycle count)
4116 void load_all_regs(signed char i_regmap[])
4117 {
4118   int hr;
4119   for(hr=0;hr<HOST_REGS;hr++) {
4120     if(hr!=EXCLUDE_REG) {
4121       if(i_regmap[hr]==0) {
4122         emit_zeroreg(hr);
4123       }
4124       else
4125       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4126       {
4127         emit_loadreg(i_regmap[hr],hr);
4128       }
4129     }
4130   }
4131 }
4132
4133 // Load all current registers also needed by next instruction
4134 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4135 {
4136   int hr;
4137   for(hr=0;hr<HOST_REGS;hr++) {
4138     if(hr!=EXCLUDE_REG) {
4139       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4140         if(i_regmap[hr]==0) {
4141           emit_zeroreg(hr);
4142         }
4143         else
4144         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4145         {
4146           emit_loadreg(i_regmap[hr],hr);
4147         }
4148       }
4149     }
4150   }
4151 }
4152
4153 // Load all regs, storing cycle count if necessary
4154 void load_regs_entry(int t)
4155 {
4156   int hr;
4157   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4158   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4159   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4160     emit_storereg(CCREG,HOST_CCREG);
4161   }
4162   // Load 32-bit regs
4163   for(hr=0;hr<HOST_REGS;hr++) {
4164     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4165       if(regs[t].regmap_entry[hr]==0) {
4166         emit_zeroreg(hr);
4167       }
4168       else if(regs[t].regmap_entry[hr]!=CCREG)
4169       {
4170         emit_loadreg(regs[t].regmap_entry[hr],hr);
4171       }
4172     }
4173   }
4174   // Load 64-bit regs
4175   for(hr=0;hr<HOST_REGS;hr++) {
4176     if(regs[t].regmap_entry[hr]>=64) {
4177       assert(regs[t].regmap_entry[hr]!=64);
4178       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4179         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4180         if(lr<0) {
4181           emit_loadreg(regs[t].regmap_entry[hr],hr);
4182         }
4183         else
4184         {
4185           emit_sarimm(lr,31,hr);
4186         }
4187       }
4188       else
4189       {
4190         emit_loadreg(regs[t].regmap_entry[hr],hr);
4191       }
4192     }
4193   }
4194 }
4195
4196 // Store dirty registers prior to branch
4197 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4198 {
4199   if(internal_branch(i_is32,addr))
4200   {
4201     int t=(addr-start)>>2;
4202     int hr;
4203     for(hr=0;hr<HOST_REGS;hr++) {
4204       if(hr!=EXCLUDE_REG) {
4205         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4206           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4207             if((i_dirty>>hr)&1) {
4208               if(i_regmap[hr]<64) {
4209                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4210                   emit_storereg(i_regmap[hr],hr);
4211                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4212                     #ifdef DESTRUCTIVE_WRITEBACK
4213                     emit_sarimm(hr,31,hr);
4214                     emit_storereg(i_regmap[hr]|64,hr);
4215                     #else
4216                     emit_sarimm(hr,31,HOST_TEMPREG);
4217                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4218                     #endif
4219                   }
4220                 }
4221               }else{
4222                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4223                   emit_storereg(i_regmap[hr],hr);
4224                 }
4225               }
4226             }
4227           }
4228         }
4229       }
4230     }
4231   }
4232   else
4233   {
4234     // Branch out of this block, write out all dirty regs
4235     wb_dirtys(i_regmap,i_is32,i_dirty);
4236   }
4237 }
4238
4239 // Load all needed registers for branch target
4240 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4241 {
4242   //if(addr>=start && addr<(start+slen*4))
4243   if(internal_branch(i_is32,addr))
4244   {
4245     int t=(addr-start)>>2;
4246     int hr;
4247     // Store the cycle count before loading something else
4248     if(i_regmap[HOST_CCREG]!=CCREG) {
4249       assert(i_regmap[HOST_CCREG]==-1);
4250     }
4251     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4252       emit_storereg(CCREG,HOST_CCREG);
4253     }
4254     // Load 32-bit regs
4255     for(hr=0;hr<HOST_REGS;hr++) {
4256       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4257         #ifdef DESTRUCTIVE_WRITEBACK
4258         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4259         #else
4260         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4261         #endif
4262           if(regs[t].regmap_entry[hr]==0) {
4263             emit_zeroreg(hr);
4264           }
4265           else if(regs[t].regmap_entry[hr]!=CCREG)
4266           {
4267             emit_loadreg(regs[t].regmap_entry[hr],hr);
4268           }
4269         }
4270       }
4271     }
4272     // Load 64-bit regs
4273     for(hr=0;hr<HOST_REGS;hr++) {
4274       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4275         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4276           assert(regs[t].regmap_entry[hr]!=64);
4277           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4278             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4279             if(lr<0) {
4280               emit_loadreg(regs[t].regmap_entry[hr],hr);
4281             }
4282             else
4283             {
4284               emit_sarimm(lr,31,hr);
4285             }
4286           }
4287           else
4288           {
4289             emit_loadreg(regs[t].regmap_entry[hr],hr);
4290           }
4291         }
4292         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4293           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4294           assert(lr>=0);
4295           emit_sarimm(lr,31,hr);
4296         }
4297       }
4298     }
4299   }
4300 }
4301
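// Does the current register state (mapping, dirty bits, 32/64-bit widths)
// match what the branch target expects, so the branch can jump there directly
// without additional writeback?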
4302 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4303 {
4304   if(addr>=start && addr<start+slen*4-4)
4305   {
4306     int t=(addr-start)>>2;
4307     int hr;
4308     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4309     for(hr=0;hr<HOST_REGS;hr++)
4310     {
4311       if(hr!=EXCLUDE_REG)
4312       {
4313         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4314         {
4315           if(regs[t].regmap_entry[hr]!=-1)
4316           {
4317             return 0;
4318           }
4319           else 
4320           if((i_dirty>>hr)&1)
4321           {
4322             if(i_regmap[hr]<64)
4323             {
4324               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4325                 return 0;
4326             }
4327             else
4328             {
4329               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4330                 return 0;
4331             }
4332           }
4333         }
4334         else // Same register but is it 32-bit or dirty?
4335         if(i_regmap[hr]>=0)
4336         {
4337           if(!((regs[t].dirty>>hr)&1))
4338           {
4339             if((i_dirty>>hr)&1)
4340             {
4341               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4342               {
4343                 //printf("%x: dirty no match\n",addr);
4344                 return 0;
4345               }
4346             }
4347           }
4348           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4349           {
4350             //printf("%x: is32 no match\n",addr);
4351             return 0;
4352           }
4353         }
4354       }
4355     }
4356     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4357     if(requires_32bit[t]&~i_is32) return 0;
4358     // Delay slots are not valid branch targets
4359     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4360     // Delay slots require additional processing, so do not match
4361     if(is_ds[t]) return 0;
4362   }
4363   else
4364   {
4365     int hr;
4366     for(hr=0;hr<HOST_REGS;hr++)
4367     {
4368       if(hr!=EXCLUDE_REG)
4369       {
4370         if(i_regmap[hr]>=0)
4371         {
4372           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4373           {
4374             if((i_dirty>>hr)&1)
4375             {
4376               return 0;
4377             }
4378           }
4379         }
4380       }
4381     }
4382   }
4383   return 1;
4384 }
4385
4386 // Used when a branch jumps into the delay slot of another branch
4387 void ds_assemble_entry(int i)
4388 {
4389   int t=(ba[i]-start)>>2;
4390   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4391   assem_debug("Assemble delay slot at %x\n",ba[i]);
4392   assem_debug("<->\n");
4393   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4394     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4395   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4396   address_generation(t,&regs[t],regs[t].regmap_entry);
4397   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4398     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4399   cop1_usable=0;
4400   is_delayslot=0;
4401   switch(itype[t]) {
4402     case ALU:
4403       alu_assemble(t,&regs[t]);break;
4404     case IMM16:
4405       imm16_assemble(t,&regs[t]);break;
4406     case SHIFT:
4407       shift_assemble(t,&regs[t]);break;
4408     case SHIFTIMM:
4409       shiftimm_assemble(t,&regs[t]);break;
4410     case LOAD:
4411       load_assemble(t,&regs[t]);break;
4412     case LOADLR:
4413       loadlr_assemble(t,&regs[t]);break;
4414     case STORE:
4415       store_assemble(t,&regs[t]);break;
4416     case STORELR:
4417       storelr_assemble(t,&regs[t]);break;
4418     case COP0:
4419       cop0_assemble(t,&regs[t]);break;
4420     case COP1:
4421       cop1_assemble(t,&regs[t]);break;
4422     case C1LS:
4423       c1ls_assemble(t,&regs[t]);break;
4424     case FCONV:
4425       fconv_assemble(t,&regs[t]);break;
4426     case FLOAT:
4427       float_assemble(t,&regs[t]);break;
4428     case FCOMP:
4429       fcomp_assemble(t,&regs[t]);break;
4430     case MULTDIV:
4431       multdiv_assemble(t,&regs[t]);break;
4432     case MOV:
4433       mov_assemble(t,&regs[t]);break;
4434     case SYSCALL:
4435     case SPAN:
4436     case UJUMP:
4437     case RJUMP:
4438     case CJUMP:
4439     case SJUMP:
4440     case FJUMP:
4441       printf("Jump in the delay slot.  This is probably a bug.\n");
4442   }
4443   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4444   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4445   if(internal_branch(regs[t].is32,ba[i]+4))
4446     assem_debug("branch: internal\n");
4447   else
4448     assem_debug("branch: external\n");
4449   assert(internal_branch(regs[t].is32,ba[i]+4));
4450   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4451   emit_jmp(0);
4452 }
4453
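// Emit the cycle-count test for a branch: compute the cycle adjustment for the
// target, detect simple idle loops, and add a CC_STUB that is taken when the
// count expires.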
4454 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4455 {
4456   int count;
4457   int jaddr;
4458   int idle=0;
4459   if(itype[i]==RJUMP)
4460   {
4461     *adj=0;
4462   }
4463   //if(ba[i]>=start && ba[i]<(start+slen*4))
4464   if(internal_branch(branch_regs[i].is32,ba[i]))
4465   {
4466     int t=(ba[i]-start)>>2;
4467     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4468     else *adj=ccadj[t];
4469   }
4470   else
4471   {
4472     *adj=0;
4473   }
4474   count=ccadj[i];
4475   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4476     // Idle loop
4477     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4478     idle=(int)out;
4479     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4480     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4481     jaddr=(int)out;
4482     emit_jmp(0);
4483   }
4484   else if(*adj==0||invert) {
4485     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4486     jaddr=(int)out;
4487     emit_jns(0);
4488   }
4489   else
4490   {
4491     emit_cmpimm(HOST_CCREG,-2*(count+2));
4492     jaddr=(int)out;
4493     emit_jns(0);
4494   }
4495   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4496 }
4497
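// Out-of-line handler for an expired cycle count at a branch: write back dirty
// registers, store the return PC (recomputing the branch outcome when it is not
// known), call cc_interrupt, then reload registers and return to the compiled
// code.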
4498 void do_ccstub(int n)
4499 {
4500   literal_pool(256);
4501   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4502   set_jump_target(stubs[n][1],(int)out);
4503   int i=stubs[n][4];
4504   if(stubs[n][6]==NULLDS) {
4505     // Delay slot instruction is nullified ("likely" branch)
4506     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4507   }
4508   else if(stubs[n][6]!=TAKEN) {
4509     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4510   }
4511   else {
4512     if(internal_branch(branch_regs[i].is32,ba[i]))
4513       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4514   }
4515   if(stubs[n][5]!=-1)
4516   {
4517     // Save PC as return address
4518     emit_movimm(stubs[n][5],EAX);
4519     emit_writeword(EAX,(int)&pcaddr);
4520   }
4521   else
4522   {
4523     // Return address depends on which way the branch goes
4524     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4525     {
4526       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4527       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4528       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4529       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4530       if(rs1[i]==0)
4531       {
4532         s1l=s2l;s1h=s2h;
4533         s2l=s2h=-1;
4534       }
4535       else if(rs2[i]==0)
4536       {
4537         s2l=s2h=-1;
4538       }
4539       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4540         s1h=s2h=-1;
4541       }
4542       assert(s1l>=0);
4543       #ifdef DESTRUCTIVE_WRITEBACK
4544       if(rs1[i]) {
4545         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4546           emit_loadreg(rs1[i],s1l);
4547       } 
4548       else {
4549         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4550           emit_loadreg(rs2[i],s1l);
4551       }
4552       if(s2l>=0)
4553         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4554           emit_loadreg(rs2[i],s2l);
4555       #endif
4556       int hr=0;
4557       int addr,alt,ntaddr;
4558       while(hr<HOST_REGS)
4559       {
4560         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4561            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4562            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4563         {
4564           addr=hr++;break;
4565         }
4566         hr++;
4567       }
4568       while(hr<HOST_REGS)
4569       {
4570         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4571            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4572            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4573         {
4574           alt=hr++;break;
4575         }
4576         hr++;
4577       }
4578       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4579       {
4580         while(hr<HOST_REGS)
4581         {
4582           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4583              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4584              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4585           {
4586             ntaddr=hr;break;
4587           }
4588           hr++;
4589         }
4590         assert(hr<HOST_REGS);
4591       }
4592       if((opcode[i]&0x2f)==4) // BEQ
4593       {
4594         #ifdef HAVE_CMOV_IMM
4595         if(s1h<0) {
4596           if(s2l>=0) emit_cmp(s1l,s2l);
4597           else emit_test(s1l,s1l);
4598           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4599         }
4600         else
4601         #endif
4602         {
4603           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4604           if(s1h>=0) {
4605             if(s2h>=0) emit_cmp(s1h,s2h);
4606             else emit_test(s1h,s1h);
4607             emit_cmovne_reg(alt,addr);
4608           }
4609           if(s2l>=0) emit_cmp(s1l,s2l);
4610           else emit_test(s1l,s1l);
4611           emit_cmovne_reg(alt,addr);
4612         }
4613       }
4614       if((opcode[i]&0x2f)==5) // BNE
4615       {
4616         #ifdef HAVE_CMOV_IMM
4617         if(s1h<0) {
4618           if(s2l>=0) emit_cmp(s1l,s2l);
4619           else emit_test(s1l,s1l);
4620           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4621         }
4622         else
4623         #endif
4624         {
4625           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4626           if(s1h>=0) {
4627             if(s2h>=0) emit_cmp(s1h,s2h);
4628             else emit_test(s1h,s1h);
4629             emit_cmovne_reg(alt,addr);
4630           }
4631           if(s2l>=0) emit_cmp(s1l,s2l);
4632           else emit_test(s1l,s1l);
4633           emit_cmovne_reg(alt,addr);
4634         }
4635       }
4636       if((opcode[i]&0x2f)==6) // BLEZ
4637       {
4638         //emit_movimm(ba[i],alt);
4639         //emit_movimm(start+i*4+8,addr);
4640         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4641         emit_cmpimm(s1l,1);
4642         if(s1h>=0) emit_mov(addr,ntaddr);
4643         emit_cmovl_reg(alt,addr);
4644         if(s1h>=0) {
4645           emit_test(s1h,s1h);
4646           emit_cmovne_reg(ntaddr,addr);
4647           emit_cmovs_reg(alt,addr);
4648         }
4649       }
4650       if((opcode[i]&0x2f)==7) // BGTZ
4651       {
4652         //emit_movimm(ba[i],addr);
4653         //emit_movimm(start+i*4+8,ntaddr);
4654         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4655         emit_cmpimm(s1l,1);
4656         if(s1h>=0) emit_mov(addr,alt);
4657         emit_cmovl_reg(ntaddr,addr);
4658         if(s1h>=0) {
4659           emit_test(s1h,s1h);
4660           emit_cmovne_reg(alt,addr);
4661           emit_cmovs_reg(ntaddr,addr);
4662         }
4663       }
4664       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4665       {
4666         //emit_movimm(ba[i],alt);
4667         //emit_movimm(start+i*4+8,addr);
4668         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4669         if(s1h>=0) emit_test(s1h,s1h);
4670         else emit_test(s1l,s1l);
4671         emit_cmovs_reg(alt,addr);
4672       }
4673       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4674       {
4675         //emit_movimm(ba[i],addr);
4676         //emit_movimm(start+i*4+8,alt);
4677         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4678         if(s1h>=0) emit_test(s1h,s1h);
4679         else emit_test(s1l,s1l);
4680         emit_cmovs_reg(alt,addr);
4681       }
4682       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4683         if(source[i]&0x10000) // BC1T
4684         {
4685           //emit_movimm(ba[i],alt);
4686           //emit_movimm(start+i*4+8,addr);
4687           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4688           emit_testimm(s1l,0x800000);
4689           emit_cmovne_reg(alt,addr);
4690         }
4691         else // BC1F
4692         {
4693           //emit_movimm(ba[i],addr);
4694           //emit_movimm(start+i*4+8,alt);
4695           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4696           emit_testimm(s1l,0x800000);
4697           emit_cmovne_reg(alt,addr);
4698         }
4699       }
4700       emit_writeword(addr,(int)&pcaddr);
4701     }
4702     else
4703     if(itype[i]==RJUMP)
4704     {
4705       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4706       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4707         r=get_reg(branch_regs[i].regmap,RTEMP);
4708       }
4709       emit_writeword(r,(int)&pcaddr);
4710     }
4711     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4712   }
4713   // Update cycle count
4714   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4715   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4716   emit_call((int)cc_interrupt);
4717   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4718   if(stubs[n][6]==TAKEN) {
4719     if(internal_branch(branch_regs[i].is32,ba[i]))
4720       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4721     else if(itype[i]==RJUMP) {
4722       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4723         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4724       else
4725         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4726     }
4727   }else if(stubs[n][6]==NOTTAKEN) {
4728     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4729     else load_all_regs(branch_regs[i].regmap);
4730   }else if(stubs[n][6]==NULLDS) {
4731     // Delay slot instruction is nullified ("likely" branch)
4732     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4733     else load_all_regs(regs[i].regmap);
4734   }else{
4735     load_all_regs(branch_regs[i].regmap);
4736   }
4737   emit_jmp(stubs[n][2]); // return address
4738   
4739   /* This works but uses a lot of memory...
4740   emit_readword((int)&last_count,ECX);
4741   emit_add(HOST_CCREG,ECX,EAX);
4742   emit_writeword(EAX,(int)&Count);
4743   emit_call((int)gen_interupt);
4744   emit_readword((int)&Count,HOST_CCREG);
4745   emit_readword((int)&next_interupt,EAX);
4746   emit_readword((int)&pending_exception,EBX);
4747   emit_writeword(EAX,(int)&last_count);
4748   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4749   emit_test(EBX,EBX);
4750   int jne_instr=(int)out;
4751   emit_jne(0);
4752   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4753   load_all_regs(branch_regs[i].regmap);
4754   emit_jmp(stubs[n][2]); // return address
4755   set_jump_target(jne_instr,(int)out);
4756   emit_readword((int)&pcaddr,EAX);
4757   // Call get_addr_ht instead of doing the hash table here.
4758   // This code is executed infrequently and takes up a lot of space
4759   // so smaller is better.
4760   emit_storereg(CCREG,HOST_CCREG);
4761   emit_pushreg(EAX);
4762   emit_call((int)get_addr_ht);
4763   emit_loadreg(CCREG,HOST_CCREG);
4764   emit_addimm(ESP,4,ESP);
4765   emit_jmpreg(EAX);*/
4766 }
4767
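// Record a direct jump for later linking: addr is the location of the emitted
// jump, target the guest address it should reach, ext flags a target outside
// the current block.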
4768 void add_to_linker(int addr,int target,int ext)
4769 {
4770   link_addr[linkcount][0]=addr;
4771   link_addr[linkcount][1]=target;
4772   link_addr[linkcount][2]=ext;  
4773   linkcount++;
4774 }
4775
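// J/JAL: assemble the delay slot first, set the link register for JAL, write
// back/load registers for the target, do the cycle-count check, then emit the
// jump (linked later unless it targets an internal delay slot).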
4776 void ujump_assemble(int i,struct regstat *i_regs)
4777 {
4778   signed char *i_regmap=i_regs->regmap;
4779   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4780   address_generation(i+1,i_regs,regs[i].regmap_entry);
4781   #ifdef REG_PREFETCH
4782   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4783   if(rt1[i]==31&&temp>=0) 
4784   {
4785     int return_address=start+i*4+8;
4786     if(get_reg(branch_regs[i].regmap,31)>0) 
4787     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4788   }
4789   #endif
4790   ds_assemble(i+1,i_regs);
4791   uint64_t bc_unneeded=branch_regs[i].u;
4792   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4793   bc_unneeded|=1|(1LL<<rt1[i]);
4794   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4795   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4796                 bc_unneeded,bc_unneeded_upper);
4797   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4798   if(rt1[i]==31) {
4799     int rt;
4800     unsigned int return_address;
4801     assert(rt1[i+1]!=31);
4802     assert(rt2[i+1]!=31);
4803     rt=get_reg(branch_regs[i].regmap,31);
4804     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4805     //assert(rt>=0);
4806     return_address=start+i*4+8;
4807     if(rt>=0) {
4808       #ifdef USE_MINI_HT
4809       if(internal_branch(branch_regs[i].is32,return_address)) {
4810         int temp=rt+1;
4811         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4812            branch_regs[i].regmap[temp]>=0)
4813         {
4814           temp=get_reg(branch_regs[i].regmap,-1);
4815         }
4816         #ifdef HOST_TEMPREG
4817         if(temp<0) temp=HOST_TEMPREG;
4818         #endif
4819         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4820         else emit_movimm(return_address,rt);
4821       }
4822       else
4823       #endif
4824       {
4825         #ifdef REG_PREFETCH
4826         if(temp>=0) 
4827         {
4828           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4829         }
4830         #endif
4831         emit_movimm(return_address,rt); // PC into link register
4832         #ifdef IMM_PREFETCH
4833         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4834         #endif
4835       }
4836     }
4837   }
4838   int cc,adj;
4839   cc=get_reg(branch_regs[i].regmap,CCREG);
4840   assert(cc==HOST_CCREG);
4841   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4842   #ifdef REG_PREFETCH
4843   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4844   #endif
4845   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4846   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4847   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4848   if(internal_branch(branch_regs[i].is32,ba[i]))
4849     assem_debug("branch: internal\n");
4850   else
4851     assem_debug("branch: external\n");
4852   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4853     ds_assemble_entry(i);
4854   }
4855   else {
4856     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4857     emit_jmp(0);
4858   }
4859 }
4860
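// JR/JALR: assemble the delay slot (copying the target register first if the
// slot clobbers it), set the link register for JALR, then dispatch through
// jump_vaddr_reg (or the mini hash table for jr $ra when USE_MINI_HT is set).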
4861 void rjump_assemble(int i,struct regstat *i_regs)
4862 {
4863   signed char *i_regmap=i_regs->regmap;
4864   int temp;
4865   int rs,cc,adj;
4866   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4867   assert(rs>=0);
4868   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4869     // Delay slot abuse, make a copy of the branch address register
4870     temp=get_reg(branch_regs[i].regmap,RTEMP);
4871     assert(temp>=0);
4872     assert(regs[i].regmap[temp]==RTEMP);
4873     emit_mov(rs,temp);
4874     rs=temp;
4875   }
4876   address_generation(i+1,i_regs,regs[i].regmap_entry);
4877   #ifdef REG_PREFETCH
4878   if(rt1[i]==31) 
4879   {
4880     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4881       int return_address=start+i*4+8;
4882       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4883     }
4884   }
4885   #endif
4886   #ifdef USE_MINI_HT
4887   if(rs1[i]==31) {
4888     int rh=get_reg(regs[i].regmap,RHASH);
4889     if(rh>=0) do_preload_rhash(rh);
4890   }
4891   #endif
4892   ds_assemble(i+1,i_regs);
4893   uint64_t bc_unneeded=branch_regs[i].u;
4894   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4895   bc_unneeded|=1|(1LL<<rt1[i]);
4896   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4897   bc_unneeded&=~(1LL<<rs1[i]);
4898   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4899                 bc_unneeded,bc_unneeded_upper);
4900   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4901   if(rt1[i]==31) {
4902     int rt,return_address;
4903     assert(rt1[i+1]!=31);
4904     assert(rt2[i+1]!=31);
4905     rt=get_reg(branch_regs[i].regmap,31);
4906     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4907     assert(rt>=0);
4908     return_address=start+i*4+8;
4909     #ifdef REG_PREFETCH
4910     if(temp>=0) 
4911     {
4912       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4913     }
4914     #endif
4915     emit_movimm(return_address,rt); // PC into link register
4916     #ifdef IMM_PREFETCH
4917     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4918     #endif
4919   }
4920   cc=get_reg(branch_regs[i].regmap,CCREG);
4921   assert(cc==HOST_CCREG);
4922   #ifdef USE_MINI_HT
4923   int rh=get_reg(branch_regs[i].regmap,RHASH);
4924   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4925   if(rs1[i]==31) {
4926     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4927     do_preload_rhtbl(ht);
4928     do_rhash(rs,rh);
4929   }
4930   #endif
4931   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4932   #ifdef DESTRUCTIVE_WRITEBACK
4933   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4934     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4935       emit_loadreg(rs1[i],rs);
4936     }
4937   }
4938   #endif
4939   #ifdef REG_PREFETCH
4940   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4941   #endif
4942   #ifdef USE_MINI_HT
4943   if(rs1[i]==31) {
4944     do_miniht_load(ht,rh);
4945   }
4946   #endif
4947   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4948   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4949   //assert(adj==0);
4950   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
4951   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4952   emit_jns(0);
4953   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4954   #ifdef USE_MINI_HT
4955   if(rs1[i]==31) {
4956     do_miniht_jump(rs,rh,ht);
4957   }
4958   else
4959   #endif
4960   {
4961     //if(rs!=EAX) emit_mov(rs,EAX);
4962     //emit_jmp((int)jump_vaddr_eax);
4963     emit_jmp(jump_vaddr_reg[rs]);
4964   }
4965   /* Check hash table
4966   temp=!rs;
4967   emit_mov(rs,temp);
4968   emit_shrimm(rs,16,rs);
4969   emit_xor(temp,rs,rs);
4970   emit_movzwl_reg(rs,rs);
4971   emit_shlimm(rs,4,rs);
4972   emit_cmpmem_indexed((int)hash_table,rs,temp);
4973   emit_jne((int)out+14);
4974   emit_readword_indexed((int)hash_table+4,rs,rs);
4975   emit_jmpreg(rs);
4976   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4977   emit_addimm_no_flags(8,rs);
4978   emit_jeq((int)out-17);
4979   // No hit on hash table, call compiler
4980   emit_pushreg(temp);
4981 //DEBUG >
4982 #ifdef DEBUG_CYCLE_COUNT
4983   emit_readword((int)&last_count,ECX);
4984   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4985   emit_readword((int)&next_interupt,ECX);
4986   emit_writeword(HOST_CCREG,(int)&Count);
4987   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4988   emit_writeword(ECX,(int)&last_count);
4989 #endif
4990 //DEBUG <
4991   emit_storereg(CCREG,HOST_CCREG);
4992   emit_call((int)get_addr);
4993   emit_loadreg(CCREG,HOST_CCREG);
4994   emit_addimm(ESP,4,ESP);
4995   emit_jmpreg(EAX);*/
4996   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4997   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4998   #endif
4999 }
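
/* Illustrative sketch (not used by the dynarec; helper name is hypothetical):
   how the link value written above is derived.  For a jump/branch-and-link at
   virtual address pc, the delay slot sits at pc+4 and execution resumes at
   pc+8, which is what "return_address=start+i*4+8" computes for the
   instruction at index i. */
static unsigned int example_link_address(unsigned int block_start,int i)
{
  unsigned int pc=block_start+i*4; // address of the jump itself
  return pc+8;                     // skip the jump and its delay slot
}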
5000
5001 void cjump_assemble(int i,struct regstat *i_regs)
5002 {
5003   signed char *i_regmap=i_regs->regmap;
5004   int cc;
5005   int match;
5006   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5007   assem_debug("match=%d\n",match);
5008   int s1h,s1l,s2h,s2l;
5009   int prev_cop1_usable=cop1_usable;
5010   int unconditional=0,nop=0;
5011   int only32=0;
5012   int ooo=1;
5013   int invert=0;
5014   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5015   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5016   if(likely[i]) ooo=0;
5017   if(!match) invert=1;
5018   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5019   if(i>(ba[i]-start)>>2) invert=1;
5020   #endif
5021     
5022   if(ooo&&
5023      ((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5024       (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))))
5025   {
5026     // Write-after-read dependency prevents out of order execution
5027     // First test branch condition, then execute delay slot, then branch
5028     ooo=0;
5029   }
5030
5031   if(ooo) {
5032     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5033     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5034     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5035     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5036   }
5037   else {
5038     s1l=get_reg(i_regmap,rs1[i]);
5039     s1h=get_reg(i_regmap,rs1[i]|64);
5040     s2l=get_reg(i_regmap,rs2[i]);
5041     s2h=get_reg(i_regmap,rs2[i]|64);
5042   }
5043   if(rs1[i]==0&&rs2[i]==0)
5044   {
5045     if(opcode[i]&1) nop=1;
5046     else unconditional=1;
5047     //assert(opcode[i]!=5);
5048     //assert(opcode[i]!=7);
5049     //assert(opcode[i]!=0x15);
5050     //assert(opcode[i]!=0x17);
5051   }
5052   else if(rs1[i]==0)
5053   {
5054     s1l=s2l;s1h=s2h;
5055     s2l=s2h=-1;
5056     only32=(regs[i].was32>>rs2[i])&1;
5057   }
5058   else if(rs2[i]==0)
5059   {
5060     s2l=s2h=-1;
5061     only32=(regs[i].was32>>rs1[i])&1;
5062   }
5063   else {
5064     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5065   }
5066
5067   if(ooo) {
5068     // Out of order execution (delay slot first)
5069     //printf("OOOE\n");
5070     address_generation(i+1,i_regs,regs[i].regmap_entry);
5071     ds_assemble(i+1,i_regs);
5072     int adj;
5073     uint64_t bc_unneeded=branch_regs[i].u;
5074     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5075     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5076     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5077     bc_unneeded|=1;
5078     bc_unneeded_upper|=1;
5079     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5080                   bc_unneeded,bc_unneeded_upper);
5081     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5082     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5083     cc=get_reg(branch_regs[i].regmap,CCREG);
5084     assert(cc==HOST_CCREG);
5085     if(unconditional) 
5086       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5087     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5088     //assem_debug("cycle count (adj)\n");
5089     if(unconditional) {
5090       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5091       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5092         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5093         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5094         if(internal)
5095           assem_debug("branch: internal\n");
5096         else
5097           assem_debug("branch: external\n");
5098         if(internal&&is_ds[(ba[i]-start)>>2]) {
5099           ds_assemble_entry(i);
5100         }
5101         else {
5102           add_to_linker((int)out,ba[i],internal);
5103           emit_jmp(0);
5104         }
5105         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5106         if(((u_int)out)&7) emit_addnop(0);
5107         #endif
5108       }
5109     }
5110     else if(nop) {
5111       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5112       int jaddr=(int)out;
5113       emit_jns(0);
5114       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5115     }
5116     else {
5117       int taken=0,nottaken=0,nottaken1=0;
5118       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5119       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5120       if(!only32)
5121       {
5122         assert(s1h>=0);
5123         if(opcode[i]==4) // BEQ
5124         {
5125           if(s2h>=0) emit_cmp(s1h,s2h);
5126           else emit_test(s1h,s1h);
5127           nottaken1=(int)out;
5128           emit_jne(1);
5129         }
5130         if(opcode[i]==5) // BNE
5131         {
5132           if(s2h>=0) emit_cmp(s1h,s2h);
5133           else emit_test(s1h,s1h);
5134           if(invert) taken=(int)out;
5135           else add_to_linker((int)out,ba[i],internal);
5136           emit_jne(0);
5137         }
5138         if(opcode[i]==6) // BLEZ
5139         {
5140           emit_test(s1h,s1h);
5141           if(invert) taken=(int)out;
5142           else add_to_linker((int)out,ba[i],internal);
5143           emit_js(0);
5144           nottaken1=(int)out;
5145           emit_jne(1);
5146         }
5147         if(opcode[i]==7) // BGTZ
5148         {
5149           emit_test(s1h,s1h);
5150           nottaken1=(int)out;
5151           emit_js(1);
5152           if(invert) taken=(int)out;
5153           else add_to_linker((int)out,ba[i],internal);
5154           emit_jne(0);
5155         }
5156       } // if(!only32)
5157           
5158       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5159       assert(s1l>=0);
5160       if(opcode[i]==4) // BEQ
5161       {
5162         if(s2l>=0) emit_cmp(s1l,s2l);
5163         else emit_test(s1l,s1l);
5164         if(invert){
5165           nottaken=(int)out;
5166           emit_jne(1);
5167         }else{
5168           add_to_linker((int)out,ba[i],internal);
5169           emit_jeq(0);
5170         }
5171       }
5172       if(opcode[i]==5) // BNE
5173       {
5174         if(s2l>=0) emit_cmp(s1l,s2l);
5175         else emit_test(s1l,s1l);
5176         if(invert){
5177           nottaken=(int)out;
5178           emit_jeq(1);
5179         }else{
5180           add_to_linker((int)out,ba[i],internal);
5181           emit_jne(0);
5182         }
5183       }
5184       if(opcode[i]==6) // BLEZ
5185       {
5186         emit_cmpimm(s1l,1);
5187         if(invert){
5188           nottaken=(int)out;
5189           emit_jge(1);
5190         }else{
5191           add_to_linker((int)out,ba[i],internal);
5192           emit_jl(0);
5193         }
5194       }
5195       if(opcode[i]==7) // BGTZ
5196       {
5197         emit_cmpimm(s1l,1);
5198         if(invert){
5199           nottaken=(int)out;
5200           emit_jl(1);
5201         }else{
5202           add_to_linker((int)out,ba[i],internal);
5203           emit_jge(0);
5204         }
5205       }
5206       if(invert) {
5207         if(taken) set_jump_target(taken,(int)out);
5208         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5209         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5210           if(adj) {
5211             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5212             add_to_linker((int)out,ba[i],internal);
5213           }else{
5214             emit_addnop(13);
5215             add_to_linker((int)out,ba[i],internal*2);
5216           }
5217           emit_jmp(0);
5218         }else
5219         #endif
5220         {
5221           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5222           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5223           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5224           if(internal)
5225             assem_debug("branch: internal\n");
5226           else
5227             assem_debug("branch: external\n");
5228           if(internal&&is_ds[(ba[i]-start)>>2]) {
5229             ds_assemble_entry(i);
5230           }
5231           else {
5232             add_to_linker((int)out,ba[i],internal);
5233             emit_jmp(0);
5234           }
5235         }
5236         set_jump_target(nottaken,(int)out);
5237       }
5238
5239       if(nottaken1) set_jump_target(nottaken1,(int)out);
5240       if(adj) {
5241         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5242       }
5243     } // (!unconditional)
5244   } // if(ooo)
5245   else
5246   {
5247     // In-order execution (branch first)
5248     //if(likely[i]) printf("IOL\n");
5249     //else
5250     //printf("IOE\n");
5251     int taken=0,nottaken=0,nottaken1=0;
5252     if(!unconditional&&!nop) {
5253       if(!only32)
5254       {
5255         assert(s1h>=0);
5256         if((opcode[i]&0x2f)==4) // BEQ
5257         {
5258           if(s2h>=0) emit_cmp(s1h,s2h);
5259           else emit_test(s1h,s1h);
5260           nottaken1=(int)out;
5261           emit_jne(2);
5262         }
5263         if((opcode[i]&0x2f)==5) // BNE
5264         {
5265           if(s2h>=0) emit_cmp(s1h,s2h);
5266           else emit_test(s1h,s1h);
5267           taken=(int)out;
5268           emit_jne(1);
5269         }
5270         if((opcode[i]&0x2f)==6) // BLEZ
5271         {
5272           emit_test(s1h,s1h);
5273           taken=(int)out;
5274           emit_js(1);
5275           nottaken1=(int)out;
5276           emit_jne(2);
5277         }
5278         if((opcode[i]&0x2f)==7) // BGTZ
5279         {
5280           emit_test(s1h,s1h);
5281           nottaken1=(int)out;
5282           emit_js(2);
5283           taken=(int)out;
5284           emit_jne(1);
5285         }
5286       } // if(!only32)
5287           
5288       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5289       assert(s1l>=0);
5290       if((opcode[i]&0x2f)==4) // BEQ
5291       {
5292         if(s2l>=0) emit_cmp(s1l,s2l);
5293         else emit_test(s1l,s1l);
5294         nottaken=(int)out;
5295         emit_jne(2);
5296       }
5297       if((opcode[i]&0x2f)==5) // BNE
5298       {
5299         if(s2l>=0) emit_cmp(s1l,s2l);
5300         else emit_test(s1l,s1l);
5301         nottaken=(int)out;
5302         emit_jeq(2);
5303       }
5304       if((opcode[i]&0x2f)==6) // BLEZ
5305       {
5306         emit_cmpimm(s1l,1);
5307         nottaken=(int)out;
5308         emit_jge(2);
5309       }
5310       if((opcode[i]&0x2f)==7) // BGTZ
5311       {
5312         emit_cmpimm(s1l,1);
5313         nottaken=(int)out;
5314         emit_jl(2);
5315       }
5316     } // if(!unconditional)
5317     int adj;
5318     uint64_t ds_unneeded=branch_regs[i].u;
5319     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5320     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5321     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5322     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5323     ds_unneeded|=1;
5324     ds_unneeded_upper|=1;
5325     // branch taken
5326     if(!nop) {
5327       if(taken) set_jump_target(taken,(int)out);
5328       assem_debug("1:\n");
5329       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5330                     ds_unneeded,ds_unneeded_upper);
5331       // load regs
5332       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5333       address_generation(i+1,&branch_regs[i],0);
5334       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5335       ds_assemble(i+1,&branch_regs[i]);
5336       cc=get_reg(branch_regs[i].regmap,CCREG);
5337       if(cc==-1) {
5338         emit_loadreg(CCREG,cc=HOST_CCREG);
5339         // CHECK: Is the following instruction (fall thru) allocated ok?
5340       }
5341       assert(cc==HOST_CCREG);
5342       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5343       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5344       assem_debug("cycle count (adj)\n");
5345       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5346       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5347       if(internal)
5348         assem_debug("branch: internal\n");
5349       else
5350         assem_debug("branch: external\n");
5351       if(internal&&is_ds[(ba[i]-start)>>2]) {
5352         ds_assemble_entry(i);
5353       }
5354       else {
5355         add_to_linker((int)out,ba[i],internal);
5356         emit_jmp(0);
5357       }
5358     }
5359     // branch not taken
5360     cop1_usable=prev_cop1_usable;
5361     if(!unconditional) {
5362       if(nottaken1) set_jump_target(nottaken1,(int)out);
5363       set_jump_target(nottaken,(int)out);
5364       assem_debug("2:\n");
5365       if(!likely[i]) {
5366         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5367                       ds_unneeded,ds_unneeded_upper);
5368         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5369         address_generation(i+1,&branch_regs[i],0);
5370         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5371         ds_assemble(i+1,&branch_regs[i]);
5372       }
5373       cc=get_reg(branch_regs[i].regmap,CCREG);
5374       if(cc==-1&&!likely[i]) {
5375         // Cycle count isn't in a register, temporarily load it then write it out
5376         emit_loadreg(CCREG,HOST_CCREG);
5377         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5378         int jaddr=(int)out;
5379         emit_jns(0);
5380         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5381         emit_storereg(CCREG,HOST_CCREG);
5382       }
5383       else{
5384         cc=get_reg(i_regmap,CCREG);
5385         assert(cc==HOST_CCREG);
5386         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5387         int jaddr=(int)out;
5388         emit_jns(0);
5389         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5390       }
5391     }
5392   }
5393 }
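
/* Illustrative sketch (hypothetical helper, not called anywhere in the dynarec):
   the write-after-read test that cjump_assemble performs above before choosing
   out-of-order assembly.  If the delay slot writes a register that the branch
   compares, the delay slot must not run first, so the branch is assembled in
   order: condition test, then delay slot, then jump. */
static int example_can_assemble_ds_first(int b_rs1,int b_rs2,int ds_rt1,int ds_rt2)
{
  if(b_rs1&&(b_rs1==ds_rt1||b_rs1==ds_rt2)) return 0; // branch source clobbered by delay slot
  if(b_rs2&&(b_rs2==ds_rt1||b_rs2==ds_rt2)) return 0;
  return 1; // safe: delay slot can be emitted before the condition test
}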
5394
5395 void sjump_assemble(int i,struct regstat *i_regs)
5396 {
5397   signed char *i_regmap=i_regs->regmap;
5398   int cc;
5399   int match;
5400   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5401   assem_debug("smatch=%d\n",match);
5402   int s1h,s1l;
5403   int prev_cop1_usable=cop1_usable;
5404   int unconditional=0,nevertaken=0;
5405   int only32=0;
5406   int ooo=1;
5407   int invert=0;
5408   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5409   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5410   if(likely[i]) ooo=0;
5411   if(!match) invert=1;
5412   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5413   if(i>(ba[i]-start)>>2) invert=1;
5414   #endif
5415
5416   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5417   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5418
5419   if(ooo&&rs1[i]&&
5420      (rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5421   {
5422     // Write-after-read dependency prevents out of order execution
5423     // First test branch condition, then execute delay slot, then branch
5424     ooo=0;
5425   }
5426   // TODO: Conditional branches w/link must execute in-order so that
5427   // condition test and write to r31 occur before cycle count test
5428
5429   if(ooo) {
5430     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5431     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5432   }
5433   else {
5434     s1l=get_reg(i_regmap,rs1[i]);
5435     s1h=get_reg(i_regmap,rs1[i]|64);
5436   }
5437   if(rs1[i]==0)
5438   {
5439     if(opcode2[i]&1) unconditional=1;
5440     else nevertaken=1;
5441     // BLTZ variants with rs=r0 are never taken (r0 is never less than zero)
5442     //assert(opcode2[i]!=0);
5443     //assert(opcode2[i]!=2);
5444     //assert(opcode2[i]!=0x10);
5445     //assert(opcode2[i]!=0x12);
5446   }
5447   else {
5448     only32=(regs[i].was32>>rs1[i])&1;
5449   }
5450
5451   if(ooo) {
5452     // Out of order execution (delay slot first)
5453     //printf("OOOE\n");
5454     address_generation(i+1,i_regs,regs[i].regmap_entry);
5455     ds_assemble(i+1,i_regs);
5456     int adj;
5457     uint64_t bc_unneeded=branch_regs[i].u;
5458     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5459     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5460     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5461     bc_unneeded|=1;
5462     bc_unneeded_upper|=1;
5463     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5464                   bc_unneeded,bc_unneeded_upper);
5465     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5466     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5467     if(rt1[i]==31) {
5468       int rt,return_address;
5469       assert(rt1[i+1]!=31);
5470       assert(rt2[i+1]!=31);
5471       rt=get_reg(branch_regs[i].regmap,31);
5472       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5473       if(rt>=0) {
5474         // Save the PC even if the branch is not taken
5475         return_address=start+i*4+8;
5476         emit_movimm(return_address,rt); // PC into link register
5477         #ifdef IMM_PREFETCH
5478         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5479         #endif
5480       }
5481     }
5482     cc=get_reg(branch_regs[i].regmap,CCREG);
5483     assert(cc==HOST_CCREG);
5484     if(unconditional) 
5485       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5486     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5487     assem_debug("cycle count (adj)\n");
5488     if(unconditional) {
5489       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5490       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5491         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5492         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5493         if(internal)
5494           assem_debug("branch: internal\n");
5495         else
5496           assem_debug("branch: external\n");
5497         if(internal&&is_ds[(ba[i]-start)>>2]) {
5498           ds_assemble_entry(i);
5499         }
5500         else {
5501           add_to_linker((int)out,ba[i],internal);
5502           emit_jmp(0);
5503         }
5504         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5505         if(((u_int)out)&7) emit_addnop(0);
5506         #endif
5507       }
5508     }
5509     else if(nevertaken) {
5510       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5511       int jaddr=(int)out;
5512       emit_jns(0);
5513       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5514     }
5515     else {
5516       int nottaken=0;
5517       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5518       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5519       if(!only32)
5520       {
5521         assert(s1h>=0);
5522         if(opcode2[i]==0) // BLTZ
5523         {
5524           emit_test(s1h,s1h);
5525           if(invert){
5526             nottaken=(int)out;
5527             emit_jns(1);
5528           }else{
5529             add_to_linker((int)out,ba[i],internal);
5530             emit_js(0);
5531           }
5532         }
5533         if(opcode2[i]==1) // BGEZ
5534         {
5535           emit_test(s1h,s1h);
5536           if(invert){
5537             nottaken=(int)out;
5538             emit_js(1);
5539           }else{
5540             add_to_linker((int)out,ba[i],internal);
5541             emit_jns(0);
5542           }
5543         }
5544       } // if(!only32)
5545       else
5546       {
5547         assert(s1l>=0);
5548         if(opcode2[i]==0) // BLTZ
5549         {
5550           emit_test(s1l,s1l);
5551           if(invert){
5552             nottaken=(int)out;
5553             emit_jns(1);
5554           }else{
5555             add_to_linker((int)out,ba[i],internal);
5556             emit_js(0);
5557           }
5558         }
5559         if(opcode2[i]==1) // BGEZ
5560         {
5561           emit_test(s1l,s1l);
5562           if(invert){
5563             nottaken=(int)out;
5564             emit_js(1);
5565           }else{
5566             add_to_linker((int)out,ba[i],internal);
5567             emit_jns(0);
5568           }
5569         }
5570       } // if(!only32)
5571           
5572       if(invert) {
5573         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5574         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5575           if(adj) {
5576             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5577             add_to_linker((int)out,ba[i],internal);
5578           }else{
5579             emit_addnop(13);
5580             add_to_linker((int)out,ba[i],internal*2);
5581           }
5582           emit_jmp(0);
5583         }else
5584         #endif
5585         {
5586           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5587           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5588           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5589           if(internal)
5590             assem_debug("branch: internal\n");
5591           else
5592             assem_debug("branch: external\n");
5593           if(internal&&is_ds[(ba[i]-start)>>2]) {
5594             ds_assemble_entry(i);
5595           }
5596           else {
5597             add_to_linker((int)out,ba[i],internal);
5598             emit_jmp(0);
5599           }
5600         }
5601         set_jump_target(nottaken,(int)out);
5602       }
5603
5604       if(adj) {
5605         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5606       }
5607     } // (!unconditional)
5608   } // if(ooo)
5609   else
5610   {
5611     // In-order execution (branch first)
5612     //printf("IOE\n");
5613     int nottaken=0;
5614     if(!unconditional) {
5615       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5616       if(!only32)
5617       {
5618         assert(s1h>=0);
5619         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5620         {
5621           emit_test(s1h,s1h);
5622           nottaken=(int)out;
5623           emit_jns(1);
5624         }
5625         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5626         {
5627           emit_test(s1h,s1h);
5628           nottaken=(int)out;
5629           emit_js(1);
5630         }
5631       } // if(!only32)
5632       else
5633       {
5634         assert(s1l>=0);
5635         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5636         {
5637           emit_test(s1l,s1l);
5638           nottaken=(int)out;
5639           emit_jns(1);
5640         }
5641         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5642         {
5643           emit_test(s1l,s1l);
5644           nottaken=(int)out;
5645           emit_js(1);
5646         }
5647       }
5648     } // if(!unconditional)
5649     int adj;
5650     uint64_t ds_unneeded=branch_regs[i].u;
5651     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5652     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5653     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5654     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5655     ds_unneeded|=1;
5656     ds_unneeded_upper|=1;
5657     // branch taken
5658     if(!nevertaken) {
5659       //assem_debug("1:\n");
5660       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5661                     ds_unneeded,ds_unneeded_upper);
5662       // load regs
5663       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5664       address_generation(i+1,&branch_regs[i],0);
5665       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5666       ds_assemble(i+1,&branch_regs[i]);
5667       cc=get_reg(branch_regs[i].regmap,CCREG);
5668       if(cc==-1) {
5669         emit_loadreg(CCREG,cc=HOST_CCREG);
5670         // CHECK: Is the following instruction (fall thru) allocated ok?
5671       }
5672       assert(cc==HOST_CCREG);
5673       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5674       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5675       assem_debug("cycle count (adj)\n");
5676       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5677       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5678       if(internal)
5679         assem_debug("branch: internal\n");
5680       else
5681         assem_debug("branch: external\n");
5682       if(internal&&is_ds[(ba[i]-start)>>2]) {
5683         ds_assemble_entry(i);
5684       }
5685       else {
5686         add_to_linker((int)out,ba[i],internal);
5687         emit_jmp(0);
5688       }
5689     }
5690     // branch not taken
5691     cop1_usable=prev_cop1_usable;
5692     if(!unconditional) {
5693       set_jump_target(nottaken,(int)out);
5694       assem_debug("1:\n");
5695       if(!likely[i]) {
5696         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5697                       ds_unneeded,ds_unneeded_upper);
5698         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5699         address_generation(i+1,&branch_regs[i],0);
5700         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5701         ds_assemble(i+1,&branch_regs[i]);
5702       }
5703       cc=get_reg(branch_regs[i].regmap,CCREG);
5704       if(cc==-1&&!likely[i]) {
5705         // Cycle count isn't in a register, temporarily load it then write it out
5706         emit_loadreg(CCREG,HOST_CCREG);
5707         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5708         int jaddr=(int)out;
5709         emit_jns(0);
5710         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5711         emit_storereg(CCREG,HOST_CCREG);
5712       }
5713       else{
5714         cc=get_reg(i_regmap,CCREG);
5715         assert(cc==HOST_CCREG);
5716         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5717         int jaddr=(int)out;
5718         emit_jns(0);
5719         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5720       }
5721     }
5722   }
5723 }
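
/* Illustrative sketch (hypothetical, for reference only): how the REGIMM
   (opcode 1) branch variants handled above decode.  Bit 0 of opcode2 selects
   BGEZ (set) vs BLTZ (clear), bit 1 marks the "likely" forms and bit 4 the
   "and link" forms; the tests opcode2[i]&1 and (opcode2[i]&0x1d) above rely
   on this layout. */
static const char *example_regimm_name(int op2)
{
  switch(op2) {
    case 0x00: return "BLTZ";
    case 0x01: return "BGEZ";
    case 0x02: return "BLTZL";
    case 0x03: return "BGEZL";
    case 0x10: return "BLTZAL";
    case 0x11: return "BGEZAL";
    case 0x12: return "BLTZALL";
    case 0x13: return "BGEZALL";
    default:   return "?";
  }
}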
5724
5725 void fjump_assemble(int i,struct regstat *i_regs)
5726 {
5727   signed char *i_regmap=i_regs->regmap;
5728   int cc;
5729   int match;
5730   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5731   assem_debug("fmatch=%d\n",match);
5732   int fs,cs;
5733   int eaddr;
5734   int ooo=1;
5735   int invert=0;
5736   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5737   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5738   if(likely[i]) ooo=0;
5739   if(!match) invert=1;
5740   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5741   if(i>(ba[i]-start)>>2) invert=1;
5742   #endif
5743
5744   if(ooo&&itype[i+1]==FCOMP)
5745   {
5746     // The FP compare in the delay slot writes the condition flag that this
5747     // branch reads: a write-after-read dependency prevents out of order execution.
5748     // First test branch condition, then execute delay slot, then branch
5749     ooo=0;
5750   }
5751
5752   if(ooo) {
5753     fs=get_reg(branch_regs[i].regmap,FSREG);
5754     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5755   }
5756   else {
5757     fs=get_reg(i_regmap,FSREG);
5758   }
5759
5760   // Check cop1 unusable
5761   if(!cop1_usable) {
5762     cs=get_reg(i_regmap,CSREG);
5763     assert(cs>=0);
5764     emit_testimm(cs,0x20000000);
5765     eaddr=(int)out;
5766     emit_jeq(0);
5767     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5768     cop1_usable=1;
5769   }
5770
5771   if(ooo) {
5772     // Out of order execution (delay slot first)
5773     //printf("OOOE\n");
5774     ds_assemble(i+1,i_regs);
5775     int adj;
5776     uint64_t bc_unneeded=branch_regs[i].u;
5777     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5778     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5779     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5780     bc_unneeded|=1;
5781     bc_unneeded_upper|=1;
5782     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5783                   bc_unneeded,bc_unneeded_upper);
5784     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5785     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5786     cc=get_reg(branch_regs[i].regmap,CCREG);
5787     assert(cc==HOST_CCREG);
5788     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5789     assem_debug("cycle count (adj)\n");
5790     if(1) {
5791       int nottaken=0;
5792       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5793       if(1) {
5794         assert(fs>=0);
5795         emit_testimm(fs,0x800000);
5796         if(source[i]&0x10000) // BC1T
5797         {
5798           if(invert){
5799             nottaken=(int)out;
5800             emit_jeq(1);
5801           }else{
5802             add_to_linker((int)out,ba[i],internal);
5803             emit_jne(0);
5804           }
5805         }
5806         else // BC1F
5807         {
5808           if(invert){
5809             nottaken=(int)out;
5810             emit_jne(1);
5811           }else{
5812             add_to_linker((int)out,ba[i],internal);
5813             emit_jeq(0);
5814           }
5815         }
5816       } // if(!only32)
5817           
5818       if(invert) {
5819         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5820         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5821         else if(match) emit_addnop(13);
5822         #endif
5823         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5824         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5825         if(internal)
5826           assem_debug("branch: internal\n");
5827         else
5828           assem_debug("branch: external\n");
5829         if(internal&&is_ds[(ba[i]-start)>>2]) {
5830           ds_assemble_entry(i);
5831         }
5832         else {
5833           add_to_linker((int)out,ba[i],internal);
5834           emit_jmp(0);
5835         }
5836         set_jump_target(nottaken,(int)out);
5837       }
5838
5839       if(adj) {
5840         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5841       }
5842     } // (!unconditional)
5843   } // if(ooo)
5844   else
5845   {
5846     // In-order execution (branch first)
5847     //printf("IOE\n");
5848     int nottaken=0;
5849     if(1) {
5850       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5851       if(1) {
5852         assert(fs>=0);
5853         emit_testimm(fs,0x800000);
5854         if(source[i]&0x10000) // BC1T
5855         {
5856           nottaken=(int)out;
5857           emit_jeq(1);
5858         }
5859         else // BC1F
5860         {
5861           nottaken=(int)out;
5862           emit_jne(1);
5863         }
5864       }
5865     } // if(!unconditional)
5866     int adj;
5867     uint64_t ds_unneeded=branch_regs[i].u;
5868     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5869     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5870     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5871     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5872     ds_unneeded|=1;
5873     ds_unneeded_upper|=1;
5874     // branch taken
5875     //assem_debug("1:\n");
5876     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5877                   ds_unneeded,ds_unneeded_upper);
5878     // load regs
5879     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5880     address_generation(i+1,&branch_regs[i],0);
5881     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5882     ds_assemble(i+1,&branch_regs[i]);
5883     cc=get_reg(branch_regs[i].regmap,CCREG);
5884     if(cc==-1) {
5885       emit_loadreg(CCREG,cc=HOST_CCREG);
5886       // CHECK: Is the following instruction (fall thru) allocated ok?
5887     }
5888     assert(cc==HOST_CCREG);
5889     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5890     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5891     assem_debug("cycle count (adj)\n");
5892     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5893     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5894     if(internal)
5895       assem_debug("branch: internal\n");
5896     else
5897       assem_debug("branch: external\n");
5898     if(internal&&is_ds[(ba[i]-start)>>2]) {
5899       ds_assemble_entry(i);
5900     }
5901     else {
5902       add_to_linker((int)out,ba[i],internal);
5903       emit_jmp(0);
5904     }
5905
5906     // branch not taken
5907     if(1) { // <- FIXME (don't need this)
5908       set_jump_target(nottaken,(int)out);
5909       assem_debug("1:\n");
5910       if(!likely[i]) {
5911         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5912                       ds_unneeded,ds_unneeded_upper);
5913         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5914         address_generation(i+1,&branch_regs[i],0);
5915         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5916         ds_assemble(i+1,&branch_regs[i]);
5917       }
5918       cc=get_reg(branch_regs[i].regmap,CCREG);
5919       if(cc==-1&&!likely[i]) {
5920         // Cycle count isn't in a register, temporarily load it then write it out
5921         emit_loadreg(CCREG,HOST_CCREG);
5922         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5923         int jaddr=(int)out;
5924         emit_jns(0);
5925         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5926         emit_storereg(CCREG,HOST_CCREG);
5927       }
5928       else{
5929         cc=get_reg(i_regmap,CCREG);
5930         assert(cc==HOST_CCREG);
5931         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5932         int jaddr=(int)out;
5933         emit_jns(0);
5934         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5935       }
5936     }
5937   }
5938 }
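
/* Illustrative sketch (hypothetical helper): the COP1 branch test performed
   above.  emit_testimm(fs,0x800000) checks bit 23 of FCR31 (the FP condition
   flag), and bit 16 of the instruction word distinguishes BC1T from BC1F;
   bit 17 marks the "likely" variants decoded via source[i]&0x30000. */
static int example_bc1_taken(unsigned int fcr31,unsigned int insn)
{
  int cond=(fcr31>>23)&1;      // FP condition flag
  int want=(insn>>16)&1;       // 1 = BC1T, 0 = BC1F
  return cond==want;
}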
5939
5940 static void pagespan_assemble(int i,struct regstat *i_regs)
5941 {
5942   int s1l=get_reg(i_regs->regmap,rs1[i]);
5943   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5944   int s2l=get_reg(i_regs->regmap,rs2[i]);
5945   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5946   void *nt_branch=NULL;
5947   int taken=0;
5948   int nottaken=0;
5949   int unconditional=0;
5950   if(rs1[i]==0)
5951   {
5952     s1l=s2l;s1h=s2h;
5953     s2l=s2h=-1;
5954   }
5955   else if(rs2[i]==0)
5956   {
5957     s2l=s2h=-1;
5958   }
5959   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5960     s1h=s2h=-1;
5961   }
5962   int hr=0;
5963   int addr,alt,ntaddr;
5964   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5965   else {
5966     while(hr<HOST_REGS)
5967     {
5968       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5969          (i_regs->regmap[hr]&63)!=rs1[i] &&
5970          (i_regs->regmap[hr]&63)!=rs2[i] )
5971       {
5972         addr=hr++;break;
5973       }
5974       hr++;
5975     }
5976   }
5977   while(hr<HOST_REGS)
5978   {
5979     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5980        (i_regs->regmap[hr]&63)!=rs1[i] &&
5981        (i_regs->regmap[hr]&63)!=rs2[i] )
5982     {
5983       alt=hr++;break;
5984     }
5985     hr++;
5986   }
5987   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5988   {
5989     while(hr<HOST_REGS)
5990     {
5991       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5992          (i_regs->regmap[hr]&63)!=rs1[i] &&
5993          (i_regs->regmap[hr]&63)!=rs2[i] )
5994       {
5995         ntaddr=hr;break;
5996       }
5997       hr++;
5998     }
5999   }
6000   assert(hr<HOST_REGS);
6001   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6002     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6003   }
6004   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6005   if(opcode[i]==2) // J
6006   {
6007     unconditional=1;
6008   }
6009   if(opcode[i]==3) // JAL
6010   {
6011     // TODO: mini_ht
6012     int rt=get_reg(i_regs->regmap,31);
6013     emit_movimm(start+i*4+8,rt);
6014     unconditional=1;
6015   }
6016   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6017   {
6018     emit_mov(s1l,addr);
6019     if(opcode2[i]==9) // JALR
6020     {
6021       int rt=get_reg(i_regs->regmap,31);
6022       emit_movimm(start+i*4+8,rt);
6023     }
6024   }
6025   if((opcode[i]&0x3f)==4) // BEQ
6026   {
6027     if(rs1[i]==rs2[i])
6028     {
6029       unconditional=1;
6030     }
6031     else
6032     #ifdef HAVE_CMOV_IMM
6033     if(s1h<0) {
6034       if(s2l>=0) emit_cmp(s1l,s2l);
6035       else emit_test(s1l,s1l);
6036       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6037     }
6038     else
6039     #endif
6040     {
6041       assert(s1l>=0);
6042       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6043       if(s1h>=0) {
6044         if(s2h>=0) emit_cmp(s1h,s2h);
6045         else emit_test(s1h,s1h);
6046         emit_cmovne_reg(alt,addr);
6047       }
6048       if(s2l>=0) emit_cmp(s1l,s2l);
6049       else emit_test(s1l,s1l);
6050       emit_cmovne_reg(alt,addr);
6051     }
6052   }
6053   if((opcode[i]&0x3f)==5) // BNE
6054   {
6055     #ifdef HAVE_CMOV_IMM
6056     if(s1h<0) {
6057       if(s2l>=0) emit_cmp(s1l,s2l);
6058       else emit_test(s1l,s1l);
6059       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6060     }
6061     else
6062     #endif
6063     {
6064       assert(s1l>=0);
6065       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6066       if(s1h>=0) {
6067         if(s2h>=0) emit_cmp(s1h,s2h);
6068         else emit_test(s1h,s1h);
6069         emit_cmovne_reg(alt,addr);
6070       }
6071       if(s2l>=0) emit_cmp(s1l,s2l);
6072       else emit_test(s1l,s1l);
6073       emit_cmovne_reg(alt,addr);
6074     }
6075   }
6076   if((opcode[i]&0x3f)==0x14) // BEQL
6077   {
6078     if(s1h>=0) {
6079       if(s2h>=0) emit_cmp(s1h,s2h);
6080       else emit_test(s1h,s1h);
6081       nottaken=(int)out;
6082       emit_jne(0);
6083     }
6084     if(s2l>=0) emit_cmp(s1l,s2l);
6085     else emit_test(s1l,s1l);
6086     if(nottaken) set_jump_target(nottaken,(int)out);
6087     nottaken=(int)out;
6088     emit_jne(0);
6089   }
6090   if((opcode[i]&0x3f)==0x15) // BNEL
6091   {
6092     if(s1h>=0) {
6093       if(s2h>=0) emit_cmp(s1h,s2h);
6094       else emit_test(s1h,s1h);
6095       taken=(int)out;
6096       emit_jne(0);
6097     }
6098     if(s2l>=0) emit_cmp(s1l,s2l);
6099     else emit_test(s1l,s1l);
6100     nottaken=(int)out;
6101     emit_jeq(0);
6102     if(taken) set_jump_target(taken,(int)out);
6103   }
6104   if((opcode[i]&0x3f)==6) // BLEZ
6105   {
6106     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6107     emit_cmpimm(s1l,1);
6108     if(s1h>=0) emit_mov(addr,ntaddr);
6109     emit_cmovl_reg(alt,addr);
6110     if(s1h>=0) {
6111       emit_test(s1h,s1h);
6112       emit_cmovne_reg(ntaddr,addr);
6113       emit_cmovs_reg(alt,addr);
6114     }
6115   }
6116   if((opcode[i]&0x3f)==7) // BGTZ
6117   {
6118     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6119     emit_cmpimm(s1l,1);
6120     if(s1h>=0) emit_mov(addr,alt);
6121     emit_cmovl_reg(ntaddr,addr);
6122     if(s1h>=0) {
6123       emit_test(s1h,s1h);
6124       emit_cmovne_reg(alt,addr);
6125       emit_cmovs_reg(ntaddr,addr);
6126     }
6127   }
6128   if((opcode[i]&0x3f)==0x16) // BLEZL
6129   {
6130     assert((opcode[i]&0x3f)!=0x16);
6131   }
6132   if((opcode[i]&0x3f)==0x17) // BGTZL
6133   {
6134     assert((opcode[i]&0x3f)!=0x17);
6135   }
6136   assert(opcode[i]!=1); // BLTZ/BGEZ
6137
6138   //FIXME: Check CSREG
6139   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6140     if((source[i]&0x30000)==0) // BC1F
6141     {
6142       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6143       emit_testimm(s1l,0x800000);
6144       emit_cmovne_reg(alt,addr);
6145     }
6146     if((source[i]&0x30000)==0x10000) // BC1T
6147     {
6148       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6149       emit_testimm(s1l,0x800000);
6150       emit_cmovne_reg(alt,addr);
6151     }
6152     if((source[i]&0x30000)==0x20000) // BC1FL
6153     {
6154       emit_testimm(s1l,0x800000);
6155       nottaken=(int)out;
6156       emit_jne(0);
6157     }
6158     if((source[i]&0x30000)==0x30000) // BC1TL
6159     {
6160       emit_testimm(s1l,0x800000);
6161       nottaken=(int)out;
6162       emit_jeq(0);
6163     }
6164   }
6165
6166   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6167   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6168   if(likely[i]||unconditional)
6169   {
6170     emit_movimm(ba[i],HOST_BTREG);
6171   }
6172   else if(addr!=HOST_BTREG)
6173   {
6174     emit_mov(addr,HOST_BTREG);
6175   }
6176   void *branch_addr=out;
6177   emit_jmp(0);
6178   int target_addr=start+i*4+5;
6179   void *stub=out;
6180   void *compiled_target_addr=check_addr(target_addr);
6181   emit_extjump_ds((int)branch_addr,target_addr);
6182   if(compiled_target_addr) {
6183     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6184     add_link(target_addr,stub);
6185   }
6186   else set_jump_target((int)branch_addr,(int)stub);
6187   if(likely[i]) {
6188     // Not-taken path
6189     set_jump_target((int)nottaken,(int)out);
6190     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6191     void *branch_addr=out;
6192     emit_jmp(0);
6193     int target_addr=start+i*4+8;
6194     void *stub=out;
6195     void *compiled_target_addr=check_addr(target_addr);
6196     emit_extjump_ds((int)branch_addr,target_addr);
6197     if(compiled_target_addr) {
6198       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6199       add_link(target_addr,stub);
6200     }
6201     else set_jump_target((int)branch_addr,(int)stub);
6202   }
6203 }
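
/* Illustrative sketch (hypothetical, plain C analogue): what the cmov
   sequences above compute.  Because the delay slot lies on the next page,
   pagespan_assemble cannot branch around it; instead it materialises both
   possible continuation addresses and conditionally selects one, which is
   then carried in HOST_BTREG.  For BEQ the selection is equivalent to: */
static unsigned int example_pagespan_beq_target(unsigned int pc,unsigned int target,
                                                unsigned int rs_val,unsigned int rt_val)
{
  unsigned int taken=target;     // ba[i]
  unsigned int nottaken=pc+8;    // fall through past the branch and its delay slot
  return (rs_val==rt_val)?taken:nottaken; // done with emit_cmovne_reg on the host
}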
6204
6205 // Assemble the delay slot for the above
6206 static void pagespan_ds()
6207 {
6208   assem_debug("initial delay slot:\n");
6209   u_int vaddr=start+1;
6210   u_int page=get_page(vaddr);
6211   u_int vpage=get_vpage(vaddr);
6212   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6213   do_dirty_stub_ds();
6214   ll_add(jump_in+page,vaddr,(void *)out);
6215   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6216   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6217     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6218   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6219     emit_writeword(HOST_BTREG,(int)&branch_target);
6220   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6221   address_generation(0,&regs[0],regs[0].regmap_entry);
6222   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6223     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6224   cop1_usable=0;
6225   is_delayslot=0;
6226   switch(itype[0]) {
6227     case ALU:
6228       alu_assemble(0,&regs[0]);break;
6229     case IMM16:
6230       imm16_assemble(0,&regs[0]);break;
6231     case SHIFT:
6232       shift_assemble(0,&regs[0]);break;
6233     case SHIFTIMM:
6234       shiftimm_assemble(0,&regs[0]);break;
6235     case LOAD:
6236       load_assemble(0,&regs[0]);break;
6237     case LOADLR:
6238       loadlr_assemble(0,&regs[0]);break;
6239     case STORE:
6240       store_assemble(0,&regs[0]);break;
6241     case STORELR:
6242       storelr_assemble(0,&regs[0]);break;
6243     case COP0:
6244       cop0_assemble(0,&regs[0]);break;
6245     case COP1:
6246       cop1_assemble(0,&regs[0]);break;
6247     case C1LS:
6248       c1ls_assemble(0,&regs[0]);break;
6249     case FCONV:
6250       fconv_assemble(0,&regs[0]);break;
6251     case FLOAT:
6252       float_assemble(0,&regs[0]);break;
6253     case FCOMP:
6254       fcomp_assemble(0,&regs[0]);break;
6255     case MULTDIV:
6256       multdiv_assemble(0,&regs[0]);break;
6257     case MOV:
6258       mov_assemble(0,&regs[0]);break;
6259     case SYSCALL:
6260     case SPAN:
6261     case UJUMP:
6262     case RJUMP:
6263     case CJUMP:
6264     case SJUMP:
6265     case FJUMP:
6266       printf("Jump in the delay slot.  This is probably a bug.\n");
6267   }
6268   int btaddr=get_reg(regs[0].regmap,BTREG);
6269   if(btaddr<0) {
6270     btaddr=get_reg(regs[0].regmap,-1);
6271     emit_readword((int)&branch_target,btaddr);
6272   }
6273   assert(btaddr!=HOST_CCREG);
6274   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6275 #ifdef HOST_IMM8
6276   emit_movimm(start+4,HOST_TEMPREG);
6277   emit_cmp(btaddr,HOST_TEMPREG);
6278 #else
6279   emit_cmpimm(btaddr,start+4);
6280 #endif
6281   int branch=(int)out;
6282   emit_jeq(0);
6283   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6284   emit_jmp(jump_vaddr_reg[btaddr]);
6285   set_jump_target(branch,(int)out);
6286   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6287   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6288 }
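
/* Illustrative sketch (hypothetical): the dispatch that pagespan_ds emits
   after the delay slot.  The branch target saved via BTREG/branch_target is
   compared against start+4 (the straight-line continuation); on a match the
   generated code skips the indirect jump and falls into the code compiled
   for start+4, otherwise it goes through jump_vaddr_reg to look up the
   target. */
static int example_pagespan_dispatch(unsigned int branch_target,unsigned int block_start)
{
  if(branch_target==block_start+4)
    return 1;  // fall through into the compiled continuation
  return 0;    // take the indirect path to resolve the target address
}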
6289
6290 // Basic liveness analysis for MIPS registers
6291 void unneeded_registers(int istart,int iend,int r)
6292 {
6293   int i;
6294   uint64_t u,uu,b,bu;
6295   uint64_t temp_u,temp_uu;
6296   uint64_t tdep;
6297   if(iend==slen-1) {
6298     u=1;uu=1;
6299   }else{
6300     u=unneeded_reg[iend+1];
6301     uu=unneeded_reg_upper[iend+1];
6302     u=1;uu=1;
6303   }
6304   for (i=iend;i>=istart;i--)
6305   {
6306     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6307     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6308     {
6309       // If subroutine call, flag return address as a possible branch target
6310       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6311       
6312       if(ba[i]<start || ba[i]>=(start+slen*4))
6313       {
6314         // Branch out of this block, flush all regs
6315         u=1;
6316         uu=1;
6317         /* Hexagon hack 
6318         if(itype[i]==UJUMP&&rt1[i]==31)
6319         {
6320           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6321         }
6322         if(itype[i]==RJUMP&&rs1[i]==31)
6323         {
6324           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6325         }
6326         if(start>0x80000400&&start<0x80800000) {
6327           if(itype[i]==UJUMP&&rt1[i]==31)
6328           {
6329             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6330             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6331           }
6332           if(itype[i]==RJUMP&&rs1[i]==31)
6333           {
6334             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6335             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6336           }
6337         }*/
6338         branch_unneeded_reg[i]=u;
6339         branch_unneeded_reg_upper[i]=uu;
6340         // Merge in delay slot
6341         tdep=(~uu>>rt1[i+1])&1;
6342         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6343         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6344         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6345         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6346         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6347         u|=1;uu|=1;
6348         // If branch is "likely" (and conditional)
6349         // then we skip the delay slot on the fall-thru path
6350         if(likely[i]) {
6351           if(i<slen-1) {
6352             u&=unneeded_reg[i+2];
6353             uu&=unneeded_reg_upper[i+2];
6354           }
6355           else
6356           {
6357             u=1;
6358             uu=1;
6359           }
6360         }
6361       }
6362       else
6363       {
6364         // Internal branch, flag target
6365         bt[(ba[i]-start)>>2]=1;
6366         if(ba[i]<=start+i*4) {
6367           // Backward branch
6368           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6369           {
6370             // Unconditional branch
6371             temp_u=1;temp_uu=1;
6372           } else {
6373             // Conditional branch (not taken case)
6374             temp_u=unneeded_reg[i+2];
6375             temp_uu=unneeded_reg_upper[i+2];
6376           }
6377           // Merge in delay slot
6378           tdep=(~temp_uu>>rt1[i+1])&1;
6379           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6380           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6381           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6382           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6383           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6384           temp_u|=1;temp_uu|=1;
6385           // If branch is "likely" (and conditional)
6386           // then we skip the delay slot on the fall-thru path
6387           if(likely[i]) {
6388             if(i<slen-1) {
6389               temp_u&=unneeded_reg[i+2];
6390               temp_uu&=unneeded_reg_upper[i+2];
6391             }
6392             else
6393             {
6394               temp_u=1;
6395               temp_uu=1;
6396             }
6397           }
6398           tdep=(~temp_uu>>rt1[i])&1;
6399           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6400           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6401           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6402           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6403           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6404           temp_u|=1;temp_uu|=1;
6405           unneeded_reg[i]=temp_u;
6406           unneeded_reg_upper[i]=temp_uu;
6407           // Only go three levels deep.  This recursion can take an
6408           // excessive amount of time if there are a lot of nested loops.
6409           if(r<2) {
6410             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6411           }else{
6412             unneeded_reg[(ba[i]-start)>>2]=1;
6413             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6414           }
6415         } /*else*/ if(1) {
6416           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6417           {
6418             // Unconditional branch
6419             u=unneeded_reg[(ba[i]-start)>>2];
6420             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6421             branch_unneeded_reg[i]=u;
6422             branch_unneeded_reg_upper[i]=uu;
6423         //u=1;
6424         //uu=1;
6425         //branch_unneeded_reg[i]=u;
6426         //branch_unneeded_reg_upper[i]=uu;
6427             // Merge in delay slot
6428             tdep=(~uu>>rt1[i+1])&1;
6429             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6430             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6431             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6432             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6433             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6434             u|=1;uu|=1;
6435           } else {
6436             // Conditional branch
6437             b=unneeded_reg[(ba[i]-start)>>2];
6438             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6439             branch_unneeded_reg[i]=b;
6440             branch_unneeded_reg_upper[i]=bu;
6441         //b=1;
6442         //bu=1;
6443         //branch_unneeded_reg[i]=b;
6444         //branch_unneeded_reg_upper[i]=bu;
6445             // Branch delay slot
6446             tdep=(~uu>>rt1[i+1])&1;
6447             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6448             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6449             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6450             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6451             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6452             b|=1;bu|=1;
6453             // If branch is "likely" then we skip the
6454             // delay slot on the fall-thru path
6455             if(likely[i]) {
6456               u=b;
6457               uu=bu;
6458               if(i<slen-1) {
6459                 u&=unneeded_reg[i+2];
6460                 uu&=unneeded_reg_upper[i+2];
6461         //u=1;
6462         //uu=1;
6463               }
6464             } else {
6465               u&=b;
6466               uu&=bu;
6467         //u=1;
6468         //uu=1;
6469             }
6470             if(i<slen-1) {
6471               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6472               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6473         //branch_unneeded_reg[i]=1;
6474         //branch_unneeded_reg_upper[i]=1;
6475             } else {
6476               branch_unneeded_reg[i]=1;
6477               branch_unneeded_reg_upper[i]=1;
6478             }
6479           }
6480         }
6481       }
6482     }
6483     else if(itype[i]==SYSCALL)
6484     {
6485       // SYSCALL instruction (software interrupt)
6486       u=1;
6487       uu=1;
6488     }
6489     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6490     {
6491       // ERET instruction (return from interrupt)
6492       u=1;
6493       uu=1;
6494     }
6495     //u=uu=1; // DEBUG
6496     tdep=(~uu>>rt1[i])&1;
6497     // Written registers are unneeded
6498     u|=1LL<<rt1[i];
6499     u|=1LL<<rt2[i];
6500     uu|=1LL<<rt1[i];
6501     uu|=1LL<<rt2[i];
6502     // Accessed registers are needed
6503     u&=~(1LL<<rs1[i]);
6504     u&=~(1LL<<rs2[i]);
6505     uu&=~(1LL<<us1[i]);
6506     uu&=~(1LL<<us2[i]);
6507     // Source-target dependencies
6508     uu&=~(tdep<<dep1[i]);
6509     uu&=~(tdep<<dep2[i]);
6510     // R0 is always unneeded
6511     u|=1;uu|=1;
6512     // Save it
6513     unneeded_reg[i]=u;
6514     unneeded_reg_upper[i]=uu;
6515 #ifdef FORCE32
6516     unneeded_reg_upper[i]=-1LL;
6517 #endif
6518     /*
6519     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6520     printf("U:");
6521     int r;
6522     for(r=1;r<=CCREG;r++) {
6523       if((unneeded_reg[i]>>r)&1) {
6524         if(r==HIREG) printf(" HI");
6525         else if(r==LOREG) printf(" LO");
6526         else printf(" r%d",r);
6527       }
6528     }
6529     printf(" UU:");
6530     for(r=1;r<=CCREG;r++) {
6531       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6532         if(r==HIREG) printf(" HI");
6533         else if(r==LOREG) printf(" LO");
6534         else printf(" r%d",r);
6535       }
6536     }
6537     printf("\n");*/
6538   }
6539 }
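
/* Illustrative sketch (hypothetical helper): the per-instruction step of the
   backwards liveness scan above, shown for the lower-half mask only.
   Destination registers join the unneeded set, source registers are removed
   from it, and bit 0 (r0) is always unneeded. */
static uint64_t example_unneeded_step(uint64_t u,int rt1_,int rt2_,int rs1_,int rs2_)
{
  u|=1LL<<rt1_;        // written registers are unneeded before this point
  u|=1LL<<rt2_;
  u&=~(1LL<<rs1_);     // accessed registers are needed
  u&=~(1LL<<rs2_);
  u|=1;                // r0 is always unneeded
  return u;
}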
6540
6541 // Identify registers which are likely to contain 32-bit values
6542 // This is used to predict whether any branches will jump to a
6543 // location with 64-bit values in registers.
6544 static void provisional_32bit()
6545 {
6546   int i,j;
6547   uint64_t is32=1;
6548   uint64_t lastbranch=1;
6549   
6550   for(i=0;i<slen;i++)
6551   {
6552     if(i>0) {
6553       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6554         if(i>1) is32=lastbranch;
6555         else is32=1;
6556       }
6557     }
6558     if(i>1)
6559     {
6560       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6561         if(likely[i-2]) {
6562           if(i>2) is32=lastbranch;
6563           else is32=1;
6564         }
6565       }
6566       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6567       {
6568         if(rs1[i-2]==0||rs2[i-2]==0)
6569         {
6570           if(rs1[i-2]) {
6571             is32|=1LL<<rs1[i-2];
6572           }
6573           if(rs2[i-2]) {
6574             is32|=1LL<<rs2[i-2];
6575           }
6576         }
6577       }
6578     }
6579     // If something jumps here with 64-bit values
6580     // then promote those registers to 64 bits
6581     if(bt[i])
6582     {
6583       uint64_t temp_is32=is32;
6584       for(j=i-1;j>=0;j--)
6585       {
6586         if(ba[j]==start+i*4) 
6587           //temp_is32&=branch_regs[j].is32;
6588           temp_is32&=p32[j];
6589       }
6590       for(j=i;j<slen;j++)
6591       {
6592         if(ba[j]==start+i*4) 
6593           temp_is32=1;
6594       }
6595       is32=temp_is32;
6596     }
6597     int type=itype[i];
6598     int op=opcode[i];
6599     int op2=opcode2[i];
6600     int rt=rt1[i];
6601     int s1=rs1[i];
6602     int s2=rs2[i];
6603     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6604       // Branches don't write registers, so consider the delay slot instead.
6605       type=itype[i+1];
6606       op=opcode[i+1];
6607       op2=opcode2[i+1];
6608       rt=rt1[i+1];
6609       s1=rs1[i+1];
6610       s2=rs2[i+1];
6611       lastbranch=is32;
6612     }
6613     switch(type) {
6614       case LOAD:
6615         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6616            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6617           is32&=~(1LL<<rt);
6618         else
6619           is32|=1LL<<rt;
6620         break;
6621       case STORE:
6622       case STORELR:
6623         break;
6624       case LOADLR:
6625         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6626         if(op==0x22) is32|=1LL<<rt; // LWL
6627         break;
6628       case IMM16:
6629         if (op==0x08||op==0x09|| // ADDI/ADDIU
6630             op==0x0a||op==0x0b|| // SLTI/SLTIU
6631             op==0x0c|| // ANDI
6632             op==0x0f)  // LUI
6633         {
6634           is32|=1LL<<rt;
6635         }
6636         if(op==0x18||op==0x19) { // DADDI/DADDIU
6637           is32&=~(1LL<<rt);
6638           //if(imm[i]==0)
6639           //  is32|=((is32>>s1)&1LL)<<rt;
6640         }
6641         if(op==0x0d||op==0x0e) { // ORI/XORI
6642           uint64_t sr=((is32>>s1)&1LL);
6643           is32&=~(1LL<<rt);
6644           is32|=sr<<rt;
6645         }
6646         break;
6647       case UJUMP:
6648         break;
6649       case RJUMP:
6650         break;
6651       case CJUMP:
6652         break;
6653       case SJUMP:
6654         break;
6655       case FJUMP:
6656         break;
6657       case ALU:
6658         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6659           is32|=1LL<<rt;
6660         }
6661         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6662           is32|=1LL<<rt;
6663         }
6664         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6665           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6666           is32&=~(1LL<<rt);
6667           is32|=sr<<rt;
6668         }
6669         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6670           if(s1==0&&s2==0) {
6671             is32|=1LL<<rt;
6672           }
6673           else if(s2==0) {
6674             uint64_t sr=((is32>>s1)&1LL);
6675             is32&=~(1LL<<rt);
6676             is32|=sr<<rt;
6677           }
6678           else if(s1==0) {
6679             uint64_t sr=((is32>>s2)&1LL);
6680             is32&=~(1LL<<rt);
6681             is32|=sr<<rt;
6682           }
6683           else {
6684             is32&=~(1LL<<rt);
6685           }
6686         }
6687         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6688           if(s1==0&&s2==0) {
6689             is32|=1LL<<rt;
6690           }
6691           else if(s2==0) {
6692             uint64_t sr=((is32>>s1)&1LL);
6693             is32&=~(1LL<<rt);
6694             is32|=sr<<rt;
6695           }
6696           else {
6697             is32&=~(1LL<<rt);
6698           }
6699         }
6700         break;
6701       case MULTDIV:
6702         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6703           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6704         }
6705         else {
6706           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6707         }
6708         break;
6709       case MOV:
6710         {
6711           uint64_t sr=((is32>>s1)&1LL);
6712           is32&=~(1LL<<rt);
6713           is32|=sr<<rt;
6714         }
6715         break;
6716       case SHIFT:
6717         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6718         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6719         break;
6720       case SHIFTIMM:
6721         is32|=1LL<<rt;
6722         // DSLL/DSRL/DSRA/DSLL32/DSRL32 (but not DSRA32) produce a 64-bit result
6723         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6724         break;
6725       case COP0:
6726         if(op2==0) is32|=1LL<<rt; // MFC0
6727         break;
6728       case COP1:
6729         if(op2==0) is32|=1LL<<rt; // MFC1
6730         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6731         if(op2==2) is32|=1LL<<rt; // CFC1
6732         break;
6733       case C1LS:
6734         break;
6735       case FLOAT:
6736       case FCONV:
6737         break;
6738       case FCOMP:
6739         break;
6740       case SYSCALL:
6741         break;
6742       default:
6743         break;
6744     }
6745     is32|=1;
6746     p32[i]=is32;
6747
6748     if(i>0)
6749     {
6750       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6751       {
6752         if(rt1[i-1]==31) // JAL/JALR
6753         {
6754           // Subroutine call will return here, don't alloc any registers
6755           is32=1;
6756         }
6757         else if(i+1<slen)
6758         {
6759           // Internal branch will jump here, match registers to caller
6760           is32=0x3FFFFFFFFLL;
6761         }
6762       }
6763     }
6764   }
6765 }
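
/* Illustrative sketch (not compiled in): the propagation rule used above for
   two-source logical ops (AND/OR/XOR/NOR).  A bit set in 'is32' means the
   register is predicted to hold a (sign-extended) 32-bit value; the result
   can only be predicted 32-bit if both sources are.  The helper name is
   hypothetical. */
#if 0
static uint64_t example_is32_logic(uint64_t is32, int rt, int s1, int s2)
{
  uint64_t sr = (is32 >> s1) & (is32 >> s2) & 1LL;
  is32 &= ~(1LL << rt);  /* drop the old prediction for the destination */
  is32 |= sr << rt;      /* destination is 32-bit iff both sources were */
  return is32 | 1;       /* r0 is always 32-bit */
}
#endif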
6766
6767 // Identify registers which may be assumed to contain 32-bit values
6768 // and where optimizations will rely on this.
6769 // This is used to determine whether backward branches can safely
6770 // jump to a location with 64-bit values in registers.
6771 static void provisional_r32()
6772 {
6773   u_int r32=0;
6774   int i;
6775   
6776   for (i=slen-1;i>=0;i--)
6777   {
6778     int hr;
6779     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6780     {
6781       if(ba[i]<start || ba[i]>=(start+slen*4))
6782       {
6783         // Branch out of this block, don't need anything
6784         r32=0;
6785       }
6786       else
6787       {
6788         // Internal branch
6789         // Need whatever matches the target
6790         // (and doesn't get overwritten by the delay slot instruction)
6791         r32=0;
6792         int t=(ba[i]-start)>>2;
6793         if(ba[i]>start+i*4) {
6794           // Forward branch
6795           //if(!(requires_32bit[t]&~regs[i].was32))
6796           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6797           if(!(pr32[t]&~regs[i].was32))
6798             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6799         }else{
6800           // Backward branch
6801           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6802             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6803         }
6804       }
6805       // Conditional branch may need registers for following instructions
6806       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6807       {
6808         if(i<slen-2) {
6809           //r32|=requires_32bit[i+2];
6810           r32|=pr32[i+2];
6811           r32&=regs[i].was32;
6812           // Mark this address as a branch target since it may be called
6813           // upon return from interrupt
6814           //bt[i+2]=1;
6815         }
6816       }
6817       // Merge in delay slot
6818       if(!likely[i]) {
6819         // These are overwritten unless the branch is "likely"
6820         // and the delay slot is nullified if not taken
6821         r32&=~(1LL<<rt1[i+1]);
6822         r32&=~(1LL<<rt2[i+1]);
6823       }
6824       // Assume these are needed (delay slot)
6825       if(us1[i+1]>0)
6826       {
6827         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6828       }
6829       if(us2[i+1]>0)
6830       {
6831         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6832       }
6833       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6834       {
6835         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6836       }
6837       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6838       {
6839         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6840       }
6841     }
6842     else if(itype[i]==SYSCALL)
6843     {
6844       // SYSCALL instruction (software interrupt)
6845       r32=0;
6846     }
6847     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6848     {
6849       // ERET instruction (return from interrupt)
6850       r32=0;
6851     }
6852     // Update 32-bit requirements for this instruction's own registers
6853     r32&=~(1LL<<rt1[i]);
6854     r32&=~(1LL<<rt2[i]);
6855     if(us1[i]>0)
6856     {
6857       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6858     }
6859     if(us2[i]>0)
6860     {
6861       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6862     }
6863     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6864     {
6865       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6866     }
6867     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6868     {
6869       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6870     }
6871     //requires_32bit[i]=r32;
6872     pr32[i]=r32;
6873     
6874     // Dirty registers which are 32-bit require 32-bit input,
6875     // as they will be written back as 32-bit values
6876     for(hr=0;hr<HOST_REGS;hr++)
6877     {
6878       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6879         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6880           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6881             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6882           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6883         }
6884       }
6885     }
6886   }
6887 }
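
/* Illustrative sketch (not compiled in): the per-instruction step of the
   backward scan above, reduced to one written register and two 64-bit
   sources.  'r32' collects registers that must actually hold 32-bit values
   on entry: a register written here drops the requirement, while a 64-bit
   source that the allocator currently tracks as 32-bit (bit set in 'was32')
   adds one.  Names and the simplification are hypothetical. */
#if 0
static uint64_t example_r32_step(uint64_t r32, uint64_t was32,
                                 int rt, int us1, int us2)
{
  r32 &= ~(1LL << rt);  /* written here: requirement dropped */
  if(us1 > 0 && ((was32 >> us1) & 1)) r32 |= 1LL << us1;
  if(us2 > 0 && ((was32 >> us2) & 1)) r32 |= 1LL << us2;
  return r32;
}
#endif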
6888
6889 // Write back dirty registers as soon as we will no longer modify them,
6890 // so that we don't end up with lots of writes at the branches.
6891 void clean_registers(int istart,int iend,int wr)
6892 {
6893   int i;
6894   int r;
6895   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6896   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6897   if(iend==slen-1) {
6898     will_dirty_i=will_dirty_next=0;
6899     wont_dirty_i=wont_dirty_next=0;
6900   }else{
6901     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6902     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6903   }
6904   for (i=iend;i>=istart;i--)
6905   {
6906     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6907     {
6908       if(ba[i]<start || ba[i]>=(start+slen*4))
6909       {
6910         // Branch out of this block, flush all regs
6911         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6912         {
6913           // Unconditional branch
6914           will_dirty_i=0;
6915           wont_dirty_i=0;
6916           // Merge in delay slot (will dirty)
6917           for(r=0;r<HOST_REGS;r++) {
6918             if(r!=EXCLUDE_REG) {
6919               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6920               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6921               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6922               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6923               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6924               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6925               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6926               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6927               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6928               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6929               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6930               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6931               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6932               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6933             }
6934           }
6935         }
6936         else
6937         {
6938           // Conditional branch
6939           will_dirty_i=0;
6940           wont_dirty_i=wont_dirty_next;
6941           // Merge in delay slot (will dirty)
6942           for(r=0;r<HOST_REGS;r++) {
6943             if(r!=EXCLUDE_REG) {
6944               if(!likely[i]) {
6945                 // Might not dirty if likely branch is not taken
6946                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6947                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6948                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6949                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6950                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6951                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6952                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6953                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6954                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6955                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6956                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6957                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6958                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6959                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6960               }
6961             }
6962           }
6963         }
6964         // Merge in delay slot (wont dirty)
6965         for(r=0;r<HOST_REGS;r++) {
6966           if(r!=EXCLUDE_REG) {
6967             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6968             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6969             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6970             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6971             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6972             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6973             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6974             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6975             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6976             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6977           }
6978         }
6979         if(wr) {
6980           #ifndef DESTRUCTIVE_WRITEBACK
6981           branch_regs[i].dirty&=wont_dirty_i;
6982           #endif
6983           branch_regs[i].dirty|=will_dirty_i;
6984         }
6985       }
6986       else
6987       {
6988         // Internal branch
6989         if(ba[i]<=start+i*4) {
6990           // Backward branch
6991           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6992           {
6993             // Unconditional branch
6994             temp_will_dirty=0;
6995             temp_wont_dirty=0;
6996             // Merge in delay slot (will dirty)
6997             for(r=0;r<HOST_REGS;r++) {
6998               if(r!=EXCLUDE_REG) {
6999                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7000                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7001                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7002                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7003                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7004                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7005                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7006                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7007                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7008                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7009                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7010                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7011                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7012                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7013               }
7014             }
7015           } else {
7016             // Conditional branch (not taken case)
7017             temp_will_dirty=will_dirty_next;
7018             temp_wont_dirty=wont_dirty_next;
7019             // Merge in delay slot (will dirty)
7020             for(r=0;r<HOST_REGS;r++) {
7021               if(r!=EXCLUDE_REG) {
7022                 if(!likely[i]) {
7023                   // Will not dirty if likely branch is not taken
7024                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7025                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7026                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7027                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7028                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7029                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7030                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7031                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7032                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7033                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7034                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7035                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7036                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7037                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7038                 }
7039               }
7040             }
7041           }
7042           // Merge in delay slot (wont dirty)
7043           for(r=0;r<HOST_REGS;r++) {
7044             if(r!=EXCLUDE_REG) {
7045               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7046               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7047               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7048               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7049               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7050               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7051               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7052               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7053               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7054               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7055             }
7056           }
7057           // Deal with changed mappings
7058           if(i<iend) {
7059             for(r=0;r<HOST_REGS;r++) {
7060               if(r!=EXCLUDE_REG) {
7061                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7062                   temp_will_dirty&=~(1<<r);
7063                   temp_wont_dirty&=~(1<<r);
7064                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7065                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7066                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7067                   } else {
7068                     temp_will_dirty|=1<<r;
7069                     temp_wont_dirty|=1<<r;
7070                   }
7071                 }
7072               }
7073             }
7074           }
7075           if(wr) {
7076             will_dirty[i]=temp_will_dirty;
7077             wont_dirty[i]=temp_wont_dirty;
7078             clean_registers((ba[i]-start)>>2,i-1,0);
7079           }else{
7080             // Limit recursion.  It can take an excessive amount
7081             // of time if there are a lot of nested loops.
7082             will_dirty[(ba[i]-start)>>2]=0;
7083             wont_dirty[(ba[i]-start)>>2]=-1;
7084           }
7085         }
7086         /*else*/ if(1)
7087         {
7088           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7089           {
7090             // Unconditional branch
7091             will_dirty_i=0;
7092             wont_dirty_i=0;
7093           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7094             for(r=0;r<HOST_REGS;r++) {
7095               if(r!=EXCLUDE_REG) {
7096                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7097                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7098                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7099                 }
7100               }
7101             }
7102           //}
7103             // Merge in delay slot
7104             for(r=0;r<HOST_REGS;r++) {
7105               if(r!=EXCLUDE_REG) {
7106                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7107                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7108                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7109                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7110                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7111                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7112                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7113                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7114                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7115                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7116                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7117                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7118                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7119                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7120               }
7121             }
7122           } else {
7123             // Conditional branch
7124             will_dirty_i=will_dirty_next;
7125             wont_dirty_i=wont_dirty_next;
7126           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7127             for(r=0;r<HOST_REGS;r++) {
7128               if(r!=EXCLUDE_REG) {
7129                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7130                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7131                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7132                 }
7133                 else
7134                 {
7135                   will_dirty_i&=~(1<<r);
7136                 }
7137                 // Treat delay slot as part of branch too
7138                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7139                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7140                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7141                 }
7142                 else
7143                 {
7144                   will_dirty[i+1]&=~(1<<r);
7145                 }*/
7146               }
7147             }
7148           //}
7149             // Merge in delay slot
7150             for(r=0;r<HOST_REGS;r++) {
7151               if(r!=EXCLUDE_REG) {
7152                 if(!likely[i]) {
7153                   // Might not dirty if likely branch is not taken
7154                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7155                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7156                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7157                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7158                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7159                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7160                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7161                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7162                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7163                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7164                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7165                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7166                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7167                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7168                 }
7169               }
7170             }
7171           }
7172           // Merge in delay slot
7173           for(r=0;r<HOST_REGS;r++) {
7174             if(r!=EXCLUDE_REG) {
7175               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7176               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7177               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7178               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7179               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7180               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7181               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7182               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7183               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7184               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7185             }
7186           }
7187           if(wr) {
7188             #ifndef DESTRUCTIVE_WRITEBACK
7189             branch_regs[i].dirty&=wont_dirty_i;
7190             #endif
7191             branch_regs[i].dirty|=will_dirty_i;
7192           }
7193         }
7194       }
7195     }
7196     else if(itype[i]==SYSCALL)
7197     {
7198       // SYSCALL instruction (software interrupt)
7199       will_dirty_i=0;
7200       wont_dirty_i=0;
7201     }
7202     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7203     {
7204       // ERET instruction (return from interrupt)
7205       will_dirty_i=0;
7206       wont_dirty_i=0;
7207     }
7208     will_dirty_next=will_dirty_i;
7209     wont_dirty_next=wont_dirty_i;
7210     for(r=0;r<HOST_REGS;r++) {
7211       if(r!=EXCLUDE_REG) {
7212         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7213         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7214         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7215         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7216         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7217         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7218         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7219         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7220         if(i>istart) {
7221           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7222           {
7223             // Don't store a register immediately after writing it,
7224             // as doing so may prevent dual-issue.
7225             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7226             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7227           }
7228         }
7229       }
7230     }
7231     // Save it
7232     will_dirty[i]=will_dirty_i;
7233     wont_dirty[i]=wont_dirty_i;
7234     // Mark registers that won't be dirtied as not dirty
7235     if(wr) {
7236       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7237       for(r=0;r<HOST_REGS;r++) {
7238         if((will_dirty_i>>r)&1) {
7239           printf(" r%d",r);
7240         }
7241       }
7242       printf("\n");*/
7243
7244       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7245         regs[i].dirty|=will_dirty_i;
7246         #ifndef DESTRUCTIVE_WRITEBACK
7247         regs[i].dirty&=wont_dirty_i;
7248         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7249         {
7250           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7251             for(r=0;r<HOST_REGS;r++) {
7252               if(r!=EXCLUDE_REG) {
7253                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7254                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7255                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7256               }
7257             }
7258           }
7259         }
7260         else
7261         {
7262           if(i<iend) {
7263             for(r=0;r<HOST_REGS;r++) {
7264               if(r!=EXCLUDE_REG) {
7265                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7266                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7267                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7268               }
7269             }
7270           }
7271         }
7272         #endif
7273       //}
7274     }
7275     // Deal with changed mappings
7276     temp_will_dirty=will_dirty_i;
7277     temp_wont_dirty=wont_dirty_i;
7278     for(r=0;r<HOST_REGS;r++) {
7279       if(r!=EXCLUDE_REG) {
7280         int nr;
7281         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7282           if(wr) {
7283             #ifndef DESTRUCTIVE_WRITEBACK
7284             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7285             #endif
7286             regs[i].wasdirty|=will_dirty_i&(1<<r);
7287           }
7288         }
7289         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7290           // Register moved to a different register
7291           will_dirty_i&=~(1<<r);
7292           wont_dirty_i&=~(1<<r);
7293           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7294           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7295           if(wr) {
7296             #ifndef DESTRUCTIVE_WRITEBACK
7297             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7298             #endif
7299             regs[i].wasdirty|=will_dirty_i&(1<<r);
7300           }
7301         }
7302         else {
7303           will_dirty_i&=~(1<<r);
7304           wont_dirty_i&=~(1<<r);
7305           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7306             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7307             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7308           } else {
7309             wont_dirty_i|=1<<r;
7310             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7311           }
7312         }
7313       }
7314     }
7315   }
7316 }
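
/* Illustrative sketch (not compiled in): the per-host-register update at the
   heart of the scan above, for one host register 'r'.  A host register
   mapped to a guest register written by this instruction is marked as
   dirtied; an unmapped register cannot be dirty; the cycle count (CCREG) is
   always kept dirty.  'map' stands for regs[i].regmap[r]; the helper name is
   hypothetical and branches/delay slots are ignored here. */
#if 0
static void example_dirty_step(signed char map, int rt1, int rt2, int r,
                               u_int *will_dirty, u_int *wont_dirty)
{
  if((map & 63) == rt1) *will_dirty |= 1 << r;
  if((map & 63) == rt2) *will_dirty |= 1 << r;
  if((map & 63) > 33)   *will_dirty &= ~(1 << r);  /* not a register tracked for writeback */
  if(map <= 0)          *will_dirty &= ~(1 << r);  /* nothing mapped here */
  if(map == CCREG)      *will_dirty |= 1 << r;
  if((map & 63) == rt1) *wont_dirty |= 1 << r;
  if((map & 63) == rt2) *wont_dirty |= 1 << r;
  if(map == CCREG)      *wont_dirty |= 1 << r;
}
#endif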
7317
7318   /* disassembly */
7319 void disassemble_inst(int i)
7320 {
7321     if (bt[i]) printf("*"); else printf(" ");
7322     switch(itype[i]) {
7323       case UJUMP:
7324         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7325       case CJUMP:
7326         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7327       case SJUMP:
7328         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7329       case FJUMP:
7330         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7331       case RJUMP:
7332         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7333       case SPAN:
7334         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7335       case IMM16:
7336         if(opcode[i]==0xf) //LUI
7337           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7338         else
7339           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7340         break;
7341       case LOAD:
7342       case LOADLR:
7343         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7344         break;
7345       case STORE:
7346       case STORELR:
7347         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7348         break;
7349       case ALU:
7350       case SHIFT:
7351         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7352         break;
7353       case MULTDIV:
7354         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7355         break;
7356       case SHIFTIMM:
7357         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7358         break;
7359       case MOV:
7360         if((opcode2[i]&0x1d)==0x10)
7361           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7362         else if((opcode2[i]&0x1d)==0x11)
7363           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7364         else
7365           printf (" %x: %s\n",start+i*4,insn[i]);
7366         break;
7367       case COP0:
7368         if(opcode2[i]==0)
7369           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7370         else if(opcode2[i]==4)
7371           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7372         else printf (" %x: %s\n",start+i*4,insn[i]);
7373         break;
7374       case COP1:
7375         if(opcode2[i]<3)
7376           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7377         else if(opcode2[i]>3)
7378           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7379         else printf (" %x: %s\n",start+i*4,insn[i]);
7380         break;
7381       case C1LS:
7382         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7383         break;
7384       default:
7385         //printf (" %s %8x\n",insn[i],source[i]);
7386         printf (" %x: %s\n",start+i*4,insn[i]);
7387     }
7388 }
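
/* Usage sketch (not compiled in): once pass 1 below has filled in itype[],
   insn[] and friends, a whole decoded block can be dumped like this for
   debugging.  'slen' is the block length in instructions, as used throughout
   this file; the wrapper name is hypothetical. */
#if 0
static void example_dump_block(void)
{
  int i;
  for(i=0;i<slen;i++)
    disassemble_inst(i);
}
#endif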
7389
7390 void new_dynarec_init()
7391 {
7392   printf("Init new dynarec\n");
7393   out=(u_char *)BASE_ADDR;
7394   if (mmap (out, 1<<TARGET_SIZE_2,
7395             PROT_READ | PROT_WRITE | PROT_EXEC,
7396             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7397             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7398 #ifdef MUPEN64
7399   rdword=&readmem_dword;
7400   fake_pc.f.r.rs=&readmem_dword;
7401   fake_pc.f.r.rt=&readmem_dword;
7402   fake_pc.f.r.rd=&readmem_dword;
7403 #endif
7404   int n;
7405   for(n=0x80000;n<0x80800;n++)
7406     invalid_code[n]=1;
7407   for(n=0;n<65536;n++)
7408     hash_table[n][0]=hash_table[n][2]=-1;
7409   memset(mini_ht,-1,sizeof(mini_ht));
7410   memset(restore_candidate,0,sizeof(restore_candidate));
7411   copy=shadow;
7412   expirep=16384; // Expiry pointer, +2 blocks
7413   pending_exception=0;
7414   literalcount=0;
7415 #ifdef HOST_IMM8
7416   // Copy this into local area so we don't have to put it in every literal pool
7417   invc_ptr=invalid_code;
7418 #endif
7419   stop_after_jal=0;
7420   // TLB
7421   using_tlb=0;
7422   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7423     memory_map[n]=-1;
7424   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7425     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7426   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7427     memory_map[n]=-1;
7428 #ifdef MUPEN64
7429   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7430     writemem[n] = write_nomem_new;
7431     writememb[n] = write_nomemb_new;
7432     writememh[n] = write_nomemh_new;
7433 #ifndef FORCE32
7434     writememd[n] = write_nomemd_new;
7435 #endif
7436     readmem[n] = read_nomem_new;
7437     readmemb[n] = read_nomemb_new;
7438     readmemh[n] = read_nomemh_new;
7439 #ifndef FORCE32
7440     readmemd[n] = read_nomemd_new;
7441 #endif
7442   }
7443   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7444     writemem[n] = write_rdram_new;
7445     writememb[n] = write_rdramb_new;
7446     writememh[n] = write_rdramh_new;
7447 #ifndef FORCE32
7448     writememd[n] = write_rdramd_new;
7449 #endif
7450   }
7451   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7452     writemem[n] = write_nomem_new;
7453     writememb[n] = write_nomemb_new;
7454     writememh[n] = write_nomemh_new;
7455 #ifndef FORCE32
7456     writememd[n] = write_nomemd_new;
7457 #endif
7458     readmem[n] = read_nomem_new;
7459     readmemb[n] = read_nomemb_new;
7460     readmemh[n] = read_nomemh_new;
7461 #ifndef FORCE32
7462     readmemd[n] = read_nomemd_new;
7463 #endif
7464   }
7465 #endif
7466   tlb_hacks();
7467   arch_init();
7468 }
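
/* Illustrative sketch (not compiled in): how the memory_map table filled in
   above is meant to be used.  There is one entry per 4KB guest page; -1
   marks an unmapped page, otherwise (entry<<2) added to the guest address
   yields the host address (this mirrors the lookup done in
   new_recompile_block below).  The helper name is hypothetical. */
#if 0
static void *example_guest_to_host(u_int vaddr)
{
  u_int entry=memory_map[vaddr>>12];
  if((signed int)entry<0) return NULL;   /* unmapped page */
  return (void *)(vaddr+(entry<<2));     /* e.g. rdram-backed RAM */
}
#endif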
7469
7470 void new_dynarec_cleanup()
7471 {
7472   int n;
7473   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7474   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7475   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7476   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7477   #ifdef ROM_COPY
7478   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7479   #endif
7480 }
7481
7482 int new_recompile_block(int addr)
7483 {
7484 /*
7485   if(addr==0x800cd050) {
7486     int block;
7487     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7488     int n;
7489     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7490   }
7491 */
7492   //if(Count==365117028) tracedebug=1;
7493   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7494   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7495   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7496   //if(debug) 
7497   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7498   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7499   /*if(Count>=312978186) {
7500     rlist();
7501   }*/
7502   //rlist();
7503   start = (u_int)addr&~3;
7504   //assert(((u_int)addr&1)==0);
7505 #ifdef MUPEN64
7506   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7507     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7508     pagelimit = 0xa4001000;
7509   }
7510   else
7511 #endif
7512   if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7513     source = (u_int *)((u_int)rdram+start-0x80000000);
7514     pagelimit = 0x80800000;
7515   }
7516 #ifndef DISABLE_TLB
7517   else if ((signed int)addr >= (signed int)0xC0000000) {
7518     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7519     //if(tlb_LUT_r[start>>12])
7520       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7521     if((signed int)memory_map[start>>12]>=0) {
7522       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7523       pagelimit=(start+4096)&0xFFFFF000;
7524       int map=memory_map[start>>12];
7525       int i;
7526       for(i=0;i<5;i++) {
7527         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7528         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7529       }
7530       assem_debug("pagelimit=%x\n",pagelimit);
7531       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7532     }
7533     else {
7534       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7535       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7536       return 1; // Caller will invoke exception handler
7537     }
7538     //printf("source= %x\n",(int)source);
7539   }
7540 #endif
7541   else {
7542     printf("Compile at bogus memory address: %x \n", (int)addr);
7543     exit(1);
7544   }
7545
7546   /* Pass 1: disassemble */
7547   /* Pass 2: register dependencies, branch targets */
7548   /* Pass 3: register allocation */
7549   /* Pass 4: branch dependencies */
7550   /* Pass 5: pre-alloc */
7551   /* Pass 6: optimize clean/dirty state */
7552   /* Pass 7: flag 32-bit registers */
7553   /* Pass 8: assembly */
7554   /* Pass 9: linker */
7555   /* Pass 10: garbage collection / free memory */
7556
7557   int i,j;
7558   int done=0;
7559   unsigned int type,op,op2;
7560
7561   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7562   
7563   /* Pass 1 disassembly */
7564
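  /* Reference sketch of the MIPS instruction fields decoded by the loop
     below; the macro names are hypothetical and nothing here is compiled. */
#if 0
#define EX_OP(x)    (((x)>>26)&0x3f)  /* primary opcode                  */
#define EX_RS(x)    (((x)>>21)&0x1f)  /* source register                 */
#define EX_RT(x)    (((x)>>16)&0x1f)  /* target / second source register */
#define EX_RD(x)    (((x)>>11)&0x1f)  /* destination (R-type)            */
#define EX_SA(x)    (((x)>>6)&0x1f)   /* shift amount                    */
#define EX_FUNCT(x) ((x)&0x3f)        /* secondary opcode (special)      */
#define EX_IMM(x)   ((short)(x))      /* sign-extended 16-bit immediate  */
#endif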
7565   for(i=0;!done;i++) {
7566     bt[i]=0;likely[i]=0;op2=0;
7567     opcode[i]=op=source[i]>>26;
7568     switch(op)
7569     {
7570       case 0x00: strcpy(insn[i],"special"); type=NI;
7571         op2=source[i]&0x3f;
7572         switch(op2)
7573         {
7574           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7575           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7576           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7577           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7578           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7579           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7580           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7581           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7582           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7583           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7584           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7585           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7586           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7587           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7588           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7589           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7590           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7591           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7592           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7593           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7594           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7595           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7596           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7597           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7598           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7599           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7600           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7601           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7602           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7603           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7604           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7605           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7606           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7607           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7608           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7609           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7610           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7611           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7612           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7613           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7614           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7615           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7616           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7617           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7618           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7619           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7620           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7621           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7622           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7623           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7624           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7625           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7626         }
7627         break;
7628       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7629         op2=(source[i]>>16)&0x1f;
7630         switch(op2)
7631         {
7632           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7633           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7634           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7635           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7636           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7637           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7638           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7639           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7640           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7641           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7642           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7643           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7644           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7645           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7646         }
7647         break;
7648       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7649       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7650       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7651       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7652       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7653       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7654       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7655       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7656       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7657       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7658       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7659       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7660       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7661       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7662       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7663         op2=(source[i]>>21)&0x1f;
7664         switch(op2)
7665         {
7666           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7667           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7668           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7669           switch(source[i]&0x3f)
7670           {
7671             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7672             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7673             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7674             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7675             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7676           }
7677         }
7678         break;
7679       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7680         op2=(source[i]>>21)&0x1f;
7681         switch(op2)
7682         {
7683           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7684           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7685           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7686           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7687           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7688           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7689           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7690           switch((source[i]>>16)&0x3)
7691           {
7692             case 0x00: strcpy(insn[i],"BC1F"); break;
7693             case 0x01: strcpy(insn[i],"BC1T"); break;
7694             case 0x02: strcpy(insn[i],"BC1FL"); break;
7695             case 0x03: strcpy(insn[i],"BC1TL"); break;
7696           }
7697           break;
7698           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7699           switch(source[i]&0x3f)
7700           {
7701             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7702             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7703             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7704             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7705             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7706             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7707             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7708             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7709             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7710             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7711             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7712             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7713             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7714             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7715             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7716             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7717             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7718             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7719             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7720             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7721             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7722             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7723             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7724             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7725             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7726             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7727             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7728             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7729             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7730             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7731             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7732             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7733             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7734             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7735             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7736           }
7737           break;
7738           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7739           switch(source[i]&0x3f)
7740           {
7741             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7742             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7743             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7744             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7745             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7746             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7747             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7748             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7749             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7750             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7751             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7752             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7753             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7754             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7755             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7756             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7757             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7758             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7759             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7760             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7761             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7762             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7763             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7764             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7765             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7766             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7767             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7768             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7769             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7770             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7771             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7772             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7773             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7774             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7775             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7776           }
7777           break;
7778           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7779           switch(source[i]&0x3f)
7780           {
7781             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7782             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7783           }
7784           break;
7785           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7786           switch(source[i]&0x3f)
7787           {
7788             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7789             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7790           }
7791           break;
7792         }
7793         break;
7794       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7795       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7796       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7797       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7798       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7799       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7800       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7801       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7802       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7803       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7804       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7805       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7806       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7807       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7808       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7809       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7810       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7811       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7812       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7813       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7814       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7815       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7816       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7817       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7818       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7819       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7820       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7821       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7822       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7823       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7824       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7825       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7826       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7827       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7828       default: strcpy(insn[i],"???"); type=NI;
7829         assem_debug("NI %08x @%08x\n", source[i], addr + i*4);
7830         break;
7831     }
7832     itype[i]=type;
7833     opcode2[i]=op2;
7834     /* Get registers/immediates */
7835     lt1[i]=0;
7836     us1[i]=0;
7837     us2[i]=0;
7838     dep1[i]=0;
7839     dep2[i]=0;
7840     switch(type) {
7841       case LOAD:
7842         rs1[i]=(source[i]>>21)&0x1f;
7843         rs2[i]=0;
7844         rt1[i]=(source[i]>>16)&0x1f;
7845         rt2[i]=0;
7846         imm[i]=(short)source[i];
7847         break;
7848       case STORE:
7849       case STORELR:
7850         rs1[i]=(source[i]>>21)&0x1f;
7851         rs2[i]=(source[i]>>16)&0x1f;
7852         rt1[i]=0;
7853         rt2[i]=0;
7854         imm[i]=(short)source[i];
7855         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7856         break;
7857       case LOADLR:
7858         // LWL/LWR only load part of the register,
7859         // therefore the target register must be treated as a source too
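             // For example, on a big-endian bus "LWL t0,1(a0)" replaces only
             // the upper three bytes of t0 and keeps its low byte, so the old
             // contents of t0 flow into the result (illustrative example).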
7860         rs1[i]=(source[i]>>21)&0x1f;
7861         rs2[i]=(source[i]>>16)&0x1f;
7862         rt1[i]=(source[i]>>16)&0x1f;
7863         rt2[i]=0;
7864         imm[i]=(short)source[i];
7865         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7866         if(op==0x26) dep1[i]=rt1[i]; // LWR
7867         break;
7868       case IMM16:
7869         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7870         else rs1[i]=(source[i]>>21)&0x1f;
7871         rs2[i]=0;
7872         rt1[i]=(source[i]>>16)&0x1f;
7873         rt2[i]=0;
7874         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7875           imm[i]=(unsigned short)source[i];
7876         }else{
7877           imm[i]=(short)source[i];
7878         }
7879         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7880         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7881         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7882         break;
7883       case UJUMP:
7884         rs1[i]=0;
7885         rs2[i]=0;
7886         rt1[i]=0;
7887         rt2[i]=0;
7888         // The JAL instruction writes to r31.
7889         if (op&1) {
7890           rt1[i]=31;
7891         }
7892         rs2[i]=CCREG;
7893         break;
7894       case RJUMP:
7895         rs1[i]=(source[i]>>21)&0x1f;
7896         rs2[i]=0;
7897         rt1[i]=0;
7898         rt2[i]=0;
7899         // The JALR instruction writes the return address (rd, assumed here to be r31).
7900         if (op2&1) {
7901           rt1[i]=31;   
7902         }
7903         rs2[i]=CCREG;
7904         break;
7905       case CJUMP:
7906         rs1[i]=(source[i]>>21)&0x1f;
7907         rs2[i]=(source[i]>>16)&0x1f;
7908         rt1[i]=0;
7909         rt2[i]=0;
7910         if(op&2) { // BGTZ/BLEZ
7911           rs2[i]=0;
7912         }
7913         us1[i]=rs1[i];
7914         us2[i]=rs2[i];
7915         likely[i]=op>>4;
7916         break;
7917       case SJUMP:
7918         rs1[i]=(source[i]>>21)&0x1f;
7919         rs2[i]=CCREG;
7920         rt1[i]=0;
7921         rt2[i]=0;
7922         us1[i]=rs1[i];
7923         if(op2&0x10) { // BxxAL
7924           rt1[i]=31;
7925           // NOTE: If the branch is not taken, r31 is still overwritten
7926         }
7927         likely[i]=(op2&2)>>1;
7928         break;
7929       case FJUMP:
7930         rs1[i]=FSREG;
7931         rs2[i]=CSREG;
7932         rt1[i]=0;
7933         rt2[i]=0;
7934         likely[i]=((source[i])>>17)&1;
7935         break;
7936       case ALU:
7937         rs1[i]=(source[i]>>21)&0x1f; // source
7938         rs2[i]=(source[i]>>16)&0x1f; // second source operand (rt)
7939         rt1[i]=(source[i]>>11)&0x1f; // destination
7940         rt2[i]=0;
7941         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7942           us1[i]=rs1[i];us2[i]=rs2[i];
7943         }
7944         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7945           dep1[i]=rs1[i];dep2[i]=rs2[i];
7946         }
7947         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7948           dep1[i]=rs1[i];dep2[i]=rs2[i];
7949         }
7950         break;
7951       case MULTDIV:
7952         rs1[i]=(source[i]>>21)&0x1f; // source
7953         rs2[i]=(source[i]>>16)&0x1f; // divisor
7954         rt1[i]=HIREG;
7955         rt2[i]=LOREG;
7956         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7957           us1[i]=rs1[i];us2[i]=rs2[i];
7958         }
7959         break;
7960       case MOV:
7961         rs1[i]=0;
7962         rs2[i]=0;
7963         rt1[i]=0;
7964         rt2[i]=0;
7965         if(op2==0x10) rs1[i]=HIREG; // MFHI
7966         if(op2==0x11) rt1[i]=HIREG; // MTHI
7967         if(op2==0x12) rs1[i]=LOREG; // MFLO
7968         if(op2==0x13) rt1[i]=LOREG; // MTLO
7969         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7970         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7971         dep1[i]=rs1[i];
7972         break;
7973       case SHIFT:
7974         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted (rt)
7975         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7976         rt1[i]=(source[i]>>11)&0x1f; // destination
7977         rt2[i]=0;
7978         // DSLLV/DSRLV/DSRAV are 64-bit
7979         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7980         break;
7981       case SHIFTIMM:
7982         rs1[i]=(source[i]>>16)&0x1f;
7983         rs2[i]=0;
7984         rt1[i]=(source[i]>>11)&0x1f;
7985         rt2[i]=0;
7986         imm[i]=(source[i]>>6)&0x1f;
7987         // DSxx32 instructions
7988         if(op2>=0x3c) imm[i]|=0x20;
7989         // DSLL/DSRL/DSRA/DSRA32/DSRL32 (but not DSLL32) require a 64-bit source
7990         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7991         break;
7992       case COP0:
7993         rs1[i]=0;
7994         rs2[i]=0;
7995         rt1[i]=0;
7996         rt2[i]=0;
7997         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7998         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7999         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8000         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8001         break;
8002       case COP1:
8003         rs1[i]=0;
8004         rs2[i]=0;
8005         rt1[i]=0;
8006         rt2[i]=0;
8007         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8008         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8009         if(op2==5) us1[i]=rs1[i]; // DMTC1
8010         rs2[i]=CSREG;
8011         break;
8012       case C1LS:
8013         rs1[i]=(source[i]>>21)&0x1F;
8014         rs2[i]=CSREG;
8015         rt1[i]=0;
8016         rt2[i]=0;
8017         imm[i]=(short)source[i];
8018         break;
8019       case FLOAT:
8020       case FCONV:
8021         rs1[i]=0;
8022         rs2[i]=CSREG;
8023         rt1[i]=0;
8024         rt2[i]=0;
8025         break;
8026       case FCOMP:
8027         rs1[i]=FSREG;
8028         rs2[i]=CSREG;
8029         rt1[i]=FSREG;
8030         rt2[i]=0;
8031         break;
8032       case SYSCALL:
8033         rs1[i]=CCREG;
8034         rs2[i]=0;
8035         rt1[i]=0;
8036         rt2[i]=0;
8037         break;
8038       default:
8039         rs1[i]=0;
8040         rs2[i]=0;
8041         rt1[i]=0;
8042         rt2[i]=0;
8043     }
8044     /* Calculate branch target addresses */
8045     if(type==UJUMP)
8046       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8047     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8048       ba[i]=start+i*4+8; // Never-taken branch (e.g. BNE with rs==rt); treat as fall-through
8049     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8050       ba[i]=start+i*4+8; // Never-taken branch (BLTZ-type compare of $zero); treat as fall-through
8051     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8052       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8053     else ba[i]=-1;
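         // Target encoding recap (MIPS): J/JAL combine the upper 4 bits of
         // the delay-slot PC with the 26-bit index shifted left by 2;
         // conditional branches add the sign-extended 16-bit offset, shifted
         // left by 2, to the address of the delay slot.  E.g. a BEQ at
         // 0x80001000 with offset 3 targets 0x80001004 + 0xC = 0x80001010
         // (illustrative addresses).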
8054     /* Is this the end of the block? */
8055     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8056       if(rt1[i-1]!=31) { // Not a subroutine call (JAL/JALR), so the block normally ends here
8057         done=1;
8058         // Does the block continue due to a branch?
8059         for(j=i-1;j>=0;j--)
8060         {
8061           if(ba[j]==start+i*4+4) done=j=0;
8062           if(ba[j]==start+i*4+8) done=j=0;
8063         }
8064       }
8065       else {
8066         if(stop_after_jal) done=1;
8067         // Stop on BREAK
8068         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8069       }
8070       // Don't recompile stuff that's already compiled
8071       if(check_addr(start+i*4+4)) done=1;
8072       // Don't get too close to the limit
8073       if(i>MAXBLOCK/2) done=1;
8074     }
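         // In short: an unconditional J/JR, or a bare "b" (BEQ $0,$0, upper
         // halfword 0x1000), ends the block unless an earlier branch targets
         // the code just past its delay slot; JAL/JALR continue because the
         // subroutine returns here.  Blocks are also cut at already-compiled
         // code and once half of MAXBLOCK has been used.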
8075     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8076     assert(i<MAXBLOCK-1);
8077     if(start+i*4==pagelimit-4) done=1;
8078     assert(start+i*4<pagelimit);
8079     if (i==MAXBLOCK-1) done=1;
8080     // Stop if we're compiling junk
8081     if(itype[i]==NI&&opcode[i]==0x11) {
8082       done=stop_after_jal=1;
8083       printf("Disabled speculative precompilation\n");
8084     }
8085   }
8086   slen=i;
8087   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8088     if(start+i*4==pagelimit) {
8089       itype[i-1]=SPAN;
8090     }
8091   }
8092   assert(slen>0);
8093
8094   /* Pass 2 - Register dependencies and branch targets */
8095
8096   unneeded_registers(0,slen-1,0);
8097   
8098   /* Pass 3 - Register allocation */
8099
8100   struct regstat current; // Current register allocations/status
8101   current.is32=1;
8102   current.dirty=0;
8103   current.u=unneeded_reg[0];
8104   current.uu=unneeded_reg_upper[0];
8105   clear_all_regs(current.regmap);
8106   alloc_reg(&current,0,CCREG);
8107   dirty_reg(&current,CCREG);
8108   current.isconst=0;
8109   current.wasconst=0;
8110   int ds=0;
8111   int cc=0;
8112   int hr;
8113   
8114   provisional_32bit();
8115   
8116   if((u_int)addr&1) {
8117     // First instruction is delay slot
8118     cc=-1;
8119     bt[1]=1;
8120     ds=1;
8121     unneeded_reg[0]=1;
8122     unneeded_reg_upper[0]=1;
8123     current.regmap[HOST_BTREG]=BTREG;
8124   }
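       // The low bit of addr is used as a flag meaning "compile a lone delay
       // slot"; in that case the branch target is expected to arrive in
       // BTREG, which is why HOST_BTREG is pre-mapped above (as far as can be
       // inferred here; the flag itself is set by the caller).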
8125   
8126   for(i=0;i<slen;i++)
8127   {
8128     if(bt[i])
8129     {
8130       int hr;
8131       for(hr=0;hr<HOST_REGS;hr++)
8132       {
8133         // Is this really necessary?
8134         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8135       }
8136       current.isconst=0;
8137     }
8138     if(i>1)
8139     {
8140       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8141       {
8142         if(rs1[i-2]==0||rs2[i-2]==0)
8143         {
8144           if(rs1[i-2]) {
8145             current.is32|=1LL<<rs1[i-2];
8146             int hr=get_reg(current.regmap,rs1[i-2]|64);
8147             if(hr>=0) current.regmap[hr]=-1;
8148           }
8149           if(rs2[i-2]) {
8150             current.is32|=1LL<<rs2[i-2];
8151             int hr=get_reg(current.regmap,rs2[i-2]|64);
8152             if(hr>=0) current.regmap[hr]=-1;
8153           }
8154         }
8155       }
8156     }
8157     // If something jumps here with 64-bit values
8158     // then promote those registers to 64 bits
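         // is32 is a bitmask where bit r set means GPR r is known to hold a
         // sign-extended 32-bit value; at a branch target it must be the
         // intersection of every incoming path, which is what the two loops
         // below compute (branch_regs for branches already allocated, p32 as
         // a provisional estimate for branches later in the block).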
8159     if(bt[i])
8160     {
8161       uint64_t temp_is32=current.is32;
8162       for(j=i-1;j>=0;j--)
8163       {
8164         if(ba[j]==start+i*4) 
8165           temp_is32&=branch_regs[j].is32;
8166       }
8167       for(j=i;j<slen;j++)
8168       {
8169         if(ba[j]==start+i*4) 
8170           //temp_is32=1;
8171           temp_is32&=p32[j];
8172       }
8173       if(temp_is32!=current.is32) {
8174         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8175         #ifdef DESTRUCTIVE_WRITEBACK
8176         for(hr=0;hr<HOST_REGS;hr++)
8177         {
8178           int r=current.regmap[hr];
8179           if(r>0&&r<64)
8180           {
8181             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8182               temp_is32|=1LL<<r;
8183               //printf("restore %d\n",r);
8184             }
8185           }
8186         }
8187         #endif
8188         current.is32=temp_is32;
8189       }
8190     }
8191 #ifdef FORCE32
8192     memset(p32, 0xff, sizeof(p32));
8193     current.is32=-1LL;
8194 #endif
8195
8196     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8197     regs[i].wasconst=current.isconst;
8198     regs[i].was32=current.is32;
8199     regs[i].wasdirty=current.dirty;
8200     #ifdef DESTRUCTIVE_WRITEBACK
8201     // To change a dirty register from 32 to 64 bits, we must write
8202     // it out during the previous cycle (for branches, 2 cycles)
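         // Concretely: if an upcoming branch target expects register r to be
         // 64-bit while r is currently dirty and tracked as 32-bit, the host
         // register holding it is unmapped below so the value gets written
         // back in time, provided this instruction does not read r.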
8203     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8204     {
8205       uint64_t temp_is32=current.is32;
8206       for(j=i-1;j>=0;j--)
8207       {
8208         if(ba[j]==start+i*4+4) 
8209           temp_is32&=branch_regs[j].is32;
8210       }
8211       for(j=i;j<slen;j++)
8212       {
8213         if(ba[j]==start+i*4+4) 
8214           //temp_is32=1;
8215           temp_is32&=p32[j];
8216       }
8217       if(temp_is32!=current.is32) {
8218         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8219         for(hr=0;hr<HOST_REGS;hr++)
8220         {
8221           int r=current.regmap[hr];
8222           if(r>0)
8223           {
8224             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8225               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8226               {
8227                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8228                 {
8229                   //printf("dump %d/r%d\n",hr,r);
8230                   current.regmap[hr]=-1;
8231                   if(get_reg(current.regmap,r|64)>=0) 
8232                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8233                 }
8234               }
8235             }
8236           }
8237         }
8238       }
8239     }
8240     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8241     {
8242       uint64_t temp_is32=current.is32;
8243       for(j=i-1;j>=0;j--)
8244       {
8245         if(ba[j]==start+i*4+8) 
8246           temp_is32&=branch_regs[j].is32;
8247       }
8248       for(j=i;j<slen;j++)
8249       {
8250         if(ba[j]==start+i*4+8) 
8251           //temp_is32=1;
8252           temp_is32&=p32[j];
8253       }
8254       if(temp_is32!=current.is32) {
8255         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8256         for(hr=0;hr<HOST_REGS;hr++)
8257         {
8258           int r=current.regmap[hr];
8259           if(r>0)
8260           {
8261             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8262               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8263               {
8264                 //printf("dump %d/r%d\n",hr,r);
8265                 current.regmap[hr]=-1;
8266                 if(get_reg(current.regmap,r|64)>=0) 
8267                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8268               }
8269             }
8270           }
8271         }
8272       }
8273     }
8274     #endif
8275     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8276       if(i+1<slen) {
8277         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8278         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8279         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8280         current.u|=1;
8281         current.uu|=1;
8282       } else {
8283         current.u=1;
8284         current.uu=1;
8285       }
8286     } else {
8287       if(i+1<slen) {
8288         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8289         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8290         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8291         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8292         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8293         current.u|=1;
8294         current.uu|=1;
8295       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8296     }
8297     is_ds[i]=ds;
8298     if(ds) {
8299       ds=0; // Skip delay slot, already allocated as part of branch
8300       // ...but we need to alloc it in case something jumps here
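           // The actual registers were chosen when the branch itself was
           // allocated; the code below only recomputes a snapshot for the
           // delay slot so that it can also be entered directly as a branch
           // target (filling in regs[i].regmap_entry).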
8301       if(i+1<slen) {
8302         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8303         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8304       }else{
8305         current.u=branch_unneeded_reg[i-1];
8306         current.uu=branch_unneeded_reg_upper[i-1];
8307       }
8308       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8309       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8310       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8311       current.u|=1;
8312       current.uu|=1;
8313       struct regstat temp;
8314       memcpy(&temp,&current,sizeof(current));
8315       temp.wasdirty=temp.dirty;
8316       temp.was32=temp.is32;
8317       // TODO: Take into account unconditional branches, as below
8318       delayslot_alloc(&temp,i);
8319       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8320       regs[i].wasdirty=temp.wasdirty;
8321       regs[i].was32=temp.was32;
8322       regs[i].dirty=temp.dirty;
8323       regs[i].is32=temp.is32;
8324       regs[i].isconst=0;
8325       regs[i].wasconst=0;
8326       current.isconst=0;
8327       // Create entry (branch target) regmap
8328       for(hr=0;hr<HOST_REGS;hr++)
8329       {
8330         int r=temp.regmap[hr];
8331         if(r>=0) {
8332           if(r!=regmap_pre[i][hr]) {
8333             regs[i].regmap_entry[hr]=-1;
8334           }
8335           else
8336           {
8337             if(r<64){
8338               if((current.u>>r)&1) {
8339                 regs[i].regmap_entry[hr]=-1;
8340                 regs[i].regmap[hr]=-1;
8341                 //Don't clear regs in the delay slot as the branch might need them
8342                 //current.regmap[hr]=-1;
8343               }else
8344                 regs[i].regmap_entry[hr]=r;
8345             }
8346             else {
8347               if((current.uu>>(r&63))&1) {
8348                 regs[i].regmap_entry[hr]=-1;
8349                 regs[i].regmap[hr]=-1;
8350                 //Don't clear regs in the delay slot as the branch might need them
8351                 //current.regmap[hr]=-1;
8352               }else
8353                 regs[i].regmap_entry[hr]=r;
8354             }
8355           }
8356         } else {
8357           // First instruction expects CCREG to be allocated
8358           if(i==0&&hr==HOST_CCREG) 
8359             regs[i].regmap_entry[hr]=CCREG;
8360           else
8361             regs[i].regmap_entry[hr]=-1;
8362         }
8363       }
8364     }
8365     else { // Not delay slot
8366       switch(itype[i]) {
8367         case UJUMP:
8368           //current.isconst=0; // DEBUG
8369           //current.wasconst=0; // DEBUG
8370           //regs[i].wasconst=0; // DEBUG
8371           clear_const(&current,rt1[i]);
8372           alloc_cc(&current,i);
8373           dirty_reg(&current,CCREG);
8374           if (rt1[i]==31) {
8375             alloc_reg(&current,i,31);
8376             dirty_reg(&current,31);
8377             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8378             #ifdef REG_PREFETCH
8379             alloc_reg(&current,i,PTEMP);
8380             #endif
8381             //current.is32|=1LL<<rt1[i];
8382           }
8383           delayslot_alloc(&current,i+1);
8384           //current.isconst=0; // DEBUG
8385           ds=1;
8386           //printf("i=%d, isconst=%x\n",i,current.isconst);
8387           break;
8388         case RJUMP:
8389           //current.isconst=0;
8390           //current.wasconst=0;
8391           //regs[i].wasconst=0;
8392           clear_const(&current,rs1[i]);
8393           clear_const(&current,rt1[i]);
8394           alloc_cc(&current,i);
8395           dirty_reg(&current,CCREG);
8396           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8397             alloc_reg(&current,i,rs1[i]);
8398             if (rt1[i]==31) {
8399               alloc_reg(&current,i,31);
8400               dirty_reg(&current,31);
8401               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8402               #ifdef REG_PREFETCH
8403               alloc_reg(&current,i,PTEMP);
8404               #endif
8405             }
8406             #ifdef USE_MINI_HT
8407             if(rs1[i]==31) { // JALR
8408               alloc_reg(&current,i,RHASH);
8409               #ifndef HOST_IMM_ADDR32
8410               alloc_reg(&current,i,RHTBL);
8411               #endif
8412             }
8413             #endif
8414             delayslot_alloc(&current,i+1);
8415           } else {
8416             // The delay slot overwrites our source register,
8417             // allocate a temporary register to hold the old value.
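                 // Illustrative case: "jr t0" with "lw t0,0(sp)" in the delay
                 // slot; the jump must use t0's value from before the load,
                 // which is why RTEMP is allocated here.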
8418             current.isconst=0;
8419             current.wasconst=0;
8420             regs[i].wasconst=0;
8421             delayslot_alloc(&current,i+1);
8422             current.isconst=0;
8423             alloc_reg(&current,i,RTEMP);
8424           }
8425           //current.isconst=0; // DEBUG
8426           ds=1;
8427           break;
8428         case CJUMP:
8429           //current.isconst=0;
8430           //current.wasconst=0;
8431           //regs[i].wasconst=0;
8432           clear_const(&current,rs1[i]);
8433           clear_const(&current,rs2[i]);
8434           if((opcode[i]&0x3E)==4) // BEQ/BNE
8435           {
8436             alloc_cc(&current,i);
8437             dirty_reg(&current,CCREG);
8438             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8439             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8440             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8441             {
8442               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8443               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8444             }
8445             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8446                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8447               // The delay slot overwrites one of our conditions.
8448               // Allocate the branch condition registers instead.
8449               // Note that such a sequence of instructions could
8450             // be considered a bug since the branch cannot be
8451               // re-executed if an exception occurs.
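                   // Illustrative case: "beq t0,t1,L" with "addiu t0,t0,1" in
                   // the delay slot; the comparison must use the value t0 had
                   // before the delay slot, so the condition registers stay
                   // allocated instead of the delay slot's registers.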
8452               current.isconst=0;
8453               current.wasconst=0;
8454               regs[i].wasconst=0;
8455               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8456               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8457               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8458               {
8459                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8460                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8461               }
8462             }
8463             else delayslot_alloc(&current,i+1);
8464           }
8465           else
8466           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8467           {
8468             alloc_cc(&current,i);
8469             dirty_reg(&current,CCREG);
8470             alloc_reg(&current,i,rs1[i]);
8471             if(!(current.is32>>rs1[i]&1))
8472             {
8473               alloc_reg64(&current,i,rs1[i]);
8474             }
8475             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8476               // The delay slot overwrites one of our conditions.
8477               // Allocate the branch condition registers instead.
8478               // Note that such a sequence of instructions could
8479               // be considered a bug since the branch cannot be
8480               // re-executed if an exception occurs.
8481               current.isconst=0;
8482               current.wasconst=0;
8483               regs[i].wasconst=0;
8484               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8485               if(!((current.is32>>rs1[i])&1))
8486               {
8487                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8488               }
8489             }
8490             else delayslot_alloc(&current,i+1);
8491           }
8492           else
8493           // Don't alloc the delay slot yet because we might not execute it
8494           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8495           {
8496             current.isconst=0;
8497             current.wasconst=0;
8498             regs[i].wasconst=0;
8499             alloc_cc(&current,i);
8500             dirty_reg(&current,CCREG);
8501             alloc_reg(&current,i,rs1[i]);
8502             alloc_reg(&current,i,rs2[i]);
8503             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8504             {
8505               alloc_reg64(&current,i,rs1[i]);
8506               alloc_reg64(&current,i,rs2[i]);
8507             }
8508           }
8509           else
8510           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8511           {
8512             current.isconst=0;
8513             current.wasconst=0;
8514             regs[i].wasconst=0;
8515             alloc_cc(&current,i);
8516             dirty_reg(&current,CCREG);
8517             alloc_reg(&current,i,rs1[i]);
8518             if(!(current.is32>>rs1[i]&1))
8519             {
8520               alloc_reg64(&current,i,rs1[i]);
8521             }
8522           }
8523           ds=1;
8524           //current.isconst=0;
8525           break;
8526         case SJUMP:
8527           //current.isconst=0;
8528           //current.wasconst=0;
8529           //regs[i].wasconst=0;
8530           clear_const(&current,rs1[i]);
8531           clear_const(&current,rt1[i]);
8532           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8533           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8534           {
8535             alloc_cc(&current,i);
8536             dirty_reg(&current,CCREG);
8537             alloc_reg(&current,i,rs1[i]);
8538             if(!(current.is32>>rs1[i]&1))
8539             {
8540               alloc_reg64(&current,i,rs1[i]);
8541             }
8542             if (rt1[i]==31) { // BLTZAL/BGEZAL
8543               alloc_reg(&current,i,31);
8544               dirty_reg(&current,31);
8545               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8546               //#ifdef REG_PREFETCH
8547               //alloc_reg(&current,i,PTEMP);
8548               //#endif
8549               //current.is32|=1LL<<rt1[i];
8550             }
8551             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8552               // The delay slot overwrites the branch condition.
8553               // Allocate the branch condition registers instead.
8554               // Note that such a sequence of instructions could
8555               // be considered a bug since the branch cannot be
8556               // re-executed if an exception occurs.
8557               current.isconst=0;
8558               current.wasconst=0;
8559               regs[i].wasconst=0;
8560               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8561               if(!((current.is32>>rs1[i])&1))
8562               {
8563                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8564               }
8565             }
8566             else delayslot_alloc(&current,i+1);
8567           }
8568           else
8569           // Don't alloc the delay slot yet because we might not execute it
8570           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8571           {
8572             current.isconst=0;
8573             current.wasconst=0;
8574             regs[i].wasconst=0;
8575             alloc_cc(&current,i);
8576             dirty_reg(&current,CCREG);
8577             alloc_reg(&current,i,rs1[i]);
8578             if(!(current.is32>>rs1[i]&1))
8579             {
8580               alloc_reg64(&current,i,rs1[i]);
8581             }
8582           }
8583           ds=1;
8584           //current.isconst=0;
8585           break;
8586         case FJUMP:
8587           current.isconst=0;
8588           current.wasconst=0;
8589           regs[i].wasconst=0;
8590           if(likely[i]==0) // BC1F/BC1T
8591           {
8592             // TODO: Theoretically we can run out of registers here on x86.
8593             // The delay slot can allocate up to six, and we need to check
8594             // CSREG before executing the delay slot.  Possibly we can drop
8595             // the cycle count and then reload it after checking that the
8596             // FPU is in a usable state, or don't do out-of-order execution.
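                 // CSREG appears to mirror the COP0 Status register (it is
                 // written on MTC0 to Status above) and is checked so a
                 // coprocessor-unusable exception can be taken before the FP
                 // branch and its delay slot execute.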
8597             alloc_cc(&current,i);
8598             dirty_reg(&current,CCREG);
8599             alloc_reg(&current,i,FSREG);
8600             alloc_reg(&current,i,CSREG);
8601             if(itype[i+1]==FCOMP) {
8602               // The delay slot overwrites the branch condition.
8603               // Allocate the branch condition registers instead.
8604               // Note that such a sequence of instructions could
8605               // be considered a bug since the branch cannot be
8606               // re-executed if an exception occurs.
8607               alloc_cc(&current,i);
8608               dirty_reg(&current,CCREG);
8609               alloc_reg(&current,i,CSREG);
8610               alloc_reg(&current,i,FSREG);
8611             }
8612             else {
8613               delayslot_alloc(&current,i+1);
8614               alloc_reg(&current,i+1,CSREG);
8615             }
8616           }
8617           else
8618           // Don't alloc the delay slot yet because we might not execute it
8619           if(likely[i]) // BC1FL/BC1TL
8620           {
8621             alloc_cc(&current,i);
8622             dirty_reg(&current,CCREG);
8623             alloc_reg(&current,i,CSREG);
8624             alloc_reg(&current,i,FSREG);
8625           }
8626           ds=1;
8627           current.isconst=0;
8628           break;
8629         case IMM16:
8630           imm16_alloc(&current,i);
8631           break;
8632         case LOAD:
8633         case LOADLR:
8634           load_alloc(&current,i);
8635           break;
8636         case STORE:
8637         case STORELR:
8638           store_alloc(&current,i);
8639           break;
8640         case ALU:
8641           alu_alloc(&current,i);
8642           break;
8643         case SHIFT:
8644           shift_alloc(&current,i);
8645           break;
8646         case MULTDIV:
8647           multdiv_alloc(&current,i);
8648           break;
8649         case SHIFTIMM:
8650           shiftimm_alloc(&current,i);
8651           break;
8652         case MOV:
8653           mov_alloc(&current,i);
8654           break;
8655         case COP0:
8656           cop0_alloc(&current,i);
8657           break;
8658         case COP1:
8659           cop1_alloc(&current,i);
8660           break;
8661         case C1LS:
8662           c1ls_alloc(&current,i);
8663           break;
8664         case FCONV:
8665           fconv_alloc(&current,i);
8666           break;
8667         case FLOAT:
8668           float_alloc(&current,i);
8669           break;
8670         case FCOMP:
8671           fcomp_alloc(&current,i);
8672           break;
8673         case SYSCALL:
8674           syscall_alloc(&current,i);
8675           break;
8676         case SPAN:
8677           pagespan_alloc(&current,i);
8678           break;
8679       }
8680       
8681       // Drop the upper half of registers that have become 32-bit
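           // uu is the "upper half unneeded" mask: marking rt1/rt2 here when
           // they are known 32-bit means their 64-bit upper halves need not
           // be kept in, or written back from, host registers.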
8682       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8683       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8684         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8685         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8686         current.uu|=1;
8687       } else {
8688         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8689         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8690         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8691         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8692         current.uu|=1;
8693       }
8694
8695       // Create entry (branch target) regmap
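           // regmap_entry records what a jump into this instruction must find
           // in each host register (-1 means nothing is assumed to be there),
           // while regmap reflects the state after this instruction's own
           // allocation.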
8696       for(hr=0;hr<HOST_REGS;hr++)
8697       {
8698         int r,or,er;
8699         r=current.regmap[hr];
8700         if(r>=0) {
8701           if(r!=regmap_pre[i][hr]) {
8702             // TODO: delay slot (?)
8703             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8704             if(or<0||(r&63)>=TEMPREG){
8705               regs[i].regmap_entry[hr]=-1;
8706             }
8707             else
8708             {
8709               // Just move it to a different register
8710               regs[i].regmap_entry[hr]=r;
8711               // If it was dirty before, it's still dirty
8712               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8713             }
8714           }
8715           else
8716           {
8717             // Unneeded
8718             if(r==0){
8719               regs[i].regmap_entry[hr]=0;
8720             }
8721             else
8722             if(r<64){
8723               if((current.u>>r)&1) {
8724                 regs[i].regmap_entry[hr]=-1;
8725                 //regs[i].regmap[hr]=-1;
8726                 current.regmap[hr]=-1;
8727               }else
8728                 regs[i].regmap_entry[hr]=r;
8729             }
8730             else {
8731               if((current.uu>>(r&63))&1) {
8732                 regs[i].regmap_entry[hr]=-1;
8733                 //regs[i].regmap[hr]=-1;
8734                 current.regmap[hr]=-1;
8735               }else
8736                 regs[i].regmap_entry[hr]=r;
8737             }
8738           }
8739         } else {
8740           // Branches expect CCREG to be allocated at the target
8741           if(regmap_pre[i][hr]==CCREG) 
8742             regs[i].regmap_entry[hr]=CCREG;
8743           else
8744             regs[i].regmap_entry[hr]=-1;
8745         }
8746       }
8747       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8748     }
8749     /* Branch post-alloc */
8750     if(i>0)
8751     {
8752       current.was32=current.is32;
8753       current.wasdirty=current.dirty;
8754       switch(itype[i-1]) {
8755         case UJUMP:
8756           memcpy(&branch_regs[i-1],&current,sizeof(current));
8757           branch_regs[i-1].isconst=0;
8758           branch_regs[i-1].wasconst=0;
8759           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8760           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8761           alloc_cc(&branch_regs[i-1],i-1);
8762           dirty_reg(&branch_regs[i-1],CCREG);
8763           if(rt1[i-1]==31) { // JAL
8764             alloc_reg(&branch_regs[i-1],i-1,31);
8765             dirty_reg(&branch_regs[i-1],31);
8766             branch_regs[i-1].is32|=1LL<<31;
8767           }
8768           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8769           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8770           break;
8771         case RJUMP:
8772           memcpy(&branch_regs[i-1],&current,sizeof(current));
8773           branch_regs[i-1].isconst=0;
8774           branch_regs[i-1].wasconst=0;
8775           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8776           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8777           alloc_cc(&branch_regs[i-1],i-1);
8778           dirty_reg(&branch_regs[i-1],CCREG);
8779           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8780           if(rt1[i-1]==31) { // JALR
8781             alloc_reg(&branch_regs[i-1],i-1,31);
8782             dirty_reg(&branch_regs[i-1],31);
8783             branch_regs[i-1].is32|=1LL<<31;
8784           }
8785           #ifdef USE_MINI_HT
8786           if(rs1[i-1]==31) { // JALR
8787             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8788             #ifndef HOST_IMM_ADDR32
8789             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8790             #endif
8791           }
8792           #endif
8793           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8794           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8795           break;
8796         case CJUMP:
8797           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8798           {
8799             alloc_cc(&current,i-1);
8800             dirty_reg(&current,CCREG);
8801             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8802                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8803               // The delay slot overwrote one of our conditions
8804               // Delay slot goes after the test (in order)
8805               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8806               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8807               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8808               current.u|=1;
8809               current.uu|=1;
8810               delayslot_alloc(&current,i);
8811               current.isconst=0;
8812             }
8813             else
8814             {
8815               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8816               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8817               // Alloc the branch condition registers
8818               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8819               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8820               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8821               {
8822                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8823                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8824               }
8825             }
8826             memcpy(&branch_regs[i-1],&current,sizeof(current));
8827             branch_regs[i-1].isconst=0;
8828             branch_regs[i-1].wasconst=0;
8829             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8830             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8831           }
8832           else
8833           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8834           {
8835             alloc_cc(&current,i-1);
8836             dirty_reg(&current,CCREG);
8837             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8838               // The delay slot overwrote the branch condition
8839               // Delay slot goes after the test (in order)
8840               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8841               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8842               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8843               current.u|=1;
8844               current.uu|=1;
8845               delayslot_alloc(&current,i);
8846               current.isconst=0;
8847             }
8848             else
8849             {
8850               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8851               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8852               // Alloc the branch condition register
8853               alloc_reg(&current,i-1,rs1[i-1]);
8854               if(!(current.is32>>rs1[i-1]&1))
8855               {
8856                 alloc_reg64(&current,i-1,rs1[i-1]);
8857               }
8858             }
8859             memcpy(&branch_regs[i-1],&current,sizeof(current));
8860             branch_regs[i-1].isconst=0;
8861             branch_regs[i-1].wasconst=0;
8862             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8863             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8864           }
8865           else
8866           // Alloc the delay slot in case the branch is taken
8867           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8868           {
8869             memcpy(&branch_regs[i-1],&current,sizeof(current));
8870             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8871             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8872             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8873             alloc_cc(&branch_regs[i-1],i);
8874             dirty_reg(&branch_regs[i-1],CCREG);
8875             delayslot_alloc(&branch_regs[i-1],i);
8876             branch_regs[i-1].isconst=0;
8877             alloc_reg(&current,i,CCREG); // Not taken path
8878             dirty_reg(&current,CCREG);
8879             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8880           }
8881           else
8882           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8883           {
8884             memcpy(&branch_regs[i-1],&current,sizeof(current));
8885             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8886             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8887             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8888             alloc_cc(&branch_regs[i-1],i);
8889             dirty_reg(&branch_regs[i-1],CCREG);
8890             delayslot_alloc(&branch_regs[i-1],i);
8891             branch_regs[i-1].isconst=0;
8892             alloc_reg(&current,i,CCREG); // Not taken path
8893             dirty_reg(&current,CCREG);
8894             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8895           }
8896           break;
8897         case SJUMP:
8898           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8899           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8900           {
8901             alloc_cc(&current,i-1);
8902             dirty_reg(&current,CCREG);
8903             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8904               // The delay slot overwrote the branch condition
8905               // Delay slot goes after the test (in order)
8906               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8907               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8908               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8909               current.u|=1;
8910               current.uu|=1;
8911               delayslot_alloc(&current,i);
8912               current.isconst=0;
8913             }
8914             else
8915             {
8916               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8917               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8918               // Alloc the branch condition register
8919               alloc_reg(&current,i-1,rs1[i-1]);
8920               if(!(current.is32>>rs1[i-1]&1))
8921               {
8922                 alloc_reg64(&current,i-1,rs1[i-1]);
8923               }
8924             }
8925             memcpy(&branch_regs[i-1],&current,sizeof(current));
8926             branch_regs[i-1].isconst=0;
8927             branch_regs[i-1].wasconst=0;
8928             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8929             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8930           }
8931           else
8932           // Alloc the delay slot in case the branch is taken
8933           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8934           {
8935             memcpy(&branch_regs[i-1],&current,sizeof(current));
8936             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8937             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8938             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8939             alloc_cc(&branch_regs[i-1],i);
8940             dirty_reg(&branch_regs[i-1],CCREG);
8941             delayslot_alloc(&branch_regs[i-1],i);
8942             branch_regs[i-1].isconst=0;
8943             alloc_reg(&current,i,CCREG); // Not taken path
8944             dirty_reg(&current,CCREG);
8945             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8946           }
8947           // FIXME: BLTZAL/BGEZAL
8948           if(opcode2[i-1]&0x10) { // BxxZAL
8949             alloc_reg(&branch_regs[i-1],i-1,31);
8950             dirty_reg(&branch_regs[i-1],31);
8951             branch_regs[i-1].is32|=1LL<<31;
8952           }
8953           break;
8954         case FJUMP:
8955           if(likely[i-1]==0) // BC1F/BC1T
8956           {
8957             alloc_cc(&current,i-1);
8958             dirty_reg(&current,CCREG);
8959             if(itype[i]==FCOMP) {
8960               // The delay slot overwrote the branch condition
8961               // Delay slot goes after the test (in order)
8962               delayslot_alloc(&current,i);
8963               current.isconst=0;
8964             }
8965             else
8966             {
8967               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8968               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8969               // Alloc the branch condition register
8970               alloc_reg(&current,i-1,FSREG);
8971             }
8972             memcpy(&branch_regs[i-1],&current,sizeof(current));
8973             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8974           }
8975           else // BC1FL/BC1TL
8976           {
8977             // Alloc the delay slot in case the branch is taken
8978             memcpy(&branch_regs[i-1],&current,sizeof(current));
8979             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8980             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8981             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8982             alloc_cc(&branch_regs[i-1],i);
8983             dirty_reg(&branch_regs[i-1],CCREG);
8984             delayslot_alloc(&branch_regs[i-1],i);
8985             branch_regs[i-1].isconst=0;
8986             alloc_reg(&current,i,CCREG); // Not taken path
8987             dirty_reg(&current,CCREG);
8988             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8989           }
8990           break;
8991       }
8992
8993       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8994       {
8995         if(rt1[i-1]==31) // JAL/JALR
8996         {
8997           // Subroutine call will return here, don't alloc any registers
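               // Execution returns here from the called block, which may have
               // used the host registers arbitrarily, so nothing is assumed;
               // only the cycle count (CCREG) is re-established below.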
8998           current.is32=1;
8999           current.dirty=0;
9000           clear_all_regs(current.regmap);
9001           alloc_reg(&current,i,CCREG);
9002           dirty_reg(&current,CCREG);
9003         }
9004         else if(i+1<slen)
9005         {
9006           // Internal branch will jump here, match registers to caller
9007           current.is32=0x3FFFFFFFFLL;
9008           current.dirty=0;
9009           clear_all_regs(current.regmap);
9010           alloc_reg(&current,i,CCREG);
9011           dirty_reg(&current,CCREG);
9012           for(j=i-1;j>=0;j--)
9013           {
9014             if(ba[j]==start+i*4+4) {
9015               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9016               current.is32=branch_regs[j].is32;
9017               current.dirty=branch_regs[j].dirty;
9018               break;
9019             }
9020           }
9021           while(j>=0) {
9022             if(ba[j]==start+i*4+4) {
9023               for(hr=0;hr<HOST_REGS;hr++) {
9024                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9025                   current.regmap[hr]=-1;
9026                 }
9027                 current.is32&=branch_regs[j].is32;
9028                 current.dirty&=branch_regs[j].dirty;
9029               }
9030             }
9031             j--;
9032           }
9033         }
9034       }
9035     }
9036
9037     // Count cycles in between branches
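         // ccadj[i] holds the number of instructions executed since the last
         // branch or SYSCALL when instruction i is reached; presumably this
         // is what gets added to the cycle counter (scaled by CLOCK_DIVIDER)
         // when the next branch or exception is assembled.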
9038     ccadj[i]=cc;
9039     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL))
9040     {
9041       cc=0;
9042     }
9043     else
9044     {
9045       cc++;
9046     }
9047
9048     flush_dirty_uppers(&current);
9049     if(!is_ds[i]) {
9050       regs[i].is32=current.is32;
9051       regs[i].dirty=current.dirty;
9052       regs[i].isconst=current.isconst;
9053       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9054     }
9055     for(hr=0;hr<HOST_REGS;hr++) {
9056       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9057         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9058           regs[i].wasconst&=~(1<<hr);
9059         }
9060       }
9061     }
9062     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9063   }
9064   
9065   /* Pass 4 - Cull unused host registers */
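       // This pass walks the block backwards building needed_reg[i], a bitmask
       // over host registers: a set bit means the register still holds a value
       // that a later instruction (or a branch target) will read.  Registers
       // whose bits are clear are deallocated so later passes can reuse them.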
9066   
9067   uint64_t nr=0;
9068   
9069   for (i=slen-1;i>=0;i--)
9070   {
9071     int hr;
9072     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9073     {
9074       if(ba[i]<start || ba[i]>=(start+slen*4))
9075       {
9076         // Branch out of this block, don't need anything
9077         nr=0;
9078       }
9079       else
9080       {
9081         // Internal branch
9082         // Need whatever matches the target
9083         nr=0;
9084         int t=(ba[i]-start)>>2;
9085         for(hr=0;hr<HOST_REGS;hr++)
9086         {
9087           if(regs[i].regmap_entry[hr]>=0) {
9088             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9089           }
9090         }
9091       }
9092       // Conditional branch may need registers for following instructions
9093       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9094       {
9095         if(i<slen-2) {
9096           nr|=needed_reg[i+2];
9097           for(hr=0;hr<HOST_REGS;hr++)
9098           {
9099             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9100             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9101           }
9102         }
9103       }
9104       // Merge in delay slot
9105       for(hr=0;hr<HOST_REGS;hr++)
9106       {
9107         // Don't need stuff which is overwritten
9108         if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9109         if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9110         if(!likely[i]) {
9111           // These are overwritten unless the branch is "likely"
9112           // and the delay slot is nullified if not taken
9113           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9114           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9115         }
9116         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9117         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9118         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9119         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9120         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9121         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9122         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9123         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9124         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9125           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9126           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9127         }
9128         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9129           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9130           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9131         }
9132         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9133           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9134           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9135         }
9136       }
9137     }
9138     else if(itype[i]==SYSCALL)
9139     {
9140       // SYSCALL instruction (software interrupt)
9141       nr=0;
9142     }
9143     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9144     {
9145       // ERET instruction (return from interrupt)
9146       nr=0;
9147     }
9148     else // Non-branch
9149     {
9150       if(i<slen-1) {
9151         for(hr=0;hr<HOST_REGS;hr++) {
9152           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9153           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9154           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9155           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9156         }
9157       }
9158     }
9159     for(hr=0;hr<HOST_REGS;hr++)
9160     {
9161       // Overwritten registers are not needed
9162       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9163       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9164       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9165       // Source registers are needed
9166       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9167       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9168       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9169       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9170       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9171       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9172       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9173       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9174       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9175         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9176         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9177       }
9178       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9179         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9180         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9181       }
9182       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9183         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9184         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9185       }
9186       // Don't store a register immediately after writing it;
9187       // doing so may prevent dual-issue.
9188       // But do so if this is a branch target, otherwise we
9189       // might have to load the register before the branch.
9190       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9191         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9192            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9193           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9194           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9195         }
9196         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9197            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9198           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9199           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9200         }
9201       }
9202     }
9203     // Cycle count is needed at branches.  Assume it is needed at the target too.
9204     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9205       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9206       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9207     }
9208     // Save it
9209     needed_reg[i]=nr;
9210     
9211     // Deallocate unneeded registers
9212     for(hr=0;hr<HOST_REGS;hr++)
9213     {
9214       if(!((nr>>hr)&1)) {
9215         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9216         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9217            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9218            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9219         {
9220           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9221           {
9222             if(likely[i]) {
9223               regs[i].regmap[hr]=-1;
9224               regs[i].isconst&=~(1<<hr);
9225               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9226             }
9227           }
9228         }
9229         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9230         {
9231           int d1=0,d2=0,map=0,temp=0;
9232           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9233           {
9234             d1=dep1[i+1];
9235             d2=dep2[i+1];
9236           }
9237           if(using_tlb) {
9238             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9239                itype[i+1]==STORE || itype[i+1]==STORELR ||
9240                itype[i+1]==C1LS )
9241             map=TLREG;
9242           } else
9243           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9244             map=INVCP;
9245           }
9246           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9247              itype[i+1]==C1LS )
9248             temp=FTEMP;
9249           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9250              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9251              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9252              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9253              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9254              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9255              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9256              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9257              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9258              regs[i].regmap[hr]!=map )
9259           {
9260             regs[i].regmap[hr]=-1;
9261             regs[i].isconst&=~(1<<hr);
9262             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9263                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9264                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9265                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9266                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9267                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9268                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9269                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9270                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9271                branch_regs[i].regmap[hr]!=map)
9272             {
9273               branch_regs[i].regmap[hr]=-1;
9274               branch_regs[i].regmap_entry[hr]=-1;
9275               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9276               {
9277                 if(!likely[i]&&i<slen-2) {
9278                   regmap_pre[i+2][hr]=-1;
9279                 }
9280               }
9281             }
9282           }
9283         }
9284         else
9285         {
9286           // Non-branch
9287           if(i>0)
9288           {
9289             int d1=0,d2=0,map=-1,temp=-1;
9290             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9291             {
9292               d1=dep1[i];
9293               d2=dep2[i];
9294             }
9295             if(using_tlb) {
9296               if(itype[i]==LOAD || itype[i]==LOADLR ||
9297                  itype[i]==STORE || itype[i]==STORELR ||
9298                  itype[i]==C1LS )
9299               map=TLREG;
9300             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9301               map=INVCP;
9302             }
9303             if(itype[i]==LOADLR || itype[i]==STORELR ||
9304                itype[i]==C1LS )
9305               temp=FTEMP;
9306             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9307                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9308                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9309                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9310                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9311                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9312             {
9313               if(i<slen-1&&!is_ds[i]) {
9314                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9315                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9316                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9317                 {
9318                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9319                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9320                 }
9321                 regmap_pre[i+1][hr]=-1;
9322                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9323               }
9324               regs[i].regmap[hr]=-1;
9325               regs[i].isconst&=~(1<<hr);
9326             }
9327           }
9328         }
9329       }
9330     }
9331   }
9332   
9333   /* Pass 5 - Pre-allocate registers */
9334   
9335   // If a register is allocated during a loop, try to allocate it for the
9336   // entire loop, if possible.  This avoids loading/storing registers
9337   // inside the loop.
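  // f_regmap records, per host register, the guest register we would like to
  // keep in it.  When a backward branch (a loop within this block) is found,
  // the scan below walks from the branch target t up to the branch i; if the
  // same mapping is live at both ends it is extended over the whole range,
  // updating regmap_entry/regmap/regmap_pre so the value never leaves the
  // host register inside the loop.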
9338
9339   signed char f_regmap[HOST_REGS];
9340   clear_all_regs(f_regmap);
9341   for(i=0;i<slen-1;i++)
9342   {
9343     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9344     {
9345       if(ba[i]>=start && ba[i]<(start+i*4)) 
9346       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9347       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9348       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9349       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9350       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9351       {
9352         int t=(ba[i]-start)>>2;
9353         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9354         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9355         for(hr=0;hr<HOST_REGS;hr++)
9356         {
9357           if(regs[i].regmap[hr]>64) {
9358             if(!((regs[i].dirty>>hr)&1))
9359               f_regmap[hr]=regs[i].regmap[hr];
9360             else f_regmap[hr]=-1;
9361           }
9362           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9363           if(branch_regs[i].regmap[hr]>64) {
9364             if(!((branch_regs[i].dirty>>hr)&1))
9365               f_regmap[hr]=branch_regs[i].regmap[hr];
9366             else f_regmap[hr]=-1;
9367           }
9368           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9369           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9370           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9371           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9372           {
9373                     // Test both in case the delay slot is out-of-order;
9374                     // this could be done better...
9375             if(count_free_regs(branch_regs[i].regmap)<2
9376              ||count_free_regs(regs[i].regmap)<2) 
9377               f_regmap[hr]=branch_regs[i].regmap[hr];
9378           }
9379           // Avoid dirty->clean transition
9380           // #ifdef DESTRUCTIVE_WRITEBACK here?
9381           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9382           if(f_regmap[hr]>0) {
9383             if(regs[t].regmap_entry[hr]<0) {
9384               int r=f_regmap[hr];
9385               for(j=t;j<=i;j++)
9386               {
9387                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9388                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9389                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9390                 if(r>63) {
9391                   // NB This can exclude the case where the upper-half
9392                   // register is lower numbered than the lower-half
9393                   // register.  Not sure if it's worth fixing...
9394                   if(get_reg(regs[j].regmap,r&63)<0) break;
9395                   if(regs[j].is32&(1LL<<(r&63))) break;
9396                 }
9397                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9398                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9399                   int k;
9400                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9401                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9402                     if(r>63) {
9403                       if(get_reg(regs[i].regmap,r&63)<0) break;
9404                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9405                     }
9406                     k=i;
9407                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9408                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9409                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9410                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9411                       ||itype[k-1]==FCOMP) {
9412                         if(count_free_regs(regs[k-1].regmap)<2) {
9413                           //printf("no free regs for store %x\n",start+(k-1)*4);
9414                           break;
9415                         }
9416                       }
9417                       else
9418                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9419                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9420                         //printf("no-match due to different register\n");
9421                         break;
9422                       }
9423                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9424                         //printf("no-match due to branch\n");
9425                         break;
9426                       }
9427                       // call/ret fast path assumes no registers allocated
9428                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9429                         break;
9430                       }
9431                       if(r>63) {
9432                         // NB This can exclude the case where the upper-half
9433                         // register is lower numbered than the lower-half
9434                         // register.  Not sure if it's worth fixing...
9435                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9436                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9437                       }
9438                       k--;
9439                     }
9440                     if(i<slen-1) {
9441                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9442                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9443                         //printf("bad match after branch\n");
9444                         break;
9445                       }
9446                     }
9447                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9448                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9449                       while(k<i) {
9450                         regs[k].regmap_entry[hr]=f_regmap[hr];
9451                         regs[k].regmap[hr]=f_regmap[hr];
9452                         regmap_pre[k+1][hr]=f_regmap[hr];
9453                         regs[k].wasdirty&=~(1<<hr);
9454                         regs[k].dirty&=~(1<<hr);
9455                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9456                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9457                         regs[k].wasconst&=~(1<<hr);
9458                         regs[k].isconst&=~(1<<hr);
9459                         k++;
9460                       }
9461                     }
9462                     else {
9463                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9464                       break;
9465                     }
9466                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9467                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9468                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9469                       regs[i].regmap_entry[hr]=f_regmap[hr];
9470                       regs[i].regmap[hr]=f_regmap[hr];
9471                       regs[i].wasdirty&=~(1<<hr);
9472                       regs[i].dirty&=~(1<<hr);
9473                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9474                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9475                       regs[i].wasconst&=~(1<<hr);
9476                       regs[i].isconst&=~(1<<hr);
9477                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9478                       branch_regs[i].wasdirty&=~(1<<hr);
9479                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9480                       branch_regs[i].regmap[hr]=f_regmap[hr];
9481                       branch_regs[i].dirty&=~(1<<hr);
9482                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9483                       branch_regs[i].wasconst&=~(1<<hr);
9484                       branch_regs[i].isconst&=~(1<<hr);
9485                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9486                         regmap_pre[i+2][hr]=f_regmap[hr];
9487                         regs[i+2].wasdirty&=~(1<<hr);
9488                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9489                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9490                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9491                       }
9492                     }
9493                   }
9494                   for(k=t;k<j;k++) {
9495                     regs[k].regmap_entry[hr]=f_regmap[hr];
9496                     regs[k].regmap[hr]=f_regmap[hr];
9497                     regmap_pre[k+1][hr]=f_regmap[hr];
9498                     regs[k+1].wasdirty&=~(1<<hr);
9499                     regs[k].dirty&=~(1<<hr);
9500                     regs[k].wasconst&=~(1<<hr);
9501                     regs[k].isconst&=~(1<<hr);
9502                   }
9503                   if(regs[j].regmap[hr]==f_regmap[hr])
9504                     regs[j].regmap_entry[hr]=f_regmap[hr];
9505                   break;
9506                 }
9507                 if(j==i) break;
9508                 if(regs[j].regmap[hr]>=0)
9509                   break;
9510                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9511                   //printf("no-match due to different register\n");
9512                   break;
9513                 }
9514                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9515                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9516                   break;
9517                 }
9518                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9519                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9520                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9521                   if(count_free_regs(regs[j].regmap)<2) {
9522                     //printf("No free regs for store %x\n",start+j*4);
9523                     break;
9524                   }
9525                 }
9526                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9527                 if(f_regmap[hr]>=64) {
9528                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9529                     break;
9530                   }
9531                   else
9532                   {
9533                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9534                       break;
9535                     }
9536                   }
9537                 }
9538               }
9539             }
9540           }
9541         }
9542       }
9543     }else{
9544       int count=0;
9545       for(hr=0;hr<HOST_REGS;hr++)
9546       {
9547         if(hr!=EXCLUDE_REG) {
9548           if(regs[i].regmap[hr]>64) {
9549             if(!((regs[i].dirty>>hr)&1))
9550               f_regmap[hr]=regs[i].regmap[hr];
9551           }
9552           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9553           else if(regs[i].regmap[hr]<0) count++;
9554         }
9555       }
9556       // Try to restore cycle count at branch targets
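      // Scan forward to the next instruction that already has the cycle count
      // in HOST_CCREG; if nothing in between needs too many free registers,
      // extend the CCREG allocation forward from this branch target (and,
      // further below, backwards from it) so the count need not be reloaded.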
9557       if(bt[i]) {
9558         for(j=i;j<slen-1;j++) {
9559           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9560           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9561           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9562           ||itype[j]==FCOMP||itype[j]==FCONV) {
9563             if(count_free_regs(regs[j].regmap)<2) {
9564               //printf("no free regs for store %x\n",start+j*4);
9565               break;
9566             }
9567           }
9568           else
9569           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9570         }
9571         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9572           int k=i;
9573           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9574           while(k<j) {
9575             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9576             regs[k].regmap[HOST_CCREG]=CCREG;
9577             regmap_pre[k+1][HOST_CCREG]=CCREG;
9578             regs[k+1].wasdirty|=1<<HOST_CCREG;
9579             regs[k].dirty|=1<<HOST_CCREG;
9580             regs[k].wasconst&=~(1<<HOST_CCREG);
9581             regs[k].isconst&=~(1<<HOST_CCREG);
9582             k++;
9583           }
9584           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9585         }
9586         // Work backwards from the branch target
9587         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9588         {
9589           //printf("Extend backwards\n");
9590           int k;
9591           k=i;
9592           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9593             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9594             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9595             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9596               if(count_free_regs(regs[k-1].regmap)<2) {
9597                 //printf("no free regs for store %x\n",start+(k-1)*4);
9598                 break;
9599               }
9600             }
9601             else
9602             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9603             k--;
9604           }
9605           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9606             //printf("Extend CC, %x ->\n",start+k*4);
9607             while(k<=i) {
9608               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9609               regs[k].regmap[HOST_CCREG]=CCREG;
9610               regmap_pre[k+1][HOST_CCREG]=CCREG;
9611               regs[k+1].wasdirty|=1<<HOST_CCREG;
9612               regs[k].dirty|=1<<HOST_CCREG;
9613               regs[k].wasconst&=~(1<<HOST_CCREG);
9614               regs[k].isconst&=~(1<<HOST_CCREG);
9615               k++;
9616             }
9617           }
9618           else {
9619             //printf("Fail Extend CC, %x ->\n",start+k*4);
9620           }
9621         }
9622       }
9623       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9624          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9625          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9626          itype[i]!=FCONV&&itype[i]!=FCOMP)
9627       {
9628         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9629       }
9630     }
9631   }
9632   
9633   // This allocates registers (if possible) one instruction prior
9634   // to use, which can avoid a load-use penalty on certain CPUs.
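  // Concretely: if a source of the next instruction (or, for loads and
  // stores, its address or temporary register) is mapped to a host register
  // that is unused by the current instruction, the mapping is copied one
  // slot earlier (into regs[i].regmap and regs[i+1].regmap_entry) so the
  // value is already in place when it is needed.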
9635   for(i=0;i<slen-1;i++)
9636   {
9637     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9638     {
9639       if(!bt[i+1])
9640       {
9641         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9642         {
9643           if(rs1[i+1]) {
9644             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9645             {
9646               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9647               {
9648                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9649                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9650                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9651                 regs[i].isconst&=~(1<<hr);
9652                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9653                 constmap[i][hr]=constmap[i+1][hr];
9654                 regs[i+1].wasdirty&=~(1<<hr);
9655                 regs[i].dirty&=~(1<<hr);
9656               }
9657             }
9658           }
9659           if(rs2[i+1]) {
9660             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9661             {
9662               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9663               {
9664                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9665                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9666                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9667                 regs[i].isconst&=~(1<<hr);
9668                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9669                 constmap[i][hr]=constmap[i+1][hr];
9670                 regs[i+1].wasdirty&=~(1<<hr);
9671                 regs[i].dirty&=~(1<<hr);
9672               }
9673             }
9674           }
9675           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9676             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9677             {
9678               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9679               {
9680                 regs[i].regmap[hr]=rs1[i+1];
9681                 regmap_pre[i+1][hr]=rs1[i+1];
9682                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9683                 regs[i].isconst&=~(1<<hr);
9684                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9685                 constmap[i][hr]=constmap[i+1][hr];
9686                 regs[i+1].wasdirty&=~(1<<hr);
9687                 regs[i].dirty&=~(1<<hr);
9688               }
9689             }
9690           }
9691           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9692             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9693             {
9694               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9695               {
9696                 regs[i].regmap[hr]=rs1[i+1];
9697                 regmap_pre[i+1][hr]=rs1[i+1];
9698                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9699                 regs[i].isconst&=~(1<<hr);
9700                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9701                 constmap[i][hr]=constmap[i+1][hr];
9702                 regs[i+1].wasdirty&=~(1<<hr);
9703                 regs[i].dirty&=~(1<<hr);
9704               }
9705             }
9706           }
9707           #ifndef HOST_IMM_ADDR32
9708           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9709             hr=get_reg(regs[i+1].regmap,TLREG);
9710             if(hr>=0) {
9711               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9712               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9713                 int nr;
9714                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9715                 {
9716                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9717                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9718                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9719                   regs[i].isconst&=~(1<<hr);
9720                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9721                   constmap[i][hr]=constmap[i+1][hr];
9722                   regs[i+1].wasdirty&=~(1<<hr);
9723                   regs[i].dirty&=~(1<<hr);
9724                 }
9725                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9726                 {
9727                   // move it to another register
9728                   regs[i+1].regmap[hr]=-1;
9729                   regmap_pre[i+2][hr]=-1;
9730                   regs[i+1].regmap[nr]=TLREG;
9731                   regmap_pre[i+2][nr]=TLREG;
9732                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9733                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9734                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9735                   regs[i].isconst&=~(1<<nr);
9736                   regs[i+1].isconst&=~(1<<nr);
9737                   regs[i].dirty&=~(1<<nr);
9738                   regs[i+1].wasdirty&=~(1<<nr);
9739                   regs[i+1].dirty&=~(1<<nr);
9740                   regs[i+2].wasdirty&=~(1<<nr);
9741                 }
9742               }
9743             }
9744           }
9745           #endif
9746           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9747             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9748               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9749               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9750               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9751               assert(hr>=0);
9752               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9753               {
9754                 regs[i].regmap[hr]=rs1[i+1];
9755                 regmap_pre[i+1][hr]=rs1[i+1];
9756                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9757                 regs[i].isconst&=~(1<<hr);
9758                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9759                 constmap[i][hr]=constmap[i+1][hr];
9760                 regs[i+1].wasdirty&=~(1<<hr);
9761                 regs[i].dirty&=~(1<<hr);
9762               }
9763             }
9764           }
9765           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9766             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9767               int nr;
9768               hr=get_reg(regs[i+1].regmap,FTEMP);
9769               assert(hr>=0);
9770               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9771               {
9772                 regs[i].regmap[hr]=rs1[i+1];
9773                 regmap_pre[i+1][hr]=rs1[i+1];
9774                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9775                 regs[i].isconst&=~(1<<hr);
9776                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9777                 constmap[i][hr]=constmap[i+1][hr];
9778                 regs[i+1].wasdirty&=~(1<<hr);
9779                 regs[i].dirty&=~(1<<hr);
9780               }
9781               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9782               {
9783                 // move it to another register
9784                 regs[i+1].regmap[hr]=-1;
9785                 regmap_pre[i+2][hr]=-1;
9786                 regs[i+1].regmap[nr]=FTEMP;
9787                 regmap_pre[i+2][nr]=FTEMP;
9788                 regs[i].regmap[nr]=rs1[i+1];
9789                 regmap_pre[i+1][nr]=rs1[i+1];
9790                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9791                 regs[i].isconst&=~(1<<nr);
9792                 regs[i+1].isconst&=~(1<<nr);
9793                 regs[i].dirty&=~(1<<nr);
9794                 regs[i+1].wasdirty&=~(1<<nr);
9795                 regs[i+1].dirty&=~(1<<nr);
9796                 regs[i+2].wasdirty&=~(1<<nr);
9797               }
9798             }
9799           }
9800           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9801             if(itype[i+1]==LOAD) 
9802               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9803             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9804               hr=get_reg(regs[i+1].regmap,FTEMP);
9805             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9806               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9807               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9808             }
9809             if(hr>=0&&regs[i].regmap[hr]<0) {
9810               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9811               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9812                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9813                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9814                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9815                 regs[i].isconst&=~(1<<hr);
9816                 regs[i+1].wasdirty&=~(1<<hr);
9817                 regs[i].dirty&=~(1<<hr);
9818               }
9819             }
9820           }
9821         }
9822       }
9823     }
9824   }
9825   
9826   /* Pass 6 - Optimize clean/dirty state */
9827   clean_registers(0,slen-1,1);
9828   
9829   /* Pass 7 - Identify 32-bit registers */
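  // Walk the block backwards computing requires_32bit[i]: a bitmask of guest
  // registers whose upper 32 bits must be valid at instruction i.  The
  // requirement propagates backwards from internal branch targets and from
  // instructions that use 64-bit sources (us1/us2) or have sign-extension
  // dependencies (dep1/dep2), and is cleared for overwritten targets
  // (rt1/rt2) and across SYSCALL/ERET.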
9830   
9831   provisional_r32();
9832
9833   u_int r32=0;
9834   
9835   for (i=slen-1;i>=0;i--)
9836   {
9837     int hr;
9838     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9839     {
9840       if(ba[i]<start || ba[i]>=(start+slen*4))
9841       {
9842         // Branch out of this block, don't need anything
9843         r32=0;
9844       }
9845       else
9846       {
9847         // Internal branch
9848         // Need whatever matches the target
9849         // (and doesn't get overwritten by the delay slot instruction)
9850         r32=0;
9851         int t=(ba[i]-start)>>2;
9852         if(ba[i]>start+i*4) {
9853           // Forward branch
9854           if(!(requires_32bit[t]&~regs[i].was32))
9855             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9856         }else{
9857           // Backward branch
9858           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9859           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9860           if(!(pr32[t]&~regs[i].was32))
9861             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9862         }
9863       }
9864       // Conditional branch may need registers for following instructions
9865       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9866       {
9867         if(i<slen-2) {
9868           r32|=requires_32bit[i+2];
9869           r32&=regs[i].was32;
9870           // Mark this address as a branch target since it may be called
9871           // upon return from an interrupt
9872           bt[i+2]=1;
9873         }
9874       }
9875       // Merge in delay slot
9876       if(!likely[i]) {
9877         // These are overwritten unless the branch is "likely"
9878         // and the delay slot is nullified if not taken
9879         r32&=~(1LL<<rt1[i+1]);
9880         r32&=~(1LL<<rt2[i+1]);
9881       }
9882       // Assume these are needed (delay slot)
9883       if(us1[i+1]>0)
9884       {
9885         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9886       }
9887       if(us2[i+1]>0)
9888       {
9889         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9890       }
9891       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9892       {
9893         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9894       }
9895       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9896       {
9897         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9898       }
9899     }
9900     else if(itype[i]==SYSCALL)
9901     {
9902       // SYSCALL instruction (software interrupt)
9903       r32=0;
9904     }
9905     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9906     {
9907       // ERET instruction (return from interrupt)
9908       r32=0;
9909     }
9910     // Check 32 bits
9911     r32&=~(1LL<<rt1[i]);
9912     r32&=~(1LL<<rt2[i]);
9913     if(us1[i]>0)
9914     {
9915       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9916     }
9917     if(us2[i]>0)
9918     {
9919       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9920     }
9921     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9922     {
9923       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
9924     }
9925     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
9926     {
9927       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
9928     }
9929     requires_32bit[i]=r32;
9930     
9931     // Dirty registers which are 32-bit require 32-bit input,
9932     // as they will be written as 32-bit values
9933     for(hr=0;hr<HOST_REGS;hr++)
9934     {
9935       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
9936         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
9937           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
9938           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
9939         }
9940       }
9941     }
9942     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
9943   }
9944
9945   if(itype[slen-1]==SPAN) {
9946     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9947   }
9948   
9949   /* Debug/disassembly */
9950   if((void*)assem_debug==(void*)printf) 
9951   for(i=0;i<slen;i++)
9952   {
9953     printf("U:");
9954     int r;
9955     for(r=1;r<=CCREG;r++) {
9956       if((unneeded_reg[i]>>r)&1) {
9957         if(r==HIREG) printf(" HI");
9958         else if(r==LOREG) printf(" LO");
9959         else printf(" r%d",r);
9960       }
9961     }
9962 #ifndef FORCE32
9963     printf(" UU:");
9964     for(r=1;r<=CCREG;r++) {
9965       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
9966         if(r==HIREG) printf(" HI");
9967         else if(r==LOREG) printf(" LO");
9968         else printf(" r%d",r);
9969       }
9970     }
9971     printf(" 32:");
9972     for(r=0;r<=CCREG;r++) {
9973       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9974       if((regs[i].was32>>r)&1) {
9975         if(r==CCREG) printf(" CC");
9976         else if(r==HIREG) printf(" HI");
9977         else if(r==LOREG) printf(" LO");
9978         else printf(" r%d",r);
9979       }
9980     }
9981 #endif
9982     printf("\n");
9983     #if defined(__i386__) || defined(__x86_64__)
9984     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9985     #endif
9986     #ifdef __arm__
9987     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9988     #endif
9989     printf("needs: ");
9990     if(needed_reg[i]&1) printf("eax ");
9991     if((needed_reg[i]>>1)&1) printf("ecx ");
9992     if((needed_reg[i]>>2)&1) printf("edx ");
9993     if((needed_reg[i]>>3)&1) printf("ebx ");
9994     if((needed_reg[i]>>5)&1) printf("ebp ");
9995     if((needed_reg[i]>>6)&1) printf("esi ");
9996     if((needed_reg[i]>>7)&1) printf("edi ");
9997     printf("r:");
9998     for(r=0;r<=CCREG;r++) {
9999       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10000       if((requires_32bit[i]>>r)&1) {
10001         if(r==CCREG) printf(" CC");
10002         else if(r==HIREG) printf(" HI");
10003         else if(r==LOREG) printf(" LO");
10004         else printf(" r%d",r);
10005       }
10006     }
10007     printf("\n");
10008     /*printf("pr:");
10009     for(r=0;r<=CCREG;r++) {
10010       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10011       if((pr32[i]>>r)&1) {
10012         if(r==CCREG) printf(" CC");
10013         else if(r==HIREG) printf(" HI");
10014         else if(r==LOREG) printf(" LO");
10015         else printf(" r%d",r);
10016       }
10017     }
10018     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10019     printf("\n");*/
10020     #if defined(__i386__) || defined(__x86_64__)
10021     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10022     printf("dirty: ");
10023     if(regs[i].wasdirty&1) printf("eax ");
10024     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10025     if((regs[i].wasdirty>>2)&1) printf("edx ");
10026     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10027     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10028     if((regs[i].wasdirty>>6)&1) printf("esi ");
10029     if((regs[i].wasdirty>>7)&1) printf("edi ");
10030     #endif
10031     #ifdef __arm__
10032     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10033     printf("dirty: ");
10034     if(regs[i].wasdirty&1) printf("r0 ");
10035     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10036     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10037     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10038     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10039     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10040     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10041     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10042     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10043     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10044     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10045     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10046     #endif
10047     printf("\n");
10048     disassemble_inst(i);
10049     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10050     #if defined(__i386__) || defined(__x86_64__)
10051     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10052     if(regs[i].dirty&1) printf("eax ");
10053     if((regs[i].dirty>>1)&1) printf("ecx ");
10054     if((regs[i].dirty>>2)&1) printf("edx ");
10055     if((regs[i].dirty>>3)&1) printf("ebx ");
10056     if((regs[i].dirty>>5)&1) printf("ebp ");
10057     if((regs[i].dirty>>6)&1) printf("esi ");
10058     if((regs[i].dirty>>7)&1) printf("edi ");
10059     #endif
10060     #ifdef __arm__
10061     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10062     if(regs[i].dirty&1) printf("r0 ");
10063     if((regs[i].dirty>>1)&1) printf("r1 ");
10064     if((regs[i].dirty>>2)&1) printf("r2 ");
10065     if((regs[i].dirty>>3)&1) printf("r3 ");
10066     if((regs[i].dirty>>4)&1) printf("r4 ");
10067     if((regs[i].dirty>>5)&1) printf("r5 ");
10068     if((regs[i].dirty>>6)&1) printf("r6 ");
10069     if((regs[i].dirty>>7)&1) printf("r7 ");
10070     if((regs[i].dirty>>8)&1) printf("r8 ");
10071     if((regs[i].dirty>>9)&1) printf("r9 ");
10072     if((regs[i].dirty>>10)&1) printf("r10 ");
10073     if((regs[i].dirty>>12)&1) printf("r12 ");
10074     #endif
10075     printf("\n");
10076     if(regs[i].isconst) {
10077       printf("constants: ");
10078       #if defined(__i386__) || defined(__x86_64__)
10079       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10080       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10081       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10082       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10083       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10084       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10085       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10086       #endif
10087       #ifdef __arm__
10088       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10089       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10090       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10091       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10092       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10093       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10094       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10095       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10096       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10097       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10098       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10099       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10100       #endif
10101       printf("\n");
10102     }
10103 #ifndef FORCE32
10104     printf(" 32:");
10105     for(r=0;r<=CCREG;r++) {
10106       if((regs[i].is32>>r)&1) {
10107         if(r==CCREG) printf(" CC");
10108         else if(r==HIREG) printf(" HI");
10109         else if(r==LOREG) printf(" LO");
10110         else printf(" r%d",r);
10111       }
10112     }
10113     printf("\n");
10114 #endif
10115     /*printf(" p32:");
10116     for(r=0;r<=CCREG;r++) {
10117       if((p32[i]>>r)&1) {
10118         if(r==CCREG) printf(" CC");
10119         else if(r==HIREG) printf(" HI");
10120         else if(r==LOREG) printf(" LO");
10121         else printf(" r%d",r);
10122       }
10123     }
10124     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10125     else printf("\n");*/
10126     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10127       #if defined(__i386__) || defined(__x86_64__)
10128       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10129       if(branch_regs[i].dirty&1) printf("eax ");
10130       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10131       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10132       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10133       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10134       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10135       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10136       #endif
10137       #ifdef __arm__
10138       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10139       if(branch_regs[i].dirty&1) printf("r0 ");
10140       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10141       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10142       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10143       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10144       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10145       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10146       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10147       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10148       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10149       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10150       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10151       #endif
10152 #ifndef FORCE32
10153       printf(" 32:");
10154       for(r=0;r<=CCREG;r++) {
10155         if((branch_regs[i].is32>>r)&1) {
10156           if(r==CCREG) printf(" CC");
10157           else if(r==HIREG) printf(" HI");
10158           else if(r==LOREG) printf(" LO");
10159           else printf(" r%d",r);
10160         }
10161       }
10162       printf("\n");
10163 #endif
10164     }
10165   }
10166
10167   /* Pass 8 - Assembly */
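  // For each instruction: write back or invalidate host registers whose
  // contents no longer match the incoming mapping, record the branch-target
  // entry point (instr_addr[i]), load the registers and constants the
  // instruction needs, then dispatch to the per-type assembler below.
  // Branch types set ds=1 so the next iteration skips the delay slot,
  // which is presumably emitted by the branch assembler itself.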
10168   linkcount=0;stubcount=0;
10169   ds=0;is_delayslot=0;
10170   cop1_usable=0;
10171   uint64_t is32_pre=0;
10172   u_int dirty_pre=0;
10173   u_int beginning=(u_int)out;
10174   if((u_int)addr&1) {
10175     ds=1;
10176     pagespan_ds();
10177   }
10178   for(i=0;i<slen;i++)
10179   {
10180     //if(ds) printf("ds: ");
10181     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10182     if(ds) {
10183       ds=0; // Skip delay slot
10184       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10185       instr_addr[i]=0;
10186     } else {
10187       #ifndef DESTRUCTIVE_WRITEBACK
10188       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10189       {
10190         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10191               unneeded_reg[i],unneeded_reg_upper[i]);
10192         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10193               unneeded_reg[i],unneeded_reg_upper[i]);
10194       }
10195       is32_pre=regs[i].is32;
10196       dirty_pre=regs[i].dirty;
10197       #endif
10198       // write back
10199       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10200       {
10201         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10202                       unneeded_reg[i],unneeded_reg_upper[i]);
10203         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10204       }
10205       // branch target entry point
10206       instr_addr[i]=(u_int)out;
10207       assem_debug("<->\n");
10208       // load regs
10209       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10210         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10211       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10212       address_generation(i,&regs[i],regs[i].regmap_entry);
10213       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10214       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10215       {
10216         // Load the delay slot registers if necessary
10217         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10218           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10219         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10220           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10221         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10222           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10223       }
10224       else if(i+1<slen)
10225       {
10226         // Preload registers for following instruction
10227         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10228           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10229             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10230         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10231           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10232             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10233       }
10234       // TODO: if(is_ooo(i)) address_generation(i+1);
10235       if(itype[i]==CJUMP||itype[i]==FJUMP)
10236         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10237       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10238         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10239       if(bt[i]) cop1_usable=0;
10240       // assemble
10241       switch(itype[i]) {
10242         case ALU:
10243           alu_assemble(i,&regs[i]);break;
10244         case IMM16:
10245           imm16_assemble(i,&regs[i]);break;
10246         case SHIFT:
10247           shift_assemble(i,&regs[i]);break;
10248         case SHIFTIMM:
10249           shiftimm_assemble(i,&regs[i]);break;
10250         case LOAD:
10251           load_assemble(i,&regs[i]);break;
10252         case LOADLR:
10253           loadlr_assemble(i,&regs[i]);break;
10254         case STORE:
10255           store_assemble(i,&regs[i]);break;
10256         case STORELR:
10257           storelr_assemble(i,&regs[i]);break;
10258         case COP0:
10259           cop0_assemble(i,&regs[i]);break;
10260         case COP1:
10261           cop1_assemble(i,&regs[i]);break;
10262         case C1LS:
10263           c1ls_assemble(i,&regs[i]);break;
10264         case FCONV:
10265           fconv_assemble(i,&regs[i]);break;
10266         case FLOAT:
10267           float_assemble(i,&regs[i]);break;
10268         case FCOMP:
10269           fcomp_assemble(i,&regs[i]);break;
10270         case MULTDIV:
10271           multdiv_assemble(i,&regs[i]);break;
10272         case MOV:
10273           mov_assemble(i,&regs[i]);break;
10274         case SYSCALL:
10275           syscall_assemble(i,&regs[i]);break;
10276         case UJUMP:
10277           ujump_assemble(i,&regs[i]);ds=1;break;
10278         case RJUMP:
10279           rjump_assemble(i,&regs[i]);ds=1;break;
10280         case CJUMP:
10281           cjump_assemble(i,&regs[i]);ds=1;break;
10282         case SJUMP:
10283           sjump_assemble(i,&regs[i]);ds=1;break;
10284         case FJUMP:
10285           fjump_assemble(i,&regs[i]);ds=1;break;
10286         case SPAN:
10287           pagespan_assemble(i,&regs[i]);break;
10288       }
10289       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10290         literal_pool(1024);
10291       else
10292         literal_pool_jumpover(256);
10293     }
10294   }
10295   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10296   // If the block did not end with an unconditional branch,
10297   // add a jump to the next instruction.
10298   if(i>1) {
10299     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10300       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10301       assert(i==slen);
10302       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10303         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10304         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10305           emit_loadreg(CCREG,HOST_CCREG);
10306         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10307       }
10308       else if(!likely[i-2])
10309       {
10310         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10311         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10312       }
10313       else
10314       {
10315         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10316         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10317       }
10318       add_to_linker((int)out,start+i*4,0);
10319       emit_jmp(0);
10320     }
10321   }
10322   else
10323   {
10324     assert(i>0);
10325     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10326     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10327     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10328       emit_loadreg(CCREG,HOST_CCREG);
10329     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10330     add_to_linker((int)out,start+i*4,0);
10331     emit_jmp(0);
10332   }
10333
10334   // TODO: delay slot stubs?
10335   // Stubs
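  // Stubs are out-of-line paths emitted after the main code; stubs[i][0]
  // holds the stub type, which selects the handler: read/write stubs for
  // memory access, do_ccstub for cycle counting, do_invstub for code
  // invalidation, do_cop1stub for COP1, do_unalignedwritestub for STORELR.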
10336   for(i=0;i<stubcount;i++)
10337   {
10338     switch(stubs[i][0])
10339     {
10340       case LOADB_STUB:
10341       case LOADH_STUB:
10342       case LOADW_STUB:
10343       case LOADD_STUB:
10344       case LOADBU_STUB:
10345       case LOADHU_STUB:
10346         do_readstub(i);break;
10347       case STOREB_STUB:
10348       case STOREH_STUB:
10349       case STOREW_STUB:
10350       case STORED_STUB:
10351         do_writestub(i);break;
10352       case CC_STUB:
10353         do_ccstub(i);break;
10354       case INVCODE_STUB:
10355         do_invstub(i);break;
10356       case FP_STUB:
10357         do_cop1stub(i);break;
10358       case STORELR_STUB:
10359         do_unalignedwritestub(i);break;
10360     }
10361   }
10362
10363   /* Pass 9 - Linker */
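  // Resolve branches recorded via add_to_linker().  link_addr[i][2]==0 marks
  // an external branch: an extjump stub is emitted and, if the target block
  // is already compiled (check_addr), the branch is patched straight to it
  // and the link is registered (add_link), presumably so it can be undone if
  // the target is invalidated.  Otherwise the branch is internal and is
  // patched directly to instr_addr[] of the target instruction.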
10364   for(i=0;i<linkcount;i++)
10365   {
10366     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10367     literal_pool(64);
10368     if(!link_addr[i][2])
10369     {
10370       void *stub=out;
10371       void *addr=check_addr(link_addr[i][1]);
10372       emit_extjump(link_addr[i][0],link_addr[i][1]);
10373       if(addr) {
10374         set_jump_target(link_addr[i][0],(int)addr);
10375         add_link(link_addr[i][1],stub);
10376       }
10377       else set_jump_target(link_addr[i][0],(int)stub);
10378     }
10379     else
10380     {
10381       // Internal branch
10382       int target=(link_addr[i][1]-start)>>2;
10383       assert(target>=0&&target<slen);
10384       assert(instr_addr[target]);
10385       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10386       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10387       //#else
10388       set_jump_target(link_addr[i][0],instr_addr[target]);
10389       //#endif
10390     }
10391   }
10392   // External Branch Targets (jump_in)
10393   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10394   for(i=0;i<slen;i++)
10395   {
10396     if(bt[i]||i==0)
10397     {
10398       if(instr_addr[i]) // TODO - delay slots (=null)
10399       {
10400         u_int vaddr=start+i*4;
10401         u_int page=get_page(vaddr);
10402         u_int vpage=get_vpage(vaddr);
10403         literal_pool(256);
10404         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10405         if(!requires_32bit[i])
10406         {
10407           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10408           assem_debug("jump_in: %x\n",start+i*4);
10409           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10410           int entry_point=do_dirty_stub(i);
10411           ll_add(jump_in+page,vaddr,(void *)entry_point);
10412           // If there was an existing entry in the hash table,
10413           // replace it with the new address.
10414           // Don't add new entries.  We'll insert the
10415           // ones that actually get used in check_addr().
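          // Each hash bin holds two (vaddr, entry point) pairs:
          // (ht_bin[0],ht_bin[1]) and (ht_bin[2],ht_bin[3]).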
10416           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10417           if(ht_bin[0]==vaddr) {
10418             ht_bin[1]=entry_point;
10419           }
10420           if(ht_bin[2]==vaddr) {
10421             ht_bin[3]=entry_point;
10422           }
10423         }
10424         else
10425         {
          u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
          assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
          assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
          //int entry_point=(int)out;
          ////assem_debug("entry_point: %x\n",entry_point);
          //load_regs_entry(i);
          //if(entry_point==(int)out)
          //  entry_point=instr_addr[i];
          //else
          //  emit_jmp(instr_addr[i]);
          //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
          ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
          int entry_point=do_dirty_stub(i);
          ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
        }
      }
    }
  }
  // Write out the literal pool if necessary
  literal_pool(0);
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Align code
  if(((u_int)out)&7) emit_addnop(13);
  #endif
  assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
  //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
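  // Keep a copy of the source MIPS code in the shadow buffer; the dirty
  // stubs emitted above compare against it so that modified code is
  // detected before this block is re-entered.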
  memcpy(copy,source,slen*4);
  copy+=slen*4;

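  // The instruction and data caches are not coherent on ARM, so flush
  // the range we just wrote before it can be executed.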
  #ifdef __arm__
  __clear_cache((void *)beginning,out);
  #endif

  // If we're within MAX_OUTPUT_BLOCK_SIZE (256K) of the end of the
  // translation cache, start over from the beginning.  The assert above
  // guarantees no block exceeds 256K, so the next block always fits.
  if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;

  // Trap writes to any of the pages we compiled
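  // Clearing invalid_code marks the page as containing compiled code;
  // setting bit 30 of memory_map appears to make stores to the page
  // take the slow path so the affected blocks can be invalidated
  // (handled by the invalidation code elsewhere in this file).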
  for(i=start>>12;i<=(start+slen*4)>>12;i++) {
    invalid_code[i]=0;
#ifndef DISABLE_TLB
    memory_map[i]|=0x40000000;
    if((signed int)start>=(signed int)0xC0000000) {
      assert(using_tlb);
      j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
      invalid_code[j]=0;
      memory_map[j]|=0x40000000;
      //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
    }
#endif
  }

  /* Pass 10 - Free memory by expiring oldest blocks */

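  // The translation cache is treated as a ring.  expirep is a 16-bit
  // cursor that sweeps a region ahead of the output pointer, discarding
  // old blocks so there is always room for new code; 'end' below maps
  // the current output position into cursor units and adds 16384, i.e.
  // a quarter of the cache ahead of 'out'.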
  int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
  while(expirep!=end)
  {
    int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
    int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
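    // expirep layout: bits 0-10 select one of 2048 buckets, bits 11-12
    // select which data structure is cleaned this step, and bits 13-15
    // select which eighth of the cache 'base' points into.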
    inv_debug("EXP: Phase %d\n",expirep);
    switch((expirep>>11)&3)
    {
      case 0:
        // Clear jump_in and jump_dirty
        ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
        break;
      case 1:
        // Clear pointers
        ll_kill_pointers(jump_out[expirep&2047],base,shift);
        ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
        break;
      case 2:
        // Clear hash table
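        // An entry is dropped when its code pointer falls inside the
        // region being expired, or within MAX_OUTPUT_BLOCK_SIZE past it
        // (presumably as a safety margin for blocks near the region
        // boundary).  When the newer pair (ht_bin[0]/[1]) is dropped,
        // the older pair is promoted into its slot.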
        for(i=0;i<32;i++) {
          int *ht_bin=hash_table[((expirep&2047)<<5)+i];
          if((ht_bin[3]>>shift)==(base>>shift) ||
             ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
            ht_bin[2]=ht_bin[3]=-1;
          }
          if((ht_bin[1]>>shift)==(base>>shift) ||
             ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
            inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
            ht_bin[0]=ht_bin[2];
            ht_bin[1]=ht_bin[3];
            ht_bin[2]=ht_bin[3]=-1;
          }
        }
        break;
      case 3:
        // Clear jump_out
        #ifdef __arm__
        if((expirep&2047)==0)
          __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
        #endif
        ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
        ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
        break;
    }
    expirep=(expirep+1)&65535;
  }
  return 0;
}