[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
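// Per-instruction register-allocation state (a summary inferred from how
// the fields are used below): regmap_entry/regmap map each host register
// to the MIPS register (or temporary, see the defines further down) it
// holds on entry to and after the instruction; was32/is32 are bitmasks of
// guest registers known to hold sign-extended 32-bit values; wasdirty/dirty
// mark host registers whose values still need to be written back; u/uu
// flag guest registers whose lower/upper halves are dead; wasconst/isconst
// and constmap track known-constant values for constant propagation.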
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
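  // Per-block static analysis state, filled in by new_recompile_block()
  // with one entry per decoded instruction (rough summary): start/source/
  // slen describe the guest block; rs1/rs2 and rt1/rt2 are MIPS source and
  // destination register numbers; imm and ba hold the immediate operand
  // and the branch target address; bt/is_ds/likely mark branch targets,
  // delay slots and branch-likely instructions.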
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
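  // Block lookup structures (roughly): jump_in[] lists compiled entry
  // points per 4K guest page, jump_out[] records where direct jumps into
  // a page were emitted so they can be unlinked on invalidation, and
  // jump_dirty[] keeps invalidated blocks that may be restored later if
  // the source turns out to be unmodified (see clean_blocks below).
  // hash_table[] caches two recent vaddr->code translations per bin as
  // {vaddr,addr} pairs in slots [0]/[1] and [2]/[3]; shadow[] appears to
  // provide backing storage for the copies used by the dirty checks.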
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178
179   /* stubs */
180 #define CC_STUB 1
181 #define FP_STUB 2
182 #define LOADB_STUB 3
183 #define LOADH_STUB 4
184 #define LOADW_STUB 5
185 #define LOADD_STUB 6
186 #define LOADBU_STUB 7
187 #define LOADHU_STUB 8
188 #define STOREB_STUB 9
189 #define STOREH_STUB 10
190 #define STOREW_STUB 11
191 #define STORED_STUB 12
192 #define STORELR_STUB 13
193 #define INVCODE_STUB 14
194
195   /* branch codes */
196 #define TAKEN 1
197 #define NOTTAKEN 2
198 #define NULLDS 3
199
200 // asm linkage
201 int new_recompile_block(int addr);
202 void *get_addr_ht(u_int vaddr);
203 void invalidate_block(u_int block);
204 void invalidate_addr(u_int addr);
205 void remove_hash(int vaddr);
206 void jump_vaddr();
207 void dyna_linker();
208 void dyna_linker_ds();
209 void verify_code();
210 void verify_code_vm();
211 void verify_code_ds();
212 void cc_interrupt();
213 void fp_exception();
214 void fp_exception_ds();
215 void jump_syscall();
216 void jump_eret();
217
218 // TLB
219 void TLBWI_new();
220 void TLBWR_new();
221 void read_nomem_new();
222 void read_nomemb_new();
223 void read_nomemh_new();
224 void read_nomemd_new();
225 void write_nomem_new();
226 void write_nomemb_new();
227 void write_nomemh_new();
228 void write_nomemd_new();
229 void write_rdram_new();
230 void write_rdramb_new();
231 void write_rdramh_new();
232 void write_rdramd_new();
233 extern u_int memory_map[1048576];
234
235 // Needed by assembler
236 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
237 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
238 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
239 void load_all_regs(signed char i_regmap[]);
240 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
241 void load_regs_entry(int t);
242 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
243
244 int tracedebug=0;
245
246 //#define DEBUG_CYCLE_COUNT 1
247
248 void nullf() {}
249 //#define assem_debug printf
250 //#define inv_debug printf
251 #define assem_debug nullf
252 #define inv_debug nullf
253
254 static void tlb_hacks()
255 {
256 #ifndef DISABLE_TLB
257   // Goldeneye hack
258   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
259   {
260     u_int addr;
261     int n;
262     switch (ROM_HEADER->Country_code&0xFF) 
263     {
264       case 0x45: // U
265         addr=0x34b30;
266         break;                   
267       case 0x4A: // J 
268         addr=0x34b70;    
269         break;    
270       case 0x50: // E 
271         addr=0x329f0;
272         break;                        
273       default: 
274         // Unknown country code
275         addr=0;
276         break;
277     }
278     u_int rom_addr=(u_int)rom;
279     #ifdef ROM_COPY
280     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
281     // in the lower 4G of memory to use this hack.  Copy it if necessary.
282     if((void *)rom>(void *)0xffffffff) {
283       munmap(ROM_COPY, 67108864);
284       if(mmap(ROM_COPY, 12582912,
285               PROT_READ | PROT_WRITE,
286               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
287               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
288       memcpy(ROM_COPY,rom,12582912);
289       rom_addr=(u_int)ROM_COPY;
290     }
291     #endif
292     if(addr) {
293       for(n=0x7F000;n<0x80000;n++) {
294         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
295       }
296     }
297   }
298 #endif
299 }
300
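// Page index used for the jump_in/jump_out/jump_dirty lists: XORing with
// 0x80000000 maps KSEG0 addresses onto low page numbers, TLB-mapped
// addresses are translated through tlb_LUT_r, and anything above page
// 2048 is folded into the 2048..4095 range so the tables stay small.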
301 static u_int get_page(u_int vaddr)
302 {
303   u_int page=(vaddr^0x80000000)>>12;
304 #ifndef DISABLE_TLB
305   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
306 #endif
307   if(page>2048) page=2048+(page&2047);
308   return page;
309 }
310
311 static u_int get_vpage(u_int vaddr)
312 {
313   u_int vpage=(vaddr^0x80000000)>>12;
314 #ifndef DISABLE_TLB
315   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
316 #endif
317   if(vpage>2048) vpage=2048+(vpage&2047);
318   return vpage;
319 }
320
321 // Get address from virtual address
322 // This is called from the recompiled JR/JALR instructions
323 void *get_addr(u_int vaddr)
324 {
325   u_int page=get_page(vaddr);
326   u_int vpage=get_vpage(vaddr);
327   struct ll_entry *head;
328   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
329   head=jump_in[page];
330   while(head!=NULL) {
331     if(head->vaddr==vaddr&&head->reg32==0) {
332   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
333       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
334       ht_bin[3]=ht_bin[1];
335       ht_bin[2]=ht_bin[0];
336       ht_bin[1]=(int)head->addr;
337       ht_bin[0]=vaddr;
338       return head->addr;
339     }
340     head=head->next;
341   }
342   head=jump_dirty[vpage];
343   while(head!=NULL) {
344     if(head->vaddr==vaddr&&head->reg32==0) {
345       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
346       // Don't restore blocks which are about to expire from the cache
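      // (Roughly: the shift by 32-TARGET_SIZE_2 reduces the distance from
      // the current output pointer modulo the translation cache size, so
      // blocks sitting just ahead of the output pointer - the ones the
      // expire logic will reclaim next - are skipped rather than restored.)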
347       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
348       if(verify_dirty(head->addr)) {
349         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
350         invalid_code[vaddr>>12]=0;
351         memory_map[vaddr>>12]|=0x40000000;
352         if(vpage<2048) {
353 #ifndef DISABLE_TLB
354           if(tlb_LUT_r[vaddr>>12]) {
355             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
356             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
357           }
358 #endif
359           restore_candidate[vpage>>3]|=1<<(vpage&7);
360         }
361         else restore_candidate[page>>3]|=1<<(page&7);
362         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
363         if(ht_bin[0]==vaddr) {
364           ht_bin[1]=(int)head->addr; // Replace existing entry
365         }
366         else
367         {
368           ht_bin[3]=ht_bin[1];
369           ht_bin[2]=ht_bin[0];
370           ht_bin[1]=(int)head->addr;
371           ht_bin[0]=vaddr;
372         }
373         return head->addr;
374       }
375     }
376     head=head->next;
377   }
378   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
379   int r=new_recompile_block(vaddr);
380   if(r==0) return get_addr(vaddr);
381   // Execute in unmapped page, generate page fault exception
382   Status|=2;
383   Cause=(vaddr<<31)|0x8;
384   EPC=(vaddr&1)?vaddr-5:vaddr;
385   BadVAddr=(vaddr&~1);
386   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
387   EntryHi=BadVAddr&0xFFFFE000;
388   return get_addr_ht(0x80000000);
389 }
390 // Look up address in hash table first
391 void *get_addr_ht(u_int vaddr)
392 {
393   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
394   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
395   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
396   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
397   return get_addr(vaddr);
398 }
399
400 void *get_addr_32(u_int vaddr,u_int flags)
401 {
402   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
403   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
404   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
405   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
406   u_int page=get_page(vaddr);
407   u_int vpage=get_vpage(vaddr);
408   struct ll_entry *head;
409   head=jump_in[page];
410   while(head!=NULL) {
411     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
412       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
413       if(head->reg32==0) {
414         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
415         if(ht_bin[0]==-1) {
416           ht_bin[1]=(int)head->addr;
417           ht_bin[0]=vaddr;
418         }else if(ht_bin[2]==-1) {
419           ht_bin[3]=(int)head->addr;
420           ht_bin[2]=vaddr;
421         }
422         //ht_bin[3]=ht_bin[1];
423         //ht_bin[2]=ht_bin[0];
424         //ht_bin[1]=(int)head->addr;
425         //ht_bin[0]=vaddr;
426       }
427       return head->addr;
428     }
429     head=head->next;
430   }
431   head=jump_dirty[vpage];
432   while(head!=NULL) {
433     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
434       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
435       // Don't restore blocks which are about to expire from the cache
436       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
437       if(verify_dirty(head->addr)) {
438         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
439         invalid_code[vaddr>>12]=0;
440         memory_map[vaddr>>12]|=0x40000000;
441         if(vpage<2048) {
442 #ifndef DISABLE_TLB
443           if(tlb_LUT_r[vaddr>>12]) {
444             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
445             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
446           }
447 #endif
448           restore_candidate[vpage>>3]|=1<<(vpage&7);
449         }
450         else restore_candidate[page>>3]|=1<<(page&7);
451         if(head->reg32==0) {
452           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
453           if(ht_bin[0]==-1) {
454             ht_bin[1]=(int)head->addr;
455             ht_bin[0]=vaddr;
456           }else if(ht_bin[2]==-1) {
457             ht_bin[3]=(int)head->addr;
458             ht_bin[2]=vaddr;
459           }
460           //ht_bin[3]=ht_bin[1];
461           //ht_bin[2]=ht_bin[0];
462           //ht_bin[1]=(int)head->addr;
463           //ht_bin[0]=vaddr;
464         }
465         return head->addr;
466       }
467     }
468     head=head->next;
469   }
470   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
471   int r=new_recompile_block(vaddr);
472   if(r==0) return get_addr(vaddr);
473   // Execute in unmapped page, generate page fault exception
474   Status|=2;
475   Cause=(vaddr<<31)|0x8;
476   EPC=(vaddr&1)?vaddr-5:vaddr;
477   BadVAddr=(vaddr&~1);
478   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
479   EntryHi=BadVAddr&0xFFFFE000;
480   return get_addr_ht(0x80000000);
481 }
482
483 void clear_all_regs(signed char regmap[])
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
487 }
488
489 signed char get_reg(signed char regmap[],int r)
490 {
491   int hr;
492   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
493   return -1;
494 }
495
496 // Find a register that is available for two consecutive cycles
497 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
498 {
499   int hr;
500   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
501   return -1;
502 }
503
504 int count_free_regs(signed char regmap[])
505 {
506   int count=0;
507   int hr;
508   for(hr=0;hr<HOST_REGS;hr++)
509   {
510     if(hr!=EXCLUDE_REG) {
511       if(regmap[hr]<0) count++;
512     }
513   }
514   return count;
515 }
516
517 void dirty_reg(struct regstat *cur,signed char reg)
518 {
519   int hr;
520   if(!reg) return;
521   for (hr=0;hr<HOST_REGS;hr++) {
522     if((cur->regmap[hr]&63)==reg) {
523       cur->dirty|=1<<hr;
524     }
525   }
526 }
527
528 // If we dirty the lower half of a 64-bit register which is now being
529 // sign-extended, we need to dump the upper half.
530 // Note: Do this only after completion of the instruction, because
531 // some instructions may need to read the full 64-bit value even if
532 // overwriting it (eg SLTI, DSRA32).
533 static void flush_dirty_uppers(struct regstat *cur)
534 {
535   int hr,reg;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if((cur->dirty>>hr)&1) {
538       reg=cur->regmap[hr];
539       if(reg>=64) 
540         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
541     }
542   }
543 }
544
545 void set_const(struct regstat *cur,signed char reg,uint64_t value)
546 {
547   int hr;
548   if(!reg) return;
549   for (hr=0;hr<HOST_REGS;hr++) {
550     if(cur->regmap[hr]==reg) {
551       cur->isconst|=1<<hr;
552       cur->constmap[hr]=value;
553     }
554     else if((cur->regmap[hr]^64)==reg) {
555       cur->isconst|=1<<hr;
556       cur->constmap[hr]=value>>32;
557     }
558   }
559 }
560
561 void clear_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(!reg) return;
565   for (hr=0;hr<HOST_REGS;hr++) {
566     if((cur->regmap[hr]&63)==reg) {
567       cur->isconst&=~(1<<hr);
568     }
569   }
570 }
571
572 int is_const(struct regstat *cur,signed char reg)
573 {
574   int hr;
575   if(!reg) return 1;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if((cur->regmap[hr]&63)==reg) {
578       return (cur->isconst>>hr)&1;
579     }
580   }
581   return 0;
582 }
583 uint64_t get_const(struct regstat *cur,signed char reg)
584 {
585   int hr;
586   if(!reg) return 0;
587   for (hr=0;hr<HOST_REGS;hr++) {
588     if(cur->regmap[hr]==reg) {
589       return cur->constmap[hr];
590     }
591   }
592   printf("Unknown constant in r%d\n",reg);
593   exit(1);
594 }
595
596 // Least soon needed registers
597 // Look at the next ten instructions and see which registers
598 // will be used.  Try not to reallocate these.
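// On return hsn[r] holds (roughly) the distance, in instructions, to the
// next use of register r; small values mean "needed soon", so the
// allocator avoids evicting those mappings.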
599 void lsn(u_char hsn[], int i, int *preferred_reg)
600 {
601   int j;
602   int b=-1;
603   for(j=0;j<9;j++)
604   {
605     if(i+j>=slen) {
606       j=slen-i-1;
607       break;
608     }
609     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
610     {
611       // Don't go past an unconditional jump
612       j++;
613       break;
614     }
615   }
616   for(;j>=0;j--)
617   {
618     if(rs1[i+j]) hsn[rs1[i+j]]=j;
619     if(rs2[i+j]) hsn[rs2[i+j]]=j;
620     if(rt1[i+j]) hsn[rt1[i+j]]=j;
621     if(rt2[i+j]) hsn[rt2[i+j]]=j;
622     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
623       // Stores can allocate zero
624       hsn[rs1[i+j]]=j;
625       hsn[rs2[i+j]]=j;
626     }
627     // On some architectures stores need invc_ptr
628     #if defined(HOST_IMM8)
629     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
630       hsn[INVCP]=j;
631     }
632     #endif
633     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
634     {
635       hsn[CCREG]=j;
636       b=j;
637     }
638   }
639   if(b>=0)
640   {
641     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
642     {
643       // Follow first branch
644       int t=(ba[i+b]-start)>>2;
645       j=7-b;if(t+j>=slen) j=slen-t-1;
646       for(;j>=0;j--)
647       {
648         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
649         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
650         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
651         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
652       }
653     }
654     // TODO: preferred register based on backward branch
655   }
656   // Delay slot should preferably not overwrite branch conditions or cycle count
657   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
658     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
659     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
660     hsn[CCREG]=1;
661     // ...or hash tables
662     hsn[RHASH]=1;
663     hsn[RHTBL]=1;
664   }
665   // Coprocessor load/store needs FTEMP, even if not declared
666   if(itype[i]==C1LS) {
667     hsn[FTEMP]=0;
668   }
669   // Load L/R also uses FTEMP as a temporary register
670   if(itype[i]==LOADLR) {
671     hsn[FTEMP]=0;
672   }
673   // Also 64-bit SDL/SDR
674   if(opcode[i]==0x2c||opcode[i]==0x2d) {
675     hsn[FTEMP]=0;
676   }
677   // Don't remove the TLB registers either
678   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
679     hsn[TLREG]=0;
680   }
681   // Don't remove the miniht registers
682   if(itype[i]==UJUMP||itype[i]==RJUMP)
683   {
684     hsn[RHASH]=0;
685     hsn[RHTBL]=0;
686   }
687 }
688
689 // We only want to allocate registers if we're going to use them again soon
690 int needed_again(int r, int i)
691 {
692   int j;
693   int b=-1;
694   int rn=10;
695   int hr;
696   u_char hsn[MAXREG+1];
697   int preferred_reg;
698   
699   memset(hsn,10,sizeof(hsn));
700   lsn(hsn,i,&preferred_reg);
701   
702   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
703   {
704     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
705       return 0; // Don't need any registers if exiting the block
706   }
707   for(j=0;j<9;j++)
708   {
709     if(i+j>=slen) {
710       j=slen-i-1;
711       break;
712     }
713     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
714     {
715       // Don't go past an unconditional jump
716       j++;
717       break;
718     }
719     if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d))
720     {
721       break;
722     }
723   }
724   for(;j>=1;j--)
725   {
726     if(rs1[i+j]==r) rn=j;
727     if(rs2[i+j]==r) rn=j;
728     if((unneeded_reg[i+j]>>r)&1) rn=10;
729     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
730     {
731       b=j;
732     }
733   }
734   /*
735   if(b>=0)
736   {
737     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
738     {
739       // Follow first branch
740       int o=rn;
741       int t=(ba[i+b]-start)>>2;
742       j=7-b;if(t+j>=slen) j=slen-t-1;
743       for(;j>=0;j--)
744       {
745         if(!((unneeded_reg[t+j]>>r)&1)) {
746           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
747           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
748         }
749         else rn=o;
750       }
751     }
752   }*/
753   for(hr=0;hr<HOST_REGS;hr++) {
754     if(hr!=EXCLUDE_REG) {
755       if(rn<hsn[hr]) return 1;
756     }
757   }
758   return 0;
759 }
760
761 // Try to match register allocations at the end of a loop with those
762 // at the beginning
763 int loop_reg(int i, int r, int hr)
764 {
765   int j,k;
766   for(j=0;j<9;j++)
767   {
768     if(i+j>=slen) {
769       j=slen-i-1;
770       break;
771     }
772     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
773     {
774       // Don't go past an unconditional jump
775       j++;
776       break;
777     }
778   }
779   k=0;
780   if(i>0){
781     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
782       k--;
783   }
784   for(;k<j;k++)
785   {
786     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
787     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
788     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
789     {
790       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
791       {
792         int t=(ba[i+k]-start)>>2;
793         int reg=get_reg(regs[t].regmap_entry,r);
794         if(reg>=0) return reg;
795         //reg=get_reg(regs[t+1].regmap_entry,r);
796         //if(reg>=0) return reg;
797       }
798     }
799   }
800   return hr;
801 }
802
803
804 // Make every host register available for this instruction, preserving only its source/target regs
805 void alloc_all(struct regstat *cur,int i)
806 {
807   int hr;
808   
809   for(hr=0;hr<HOST_REGS;hr++) {
810     if(hr!=EXCLUDE_REG) {
811       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
812          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
813       {
814         cur->regmap[hr]=-1;
815         cur->dirty&=~(1<<hr);
816       }
817       // Don't need zeros
818       if((cur->regmap[hr]&63)==0)
819       {
820         cur->regmap[hr]=-1;
821         cur->dirty&=~(1<<hr);
822       }
823     }
824   }
825 }
826
827
828 void div64(int64_t dividend,int64_t divisor)
829 {
830   lo=dividend/divisor;
831   hi=dividend%divisor;
832   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
833   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
834 }
835 void divu64(uint64_t dividend,uint64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842
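// DMULT/DMULTU helpers: compute the full 128-bit product of two 64-bit
// operands from four 32-bit partial products (schoolbook multiplication),
// leaving the low half in lo and the high half in hi.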
843 void mult64(int64_t m1,int64_t m2) // operands must be signed for the m1<0/m2<0 checks below
844 {
845    unsigned long long int op1, op2, op3, op4;
846    unsigned long long int result1, result2, result3, result4;
847    unsigned long long int temp1, temp2, temp3, temp4;
848    int sign = 0;
849    
850    if (m1 < 0)
851      {
852     op2 = -m1;
853     sign = 1 - sign;
854      }
855    else op2 = m1;
856    if (m2 < 0)
857      {
858     op4 = -m2;
859     sign = 1 - sign;
860      }
861    else op4 = m2;
862    
863    op1 = op2 & 0xFFFFFFFF;
864    op2 = (op2 >> 32) & 0xFFFFFFFF;
865    op3 = op4 & 0xFFFFFFFF;
866    op4 = (op4 >> 32) & 0xFFFFFFFF;
867    
868    temp1 = op1 * op3;
869    temp2 = (temp1 >> 32) + op1 * op4;
870    temp3 = op2 * op3;
871    temp4 = (temp3 >> 32) + op2 * op4;
872    
873    result1 = temp1 & 0xFFFFFFFF;
874    result2 = temp2 + (temp3 & 0xFFFFFFFF);
875    result3 = (result2 >> 32) + temp4;
876    result4 = (result3 >> 32);
877    
878    lo = result1 | (result2 << 32);
879    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
880    if (sign)
881      {
882     hi = ~hi;
883     if (!lo) hi++;
884     else lo = ~lo + 1;
885      }
886 }
887
888 void multu64(uint64_t m1,uint64_t m2)
889 {
890    unsigned long long int op1, op2, op3, op4;
891    unsigned long long int result1, result2, result3, result4;
892    unsigned long long int temp1, temp2, temp3, temp4;
893    
894    op1 = m1 & 0xFFFFFFFF;
895    op2 = (m1 >> 32) & 0xFFFFFFFF;
896    op3 = m2 & 0xFFFFFFFF;
897    op4 = (m2 >> 32) & 0xFFFFFFFF;
898    
899    temp1 = op1 * op3;
900    temp2 = (temp1 >> 32) + op1 * op4;
901    temp3 = op2 * op3;
902    temp4 = (temp3 >> 32) + op2 * op4;
903    
904    result1 = temp1 & 0xFFFFFFFF;
905    result2 = temp2 + (temp3 & 0xFFFFFFFF);
906    result3 = (result2 >> 32) + temp4;
907    result4 = (result3 >> 32);
908    
909    lo = result1 | (result2 << 32);
910    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
911    
912   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
913   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
914 }
915
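// LDL/LDR helpers: merge the bytes fetched by an unaligned 64-bit load
// with the bytes of the old register value that must be preserved; 'bits'
// encodes the alignment of the access as a multiple of 8.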
916 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
917 {
918   if(bits) {
919     original<<=64-bits;
920     original>>=64-bits;
921     loaded<<=bits;
922     original|=loaded;
923   }
924   else original=loaded;
925   return original;
926 }
927 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
928 {
929   if(bits^56) {
930     original>>=64-(bits^56);
931     original<<=64-(bits^56);
932     loaded>>=bits^56;
933     original|=loaded;
934   }
935   else original=loaded;
936   return original;
937 }
938
939 #ifdef __i386__
940 #include "assem_x86.c"
941 #endif
942 #ifdef __x86_64__
943 #include "assem_x64.c"
944 #endif
945 #ifdef __arm__
946 #include "assem_arm.c"
947 #endif
948
949 // Add virtual address mapping to linked list
950 void ll_add(struct ll_entry **head,int vaddr,void *addr)
951 {
952   struct ll_entry *new_entry;
953   new_entry=malloc(sizeof(struct ll_entry));
954   assert(new_entry!=NULL);
955   new_entry->vaddr=vaddr;
956   new_entry->reg32=0;
957   new_entry->addr=addr;
958   new_entry->next=*head;
959   *head=new_entry;
960 }
961
962 // Add virtual address mapping for 32-bit compiled block
963 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
964 {
965   struct ll_entry *new_entry;
966   new_entry=malloc(sizeof(struct ll_entry));
967   assert(new_entry!=NULL);
968   new_entry->vaddr=vaddr;
969   new_entry->reg32=reg32;
970   new_entry->addr=addr;
971   new_entry->next=*head;
972   *head=new_entry;
973 }
974
975 // Check if an address is already compiled
976 // but don't return addresses which are about to expire from the cache
977 void *check_addr(u_int vaddr)
978 {
979   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
980   if(ht_bin[0]==vaddr) {
981     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
982       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
983   }
984   if(ht_bin[2]==vaddr) {
985     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
986       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
987   }
988   u_int page=get_page(vaddr);
989   struct ll_entry *head;
990   head=jump_in[page];
991   while(head!=NULL) {
992     if(head->vaddr==vaddr&&head->reg32==0) {
993       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
994         // Update existing entry with current address
995         if(ht_bin[0]==vaddr) {
996           ht_bin[1]=(int)head->addr;
997           return head->addr;
998         }
999         if(ht_bin[2]==vaddr) {
1000           ht_bin[3]=(int)head->addr;
1001           return head->addr;
1002         }
1003         // Insert into hash table with low priority.
1004         // Don't evict existing entries, as they are probably
1005         // addresses that are being accessed frequently.
1006         if(ht_bin[0]==-1) {
1007           ht_bin[1]=(int)head->addr;
1008           ht_bin[0]=vaddr;
1009         }else if(ht_bin[2]==-1) {
1010           ht_bin[3]=(int)head->addr;
1011           ht_bin[2]=vaddr;
1012         }
1013         return head->addr;
1014       }
1015     }
1016     head=head->next;
1017   }
1018   return 0;
1019 }
1020
1021 void remove_hash(int vaddr)
1022 {
1023   //printf("remove hash: %x\n",vaddr);
1024   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1025   if(ht_bin[2]==vaddr) {
1026     ht_bin[2]=ht_bin[3]=-1;
1027   }
1028   if(ht_bin[0]==vaddr) {
1029     ht_bin[0]=ht_bin[2];
1030     ht_bin[1]=ht_bin[3];
1031     ht_bin[2]=ht_bin[3]=-1;
1032   }
1033 }
1034
1035 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1036 {
1037   struct ll_entry *next;
1038   while(*head) {
1039     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1040        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1041     {
1042       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1043       remove_hash((*head)->vaddr);
1044       next=(*head)->next;
1045       free(*head);
1046       *head=next;
1047     }
1048     else
1049     {
1050       head=&((*head)->next);
1051     }
1052   }
1053 }
1054
1055 // Remove all entries from linked list
1056 void ll_clear(struct ll_entry **head)
1057 {
1058   struct ll_entry *cur;
1059   struct ll_entry *next;
1060   if((cur=*head)) {
1061     *head=0;
1062     while(cur) {
1063       next=cur->next;
1064       free(cur);
1065       cur=next;
1066     }
1067   }
1068 }
1069
1070 // Dereference the pointers and remove them if they match
1071 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1072 {
1073   while(head) {
1074     int ptr=get_pointer(head->addr);
1075     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1076     if(((ptr>>shift)==(addr>>shift)) ||
1077        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1078     {
1079       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1080       kill_pointer(head->addr);
1081     }
1082     head=head->next;
1083   }
1084 }
1085
1086 // This is called when we write to a compiled block (see do_invstub)
1087 int invalidate_page(u_int page)
1088 {
1089   int modified=0;
1090   struct ll_entry *head;
1091   struct ll_entry *next;
1092   head=jump_in[page];
1093   jump_in[page]=0;
1094   while(head!=NULL) {
1095     inv_debug("INVALIDATE: %x\n",head->vaddr);
1096     remove_hash(head->vaddr);
1097     next=head->next;
1098     free(head);
1099     head=next;
1100   }
1101   head=jump_out[page];
1102   jump_out[page]=0;
1103   while(head!=NULL) {
1104     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1105     kill_pointer(head->addr);
1106     modified=1;
1107     next=head->next;
1108     free(head);
1109     head=next;
1110   }
1111   return modified;
1112 }
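// Invalidate every block overlapping the given 4K guest page: scan the
// dirty list to find how far compiled blocks extend past the page
// boundaries (get_bounds), then flush each page that is touched.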
1113 void invalidate_block(u_int block)
1114 {
1115   int modified;
1116   u_int page=get_page(block<<12);
1117   u_int vpage=get_vpage(block<<12);
1118   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1119   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1120   u_int first,last;
1121   first=last=page;
1122   struct ll_entry *head;
1123   head=jump_dirty[vpage];
1124   //printf("page=%d vpage=%d\n",page,vpage);
1125   while(head!=NULL) {
1126     u_int start,end;
1127     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1128       get_bounds((int)head->addr,&start,&end);
1129       //printf("start: %x end: %x\n",start,end);
1130       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1131         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1132           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1133           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1134         }
1135       }
1136       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1137         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1138           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1139           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1140         }
1141       }
1142     }
1143     head=head->next;
1144   }
1145   //printf("first=%d last=%d\n",first,last);
1146   modified=invalidate_page(page);
1147   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1148   assert(last<page+5);
1149   // Invalidate the adjacent pages if a block crosses a 4K boundary
1150   while(first<page) {
1151     invalidate_page(first);
1152     first++;
1153   }
1154   for(first=page+1;first<last;first++) {
1155     invalidate_page(first);
1156   }
1157   
1158   // Don't trap writes
1159   invalid_code[block]=1;
1160 #ifndef DISABLE_TLB
1161   // If there is a valid TLB entry for this page, remove write protect
1162   if(tlb_LUT_w[block]) {
1163     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1164     // CHECK: Is this right?
1165     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1166     u_int real_block=tlb_LUT_w[block]>>12;
1167     invalid_code[real_block]=1;
1168     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1169   }
1170   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1171 #endif
1172   #ifdef __arm__
1173   if(modified)
1174     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1175   #endif
1176   #ifdef USE_MINI_HT
1177   memset(mini_ht,-1,sizeof(mini_ht));
1178   #endif
1179 }
1180 void invalidate_addr(u_int addr)
1181 {
1182   invalidate_block(addr>>12);
1183 }
1184 void invalidate_all_pages()
1185 {
1186   u_int page,n;
1187   for(page=0;page<4096;page++)
1188     invalidate_page(page);
1189   for(page=0;page<1048576;page++)
1190     if(!invalid_code[page]) {
1191       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1192       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1193     }
1194   #ifdef __arm__
1195   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1196   #endif
1197   #ifdef USE_MINI_HT
1198   memset(mini_ht,-1,sizeof(mini_ht));
1199   #endif
1200   #ifndef DISABLE_TLB
1201   // TLB
1202   for(page=0;page<0x100000;page++) {
1203     if(tlb_LUT_r[page]) {
1204       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1205       if(!tlb_LUT_w[page]||!invalid_code[page])
1206         memory_map[page]|=0x40000000; // Write protect
1207     }
1208     else memory_map[page]=-1;
1209     if(page==0x80000) page=0xC0000;
1210   }
1211   tlb_hacks();
1212   #endif
1213 }
1214
1215 // Add an entry to jump_out after making a link
1216 void add_link(u_int vaddr,void *src)
1217 {
1218   u_int page=get_page(vaddr);
1219   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1220   ll_add(jump_out+page,vaddr,src);
1221   //int ptr=get_pointer(src);
1222   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1223 }
1224
1225 // If a code block was found to be unmodified (bit was set in
1226 // restore_candidate) and it remains unmodified (bit is clear
1227 // in invalid_code) then move the entries for that 4K page from
1228 // the dirty list to the clean list.
1229 void clean_blocks(u_int page)
1230 {
1231   struct ll_entry *head;
1232   inv_debug("INV: clean_blocks page=%d\n",page);
1233   head=jump_dirty[page];
1234   while(head!=NULL) {
1235     if(!invalid_code[head->vaddr>>12]) {
1236       // Don't restore blocks which are about to expire from the cache
1237       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1238         u_int start,end;
1239         if(verify_dirty((int)head->addr)) {
1240           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1241           u_int i;
1242           u_int inv=0;
1243           get_bounds((int)head->addr,&start,&end);
1244           if(start-(u_int)rdram<0x800000) {
1245             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1246               inv|=invalid_code[i];
1247             }
1248           }
1249           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1250             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1251             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1252             if(addr<start||addr>=end) inv=1;
1253           }
1254           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1255             inv=1;
1256           }
1257           if(!inv) {
1258             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1259             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1260               u_int ppage=page;
1261 #ifndef DISABLE_TLB
1262               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1263 #endif
1264               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1265               //printf("page=%x, addr=%x\n",page,head->vaddr);
1266               //assert(head->vaddr>>12==(page|0x80000));
1267               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1268               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1269               if(!head->reg32) {
1270                 if(ht_bin[0]==head->vaddr) {
1271                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1272                 }
1273                 if(ht_bin[2]==head->vaddr) {
1274                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1275                 }
1276               }
1277             }
1278           }
1279         }
1280       }
1281     }
1282     head=head->next;
1283   }
1284 }
1285
1286
1287 void mov_alloc(struct regstat *current,int i)
1288 {
1289   // Note: Don't need to actually alloc the source registers
1290   if((~current->is32>>rs1[i])&1) {
1291     //alloc_reg64(current,i,rs1[i]);
1292     alloc_reg64(current,i,rt1[i]);
1293     current->is32&=~(1LL<<rt1[i]);
1294   } else {
1295     //alloc_reg(current,i,rs1[i]);
1296     alloc_reg(current,i,rt1[i]);
1297     current->is32|=(1LL<<rt1[i]);
1298   }
1299   clear_const(current,rs1[i]);
1300   clear_const(current,rt1[i]);
1301   dirty_reg(current,rt1[i]);
1302 }
1303
1304 void shiftimm_alloc(struct regstat *current,int i)
1305 {
1306   clear_const(current,rs1[i]);
1307   clear_const(current,rt1[i]);
1308   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1309   {
1310     if(rt1[i]) {
1311       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1312       else lt1[i]=rs1[i];
1313       alloc_reg(current,i,rt1[i]);
1314       current->is32|=1LL<<rt1[i];
1315       dirty_reg(current,rt1[i]);
1316     }
1317   }
1318   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1319   {
1320     if(rt1[i]) {
1321       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1322       alloc_reg64(current,i,rt1[i]);
1323       current->is32&=~(1LL<<rt1[i]);
1324       dirty_reg(current,rt1[i]);
1325     }
1326   }
1327   if(opcode2[i]==0x3c) // DSLL32
1328   {
1329     if(rt1[i]) {
1330       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1331       alloc_reg64(current,i,rt1[i]);
1332       current->is32&=~(1LL<<rt1[i]);
1333       dirty_reg(current,rt1[i]);
1334     }
1335   }
1336   if(opcode2[i]==0x3e) // DSRL32
1337   {
1338     if(rt1[i]) {
1339       alloc_reg64(current,i,rs1[i]);
1340       if(imm[i]==32) {
1341         alloc_reg64(current,i,rt1[i]);
1342         current->is32&=~(1LL<<rt1[i]);
1343       } else {
1344         alloc_reg(current,i,rt1[i]);
1345         current->is32|=1LL<<rt1[i];
1346       }
1347       dirty_reg(current,rt1[i]);
1348     }
1349   }
1350   if(opcode2[i]==0x3f) // DSRA32
1351   {
1352     if(rt1[i]) {
1353       alloc_reg64(current,i,rs1[i]);
1354       alloc_reg(current,i,rt1[i]);
1355       current->is32|=1LL<<rt1[i];
1356       dirty_reg(current,rt1[i]);
1357     }
1358   }
1359 }
1360
1361 void shift_alloc(struct regstat *current,int i)
1362 {
1363   if(rt1[i]) {
1364     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1365     {
1366       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1367       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1368       alloc_reg(current,i,rt1[i]);
1369       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1370       current->is32|=1LL<<rt1[i];
1371     } else { // DSLLV/DSRLV/DSRAV
1372       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1373       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1374       alloc_reg64(current,i,rt1[i]);
1375       current->is32&=~(1LL<<rt1[i]);
1376       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1377         alloc_reg_temp(current,i,-1);
1378     }
1379     clear_const(current,rs1[i]);
1380     clear_const(current,rs2[i]);
1381     clear_const(current,rt1[i]);
1382     dirty_reg(current,rt1[i]);
1383   }
1384 }
1385
1386 void alu_alloc(struct regstat *current,int i)
1387 {
1388   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1389     if(rt1[i]) {
1390       if(rs1[i]&&rs2[i]) {
1391         alloc_reg(current,i,rs1[i]);
1392         alloc_reg(current,i,rs2[i]);
1393       }
1394       else {
1395         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1396         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1397       }
1398       alloc_reg(current,i,rt1[i]);
1399     }
1400     current->is32|=1LL<<rt1[i];
1401   }
1402   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1403     if(rt1[i]) {
1404       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1405       {
1406         alloc_reg64(current,i,rs1[i]);
1407         alloc_reg64(current,i,rs2[i]);
1408         alloc_reg(current,i,rt1[i]);
1409       } else {
1410         alloc_reg(current,i,rs1[i]);
1411         alloc_reg(current,i,rs2[i]);
1412         alloc_reg(current,i,rt1[i]);
1413       }
1414     }
1415     current->is32|=1LL<<rt1[i];
1416   }
1417   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1418     if(rt1[i]) {
1419       if(rs1[i]&&rs2[i]) {
1420         alloc_reg(current,i,rs1[i]);
1421         alloc_reg(current,i,rs2[i]);
1422       }
1423       else
1424       {
1425         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1426         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1427       }
1428       alloc_reg(current,i,rt1[i]);
1429       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1430       {
1431         if(!((current->uu>>rt1[i])&1)) {
1432           alloc_reg64(current,i,rt1[i]);
1433         }
1434         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1435           if(rs1[i]&&rs2[i]) {
1436             alloc_reg64(current,i,rs1[i]);
1437             alloc_reg64(current,i,rs2[i]);
1438           }
1439           else
1440           {
1441             // Is it really worth it to keep 64-bit values in registers?
1442             #ifdef NATIVE_64BIT
1443             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1444             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1445             #endif
1446           }
1447         }
1448         current->is32&=~(1LL<<rt1[i]);
1449       } else {
1450         current->is32|=1LL<<rt1[i];
1451       }
1452     }
1453   }
1454   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1455     if(rt1[i]) {
1456       if(rs1[i]&&rs2[i]) {
1457         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1458           alloc_reg64(current,i,rs1[i]);
1459           alloc_reg64(current,i,rs2[i]);
1460           alloc_reg64(current,i,rt1[i]);
1461         } else {
1462           alloc_reg(current,i,rs1[i]);
1463           alloc_reg(current,i,rs2[i]);
1464           alloc_reg(current,i,rt1[i]);
1465         }
1466       }
1467       else {
1468         alloc_reg(current,i,rt1[i]);
1469         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1470           // DADD used as move, or zeroing
1471           // If we have a 64-bit source, then make the target 64 bits too
1472           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1473             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1474             alloc_reg64(current,i,rt1[i]);
1475           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1476             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1477             alloc_reg64(current,i,rt1[i]);
1478           }
1479           if(opcode2[i]>=0x2e&&rs2[i]) {
1480             // DSUB used as negation - 64-bit result
1481             // If we have a 32-bit register, extend it to 64 bits
1482             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1483             alloc_reg64(current,i,rt1[i]);
1484           }
1485         }
1486       }
1487       if(rs1[i]&&rs2[i]) {
1488         current->is32&=~(1LL<<rt1[i]);
1489       } else if(rs1[i]) {
1490         current->is32&=~(1LL<<rt1[i]);
1491         if((current->is32>>rs1[i])&1)
1492           current->is32|=1LL<<rt1[i];
1493       } else if(rs2[i]) {
1494         current->is32&=~(1LL<<rt1[i]);
1495         if((current->is32>>rs2[i])&1)
1496           current->is32|=1LL<<rt1[i];
1497       } else {
1498         current->is32|=1LL<<rt1[i];
1499       }
1500     }
1501   }
1502   clear_const(current,rs1[i]);
1503   clear_const(current,rs2[i]);
1504   clear_const(current,rt1[i]);
1505   dirty_reg(current,rt1[i]);
1506 }
1507
1508 void imm16_alloc(struct regstat *current,int i)
1509 {
1510   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1511   else lt1[i]=rs1[i];
1512   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1513   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1514     current->is32&=~(1LL<<rt1[i]);
1515     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1516       // TODO: Could preserve the 32-bit flag if the immediate is zero
1517       alloc_reg64(current,i,rt1[i]);
1518       alloc_reg64(current,i,rs1[i]);
1519     }
1520     clear_const(current,rs1[i]);
1521     clear_const(current,rt1[i]);
1522   }
1523   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1524     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1525     current->is32|=1LL<<rt1[i];
1526     clear_const(current,rs1[i]);
1527     clear_const(current,rt1[i]);
1528   }
1529   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1530     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1531       if(rs1[i]!=rt1[i]) {
1532         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1533         alloc_reg64(current,i,rt1[i]);
1534         current->is32&=~(1LL<<rt1[i]);
1535       }
1536     }
1537     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1538     if(is_const(current,rs1[i])) {
1539       int v=get_const(current,rs1[i]);
1540       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1541       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1542       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1543     }
1544     else clear_const(current,rt1[i]);
1545   }
1546   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1547     if(is_const(current,rs1[i])) {
1548       int v=get_const(current,rs1[i]);
1549       set_const(current,rt1[i],v+imm[i]);
1550     }
1551     else clear_const(current,rt1[i]);
1552     current->is32|=1LL<<rt1[i];
1553   }
1554   else {
1555     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1556     current->is32|=1LL<<rt1[i];
1557   }
1558   dirty_reg(current,rt1[i]);
1559 }
1560
1561 void load_alloc(struct regstat *current,int i)
1562 {
1563   clear_const(current,rt1[i]);
1564   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1565   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1566   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1567   if(rt1[i]) {
1568     alloc_reg(current,i,rt1[i]);
1569     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1570     {
1571       current->is32&=~(1LL<<rt1[i]);
1572       alloc_reg64(current,i,rt1[i]);
1573     }
1574     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1575     {
1576       current->is32&=~(1LL<<rt1[i]);
1577       alloc_reg64(current,i,rt1[i]);
1578       alloc_all(current,i);
1579       alloc_reg64(current,i,FTEMP);
1580     }
1581     else current->is32|=1LL<<rt1[i];
1582     dirty_reg(current,rt1[i]);
1583     // If using TLB, need a register for pointer to the mapping table
1584     if(using_tlb) alloc_reg(current,i,TLREG);
1585     // LWL/LWR need a temporary register for the old value
1586     if(opcode[i]==0x22||opcode[i]==0x26)
1587     {
1588       alloc_reg(current,i,FTEMP);
1589       alloc_reg_temp(current,i,-1);
1590     }
1591   }
1592   else
1593   {
1594     // Load to r0 (dummy load)
1595     // but we still need a register to calculate the address
1596     alloc_reg_temp(current,i,-1);
1597   }
1598 }
1599
1600 void store_alloc(struct regstat *current,int i)
1601 {
1602   clear_const(current,rs2[i]);
1603   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1604   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1605   alloc_reg(current,i,rs2[i]);
1606   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1607     alloc_reg64(current,i,rs2[i]);
1608     if(rs2[i]) alloc_reg(current,i,FTEMP);
1609   }
1610   // If using TLB, need a register for pointer to the mapping table
1611   if(using_tlb) alloc_reg(current,i,TLREG);
1612   #if defined(HOST_IMM8)
1613   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1614   else alloc_reg(current,i,INVCP);
1615   #endif
1616   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1617     alloc_reg(current,i,FTEMP);
1618   }
1619   // We need a temporary register for address generation
1620   alloc_reg_temp(current,i,-1);
1621 }
1622
1623 void c1ls_alloc(struct regstat *current,int i)
1624 {
1625   //clear_const(current,rs1[i]); // FIXME
1626   clear_const(current,rt1[i]);
1627   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1628   alloc_reg(current,i,CSREG); // Status
1629   alloc_reg(current,i,FTEMP);
1630   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1631     alloc_reg64(current,i,FTEMP);
1632   }
1633   // If using TLB, need a register for pointer to the mapping table
1634   if(using_tlb) alloc_reg(current,i,TLREG);
1635   #if defined(HOST_IMM8)
1636   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1637   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1638     alloc_reg(current,i,INVCP);
1639   #endif
1640   // We need a temporary register for address generation
1641   alloc_reg_temp(current,i,-1);
1642 }
1643
1644 #ifndef multdiv_alloc
1645 void multdiv_alloc(struct regstat *current,int i)
1646 {
1647   //  case 0x18: MULT
1648   //  case 0x19: MULTU
1649   //  case 0x1A: DIV
1650   //  case 0x1B: DIVU
1651   //  case 0x1C: DMULT
1652   //  case 0x1D: DMULTU
1653   //  case 0x1E: DDIV
1654   //  case 0x1F: DDIVU
1655   clear_const(current,rs1[i]);
1656   clear_const(current,rs2[i]);
1657   if(rs1[i]&&rs2[i])
1658   {
1659     if((opcode2[i]&4)==0) // 32-bit
1660     {
1661       current->u&=~(1LL<<HIREG);
1662       current->u&=~(1LL<<LOREG);
1663       alloc_reg(current,i,HIREG);
1664       alloc_reg(current,i,LOREG);
1665       alloc_reg(current,i,rs1[i]);
1666       alloc_reg(current,i,rs2[i]);
1667       current->is32|=1LL<<HIREG;
1668       current->is32|=1LL<<LOREG;
1669       dirty_reg(current,HIREG);
1670       dirty_reg(current,LOREG);
1671     }
1672     else // 64-bit
1673     {
1674       current->u&=~(1LL<<HIREG);
1675       current->u&=~(1LL<<LOREG);
1676       current->uu&=~(1LL<<HIREG);
1677       current->uu&=~(1LL<<LOREG);
1678       alloc_reg64(current,i,HIREG);
1679       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1680       alloc_reg64(current,i,rs1[i]);
1681       alloc_reg64(current,i,rs2[i]);
1682       alloc_all(current,i);
1683       current->is32&=~(1LL<<HIREG);
1684       current->is32&=~(1LL<<LOREG);
1685       dirty_reg(current,HIREG);
1686       dirty_reg(current,LOREG);
1687     }
1688   }
1689   else
1690   {
1691     // Multiply by zero is zero.
1692     // MIPS does not have a divide by zero exception.
1693     // The result is undefined, so we return zero.
1694     alloc_reg(current,i,HIREG);
1695     alloc_reg(current,i,LOREG);
1696     current->is32|=1LL<<HIREG;
1697     current->is32|=1LL<<LOREG;
1698     dirty_reg(current,HIREG);
1699     dirty_reg(current,LOREG);
1700   }
1701 }
1702 #endif
1703
1704 void cop0_alloc(struct regstat *current,int i)
1705 {
1706   if(opcode2[i]==0) // MFC0
1707   {
1708     if(rt1[i]) {
1709       clear_const(current,rt1[i]);
1710       alloc_all(current,i);
1711       alloc_reg(current,i,rt1[i]);
1712       current->is32|=1LL<<rt1[i];
1713       dirty_reg(current,rt1[i]);
1714     }
1715   }
1716   else if(opcode2[i]==4) // MTC0
1717   {
1718     if(rs1[i]){
1719       clear_const(current,rs1[i]);
1720       alloc_reg(current,i,rs1[i]);
1721       alloc_all(current,i);
1722     }
1723     else {
1724       alloc_all(current,i); // FIXME: Keep r0
1725       current->u&=~1LL;
1726       alloc_reg(current,i,0);
1727     }
1728   }
1729   else
1730   {
1731     // TLBR/TLBWI/TLBWR/TLBP/ERET
1732     assert(opcode2[i]==0x10);
1733     alloc_all(current,i);
1734   }
1735 }
1736
1737 void cop1_alloc(struct regstat *current,int i)
1738 {
1739   alloc_reg(current,i,CSREG); // Load status
1740   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1741   {
1742     assert(rt1[i]);
1743     clear_const(current,rt1[i]);
1744     if(opcode2[i]==1) {
1745       alloc_reg64(current,i,rt1[i]); // DMFC1
1746       current->is32&=~(1LL<<rt1[i]);
1747     }else{
1748       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1749       current->is32|=1LL<<rt1[i];
1750     }
1751     dirty_reg(current,rt1[i]);
1752     alloc_reg_temp(current,i,-1);
1753   }
1754   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1755   {
1756     if(rs1[i]){
1757       clear_const(current,rs1[i]);
1758       if(opcode2[i]==5)
1759         alloc_reg64(current,i,rs1[i]); // DMTC1
1760       else
1761         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1762       alloc_reg_temp(current,i,-1);
1763     }
1764     else {
1765       current->u&=~1LL;
1766       alloc_reg(current,i,0);
1767       alloc_reg_temp(current,i,-1);
1768     }
1769   }
1770 }
1771 void fconv_alloc(struct regstat *current,int i)
1772 {
1773   alloc_reg(current,i,CSREG); // Load status
1774   alloc_reg_temp(current,i,-1);
1775 }
1776 void float_alloc(struct regstat *current,int i)
1777 {
1778   alloc_reg(current,i,CSREG); // Load status
1779   alloc_reg_temp(current,i,-1);
1780 }
1781 void fcomp_alloc(struct regstat *current,int i)
1782 {
1783   alloc_reg(current,i,CSREG); // Load status
1784   alloc_reg(current,i,FSREG); // Load flags
1785   dirty_reg(current,FSREG); // Flag will be modified
1786   alloc_reg_temp(current,i,-1);
1787 }
1788
1789 void syscall_alloc(struct regstat *current,int i)
1790 {
1791   alloc_cc(current,i);
1792   dirty_reg(current,CCREG);
1793   alloc_all(current,i);
1794   current->isconst=0;
1795 }
1796
1797 void delayslot_alloc(struct regstat *current,int i)
1798 {
1799   switch(itype[i]) {
1800     case UJUMP:
1801     case CJUMP:
1802     case SJUMP:
1803     case RJUMP:
1804     case FJUMP:
1805     case SYSCALL:
1806     case SPAN:
1807       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1808       printf("Disabled speculative precompilation\n");
1809       stop_after_jal=1;
1810       break;
1811     case IMM16:
1812       imm16_alloc(current,i);
1813       break;
1814     case LOAD:
1815     case LOADLR:
1816       load_alloc(current,i);
1817       break;
1818     case STORE:
1819     case STORELR:
1820       store_alloc(current,i);
1821       break;
1822     case ALU:
1823       alu_alloc(current,i);
1824       break;
1825     case SHIFT:
1826       shift_alloc(current,i);
1827       break;
1828     case MULTDIV:
1829       multdiv_alloc(current,i);
1830       break;
1831     case SHIFTIMM:
1832       shiftimm_alloc(current,i);
1833       break;
1834     case MOV:
1835       mov_alloc(current,i);
1836       break;
1837     case COP0:
1838       cop0_alloc(current,i);
1839       break;
1840     case COP1:
1841       cop1_alloc(current,i);
1842       break;
1843     case C1LS:
1844       c1ls_alloc(current,i);
1845       break;
1846     case FCONV:
1847       fconv_alloc(current,i);
1848       break;
1849     case FLOAT:
1850       float_alloc(current,i);
1851       break;
1852     case FCOMP:
1853       fcomp_alloc(current,i);
1854       break;
1855   }
1856 }
1857
1858 // Special case where a branch and delay slot span two pages in virtual memory
1859 static void pagespan_alloc(struct regstat *current,int i)
1860 {
1861   current->isconst=0;
1862   current->wasconst=0;
1863   regs[i].wasconst=0;
1864   alloc_all(current,i);
1865   alloc_cc(current,i);
1866   dirty_reg(current,CCREG);
1867   if(opcode[i]==3) // JAL
1868   {
1869     alloc_reg(current,i,31);
1870     dirty_reg(current,31);
1871   }
1872   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1873   {
1874     alloc_reg(current,i,rs1[i]);
1875     if (rt1[i]==31) {
1876       alloc_reg(current,i,31);
1877       dirty_reg(current,31);
1878     }
1879   }
1880   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1881   {
1882     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1883     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1884     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1885     {
1886       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1887       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1888     }
1889   }
1890   else
1891   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1892   {
1893     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1894     if(!((current->is32>>rs1[i])&1))
1895     {
1896       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1897     }
1898   }
1899   else
1900   if(opcode[i]==0x11) // BC1
1901   {
1902     alloc_reg(current,i,FSREG);
1903     alloc_reg(current,i,CSREG);
1904   }
1905   //else ...
1906 }
1907
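// Queue an out-of-line stub to be generated later.  Each stubs[] entry holds
// the stub type, the location of the branch to patch (jaddr at the call
// sites below), the return address in the output stream, and up to five
// type-specific arguments (for the load/store stubs below these are the
// instruction index, address register, regstat pointer, cycle count and
// register list).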
1908 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1909 {
1910   stubs[stubcount][0]=type;
1911   stubs[stubcount][1]=addr;
1912   stubs[stubcount][2]=retaddr;
1913   stubs[stubcount][3]=a;
1914   stubs[stubcount][4]=b;
1915   stubs[stubcount][5]=c;
1916   stubs[stubcount][6]=d;
1917   stubs[stubcount][7]=e;
1918   stubcount++;
1919 }
1920
1921 // Write out a single register
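// (scan the host registers for one mapped to guest register r and, if dirty,
//  store it back; regmap entries with bit 6 set hold the upper 32 bits of a
//  64-bit guest register, and registers known to be 32-bit have their upper
//  half materialized by sign-extension instead of a real upper-half store)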
1922 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1923 {
1924   int hr;
1925   for(hr=0;hr<HOST_REGS;hr++) {
1926     if(hr!=EXCLUDE_REG) {
1927       if((regmap[hr]&63)==r) {
1928         if((dirty>>hr)&1) {
1929           if(regmap[hr]<64) {
1930             emit_storereg(r,hr);
1931 #ifndef FORCE32
1932             if((is32>>regmap[hr])&1) {
1933               emit_sarimm(hr,31,hr);
1934               emit_storereg(r|64,hr);
1935             }
1936 #endif
1937           }else{
1938             emit_storereg(r|64,hr);
1939           }
1940         }
1941       }
1942     }
1943   }
1944 }
1945
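// Debug/trace helpers: mchecksum() computes a rotate-and-xor checksum over
// the first 8MB of RDRAM, rchecksum() xors the 64 words of the register
// file, and rlist() dumps all registers; memdebug() ties them together and
// is the hook invoked by the (commented-out) trace code in the assemble
// routines further down.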
1946 int mchecksum()
1947 {
1948   //if(!tracedebug) return 0;
1949   int i;
1950   int sum=0;
1951   for(i=0;i<2097152;i++) {
1952     unsigned int temp=sum;
1953     sum<<=1;
1954     sum|=(~temp)>>31;
1955     sum^=((u_int *)rdram)[i];
1956   }
1957   return sum;
1958 }
1959 int rchecksum()
1960 {
1961   int i;
1962   int sum=0;
1963   for(i=0;i<64;i++)
1964     sum^=((u_int *)reg)[i];
1965   return sum;
1966 }
1967 void rlist()
1968 {
1969   int i;
1970   printf("TRACE: ");
1971   for(i=0;i<32;i++)
1972     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1973   printf("\n");
1974 #ifndef DISABLE_COP1
1975   printf("TRACE: ");
1976   for(i=0;i<32;i++)
1977     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1978   printf("\n");
1979 #endif
1980 }
1981
1982 void enabletrace()
1983 {
1984   tracedebug=1;
1985 }
1986
1987 void memdebug(int i)
1988 {
1989   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1990   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1991   //rlist();
1992   //if(tracedebug) {
1993   //if(Count>=-2084597794) {
1994   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1995   //if(0) {
1996     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1997     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1998     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1999     rlist();
2000     #ifdef __i386__
2001     printf("TRACE: %x\n",(&i)[-1]);
2002     #endif
2003     #ifdef __arm__
2004     int j;
2005     printf("TRACE: %x \n",(&j)[10]);
2006     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2007     #endif
2008     //fflush(stdout);
2009   }
2010   //printf("TRACE: %x\n",(&i)[-1]);
2011 }
2012
2013 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2014 {
2015   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2016 }
2017
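// Register-register ALU operations: ADD/ADDU/SUB/SUBU, their 64-bit
// DADD/DADDU/DSUB/DSUBU counterparts, SLT/SLTU, and AND/OR/XOR/NOR.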
2018 void alu_assemble(int i,struct regstat *i_regs)
2019 {
2020   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2021     if(rt1[i]) {
2022       signed char s1,s2,t;
2023       t=get_reg(i_regs->regmap,rt1[i]);
2024       if(t>=0) {
2025         s1=get_reg(i_regs->regmap,rs1[i]);
2026         s2=get_reg(i_regs->regmap,rs2[i]);
2027         if(rs1[i]&&rs2[i]) {
2028           assert(s1>=0);
2029           assert(s2>=0);
2030           if(opcode2[i]&2) emit_sub(s1,s2,t);
2031           else emit_add(s1,s2,t);
2032         }
2033         else if(rs1[i]) {
2034           if(s1>=0) emit_mov(s1,t);
2035           else emit_loadreg(rs1[i],t);
2036         }
2037         else if(rs2[i]) {
2038           if(s2>=0) {
2039             if(opcode2[i]&2) emit_neg(s2,t);
2040             else emit_mov(s2,t);
2041           }
2042           else {
2043             emit_loadreg(rs2[i],t);
2044             if(opcode2[i]&2) emit_neg(t,t);
2045           }
2046         }
2047         else emit_zeroreg(t);
2048       }
2049     }
2050   }
2051   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2052     if(rt1[i]) {
2053       signed char s1l,s2l,s1h,s2h,tl,th;
2054       tl=get_reg(i_regs->regmap,rt1[i]);
2055       th=get_reg(i_regs->regmap,rt1[i]|64);
2056       if(tl>=0) {
2057         s1l=get_reg(i_regs->regmap,rs1[i]);
2058         s2l=get_reg(i_regs->regmap,rs2[i]);
2059         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2060         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2061         if(rs1[i]&&rs2[i]) {
2062           assert(s1l>=0);
2063           assert(s2l>=0);
2064           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2065           else emit_adds(s1l,s2l,tl);
2066           if(th>=0) {
2067             #ifdef INVERTED_CARRY
2068             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2069             #else
2070             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2071             #endif
2072             else emit_add(s1h,s2h,th);
2073           }
2074         }
2075         else if(rs1[i]) {
2076           if(s1l>=0) emit_mov(s1l,tl);
2077           else emit_loadreg(rs1[i],tl);
2078           if(th>=0) {
2079             if(s1h>=0) emit_mov(s1h,th);
2080             else emit_loadreg(rs1[i]|64,th);
2081           }
2082         }
2083         else if(rs2[i]) {
2084           if(s2l>=0) {
2085             if(opcode2[i]&2) emit_negs(s2l,tl);
2086             else emit_mov(s2l,tl);
2087           }
2088           else {
2089             emit_loadreg(rs2[i],tl);
2090             if(opcode2[i]&2) emit_negs(tl,tl);
2091           }
2092           if(th>=0) {
2093             #ifdef INVERTED_CARRY
2094             if(s2h>=0) emit_mov(s2h,th);
2095             else emit_loadreg(rs2[i]|64,th);
2096             if(opcode2[i]&2) {
2097               emit_adcimm(-1,th); // x86 has inverted carry flag
2098               emit_not(th,th);
2099             }
2100             #else
2101             if(opcode2[i]&2) {
2102               if(s2h>=0) emit_rscimm(s2h,0,th);
2103               else {
2104                 emit_loadreg(rs2[i]|64,th);
2105                 emit_rscimm(th,0,th);
2106               }
2107             }else{
2108               if(s2h>=0) emit_mov(s2h,th);
2109               else emit_loadreg(rs2[i]|64,th);
2110             }
2111             #endif
2112           }
2113         }
2114         else {
2115           emit_zeroreg(tl);
2116           if(th>=0) emit_zeroreg(th);
2117         }
2118       }
2119     }
2120   }
2121   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2122     if(rt1[i]) {
2123       signed char s1l,s1h,s2l,s2h,t;
2124       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2125       {
2126         t=get_reg(i_regs->regmap,rt1[i]);
2127         //assert(t>=0);
2128         if(t>=0) {
2129           s1l=get_reg(i_regs->regmap,rs1[i]);
2130           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2131           s2l=get_reg(i_regs->regmap,rs2[i]);
2132           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2133           if(rs2[i]==0) // rx<r0
2134           {
2135             assert(s1h>=0);
2136             if(opcode2[i]==0x2a) // SLT
2137               emit_shrimm(s1h,31,t);
2138             else // SLTU (unsigned cannot be less than zero)
2139               emit_zeroreg(t);
2140           }
2141           else if(rs1[i]==0) // r0<rx
2142           {
2143             assert(s2h>=0);
2144             if(opcode2[i]==0x2a) // SLT
2145               emit_set_gz64_32(s2h,s2l,t);
2146             else // SLTU (set if not zero)
2147               emit_set_nz64_32(s2h,s2l,t);
2148           }
2149           else {
2150             assert(s1l>=0);assert(s1h>=0);
2151             assert(s2l>=0);assert(s2h>=0);
2152             if(opcode2[i]==0x2a) // SLT
2153               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2154             else // SLTU
2155               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2156           }
2157         }
2158       } else {
2159         t=get_reg(i_regs->regmap,rt1[i]);
2160         //assert(t>=0);
2161         if(t>=0) {
2162           s1l=get_reg(i_regs->regmap,rs1[i]);
2163           s2l=get_reg(i_regs->regmap,rs2[i]);
2164           if(rs2[i]==0) // rx<r0
2165           {
2166             assert(s1l>=0);
2167             if(opcode2[i]==0x2a) // SLT
2168               emit_shrimm(s1l,31,t);
2169             else // SLTU (unsigned cannot be less than zero)
2170               emit_zeroreg(t);
2171           }
2172           else if(rs1[i]==0) // r0<rx
2173           {
2174             assert(s2l>=0);
2175             if(opcode2[i]==0x2a) // SLT
2176               emit_set_gz32(s2l,t);
2177             else // SLTU (set if not zero)
2178               emit_set_nz32(s2l,t);
2179           }
2180           else{
2181             assert(s1l>=0);assert(s2l>=0);
2182             if(opcode2[i]==0x2a) // SLT
2183               emit_set_if_less32(s1l,s2l,t);
2184             else // SLTU
2185               emit_set_if_carry32(s1l,s2l,t);
2186           }
2187         }
2188       }
2189     }
2190   }
2191   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2192     if(rt1[i]) {
2193       signed char s1l,s1h,s2l,s2h,th,tl;
2194       tl=get_reg(i_regs->regmap,rt1[i]);
2195       th=get_reg(i_regs->regmap,rt1[i]|64);
2196       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2197       {
2198         assert(tl>=0);
2199         if(tl>=0) {
2200           s1l=get_reg(i_regs->regmap,rs1[i]);
2201           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2202           s2l=get_reg(i_regs->regmap,rs2[i]);
2203           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2204           if(rs1[i]&&rs2[i]) {
2205             assert(s1l>=0);assert(s1h>=0);
2206             assert(s2l>=0);assert(s2h>=0);
2207             if(opcode2[i]==0x24) { // AND
2208               emit_and(s1l,s2l,tl);
2209               emit_and(s1h,s2h,th);
2210             } else
2211             if(opcode2[i]==0x25) { // OR
2212               emit_or(s1l,s2l,tl);
2213               emit_or(s1h,s2h,th);
2214             } else
2215             if(opcode2[i]==0x26) { // XOR
2216               emit_xor(s1l,s2l,tl);
2217               emit_xor(s1h,s2h,th);
2218             } else
2219             if(opcode2[i]==0x27) { // NOR
2220               emit_or(s1l,s2l,tl);
2221               emit_or(s1h,s2h,th);
2222               emit_not(tl,tl);
2223               emit_not(th,th);
2224             }
2225           }
2226           else
2227           {
2228             if(opcode2[i]==0x24) { // AND
2229               emit_zeroreg(tl);
2230               emit_zeroreg(th);
2231             } else
2232             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2233               if(rs1[i]){
2234                 if(s1l>=0) emit_mov(s1l,tl);
2235                 else emit_loadreg(rs1[i],tl);
2236                 if(s1h>=0) emit_mov(s1h,th);
2237                 else emit_loadreg(rs1[i]|64,th);
2238               }
2239               else
2240               if(rs2[i]){
2241                 if(s2l>=0) emit_mov(s2l,tl);
2242                 else emit_loadreg(rs2[i],tl);
2243                 if(s2h>=0) emit_mov(s2h,th);
2244                 else emit_loadreg(rs2[i]|64,th);
2245               }
2246               else{
2247                 emit_zeroreg(tl);
2248                 emit_zeroreg(th);
2249               }
2250             } else
2251             if(opcode2[i]==0x27) { // NOR
2252               if(rs1[i]){
2253                 if(s1l>=0) emit_not(s1l,tl);
2254                 else{
2255                   emit_loadreg(rs1[i],tl);
2256                   emit_not(tl,tl);
2257                 }
2258                 if(s1h>=0) emit_not(s1h,th);
2259                 else{
2260                   emit_loadreg(rs1[i]|64,th);
2261                   emit_not(th,th);
2262                 }
2263               }
2264               else
2265               if(rs2[i]){
2266                 if(s2l>=0) emit_not(s2l,tl);
2267                 else{
2268                   emit_loadreg(rs2[i],tl);
2269                   emit_not(tl,tl);
2270                 }
2271                 if(s2h>=0) emit_not(s2h,th);
2272                 else{
2273                   emit_loadreg(rs2[i]|64,th);
2274                   emit_not(th,th);
2275                 }
2276               }
2277               else {
2278                 emit_movimm(-1,tl);
2279                 emit_movimm(-1,th);
2280               }
2281             }
2282           }
2283         }
2284       }
2285       else
2286       {
2287         // 32 bit
2288         if(tl>=0) {
2289           s1l=get_reg(i_regs->regmap,rs1[i]);
2290           s2l=get_reg(i_regs->regmap,rs2[i]);
2291           if(rs1[i]&&rs2[i]) {
2292             assert(s1l>=0);
2293             assert(s2l>=0);
2294             if(opcode2[i]==0x24) { // AND
2295               emit_and(s1l,s2l,tl);
2296             } else
2297             if(opcode2[i]==0x25) { // OR
2298               emit_or(s1l,s2l,tl);
2299             } else
2300             if(opcode2[i]==0x26) { // XOR
2301               emit_xor(s1l,s2l,tl);
2302             } else
2303             if(opcode2[i]==0x27) { // NOR
2304               emit_or(s1l,s2l,tl);
2305               emit_not(tl,tl);
2306             }
2307           }
2308           else
2309           {
2310             if(opcode2[i]==0x24) { // AND
2311               emit_zeroreg(tl);
2312             } else
2313             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2314               if(rs1[i]){
2315                 if(s1l>=0) emit_mov(s1l,tl);
2316                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2317               }
2318               else
2319               if(rs2[i]){
2320                 if(s2l>=0) emit_mov(s2l,tl);
2321                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2322               }
2323               else emit_zeroreg(tl);
2324             } else
2325             if(opcode2[i]==0x27) { // NOR
2326               if(rs1[i]){
2327                 if(s1l>=0) emit_not(s1l,tl);
2328                 else {
2329                   emit_loadreg(rs1[i],tl);
2330                   emit_not(tl,tl);
2331                 }
2332               }
2333               else
2334               if(rs2[i]){
2335                 if(s2l>=0) emit_not(s2l,tl);
2336                 else {
2337                   emit_loadreg(rs2[i],tl);
2338                   emit_not(tl,tl);
2339                 }
2340               }
2341               else emit_movimm(-1,tl);
2342             }
2343           }
2344         }
2345       }
2346     }
2347   }
2348 }
2349
2350 void imm16_assemble(int i,struct regstat *i_regs)
2351 {
2352   if (opcode[i]==0x0f) { // LUI
2353     if(rt1[i]) {
2354       signed char t;
2355       t=get_reg(i_regs->regmap,rt1[i]);
2356       //assert(t>=0);
2357       if(t>=0) {
2358         if(!((i_regs->isconst>>t)&1))
2359           emit_movimm(imm[i]<<16,t);
2360       }
2361     }
2362   }
2363   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2364     if(rt1[i]) {
2365       signed char s,t;
2366       t=get_reg(i_regs->regmap,rt1[i]);
2367       s=get_reg(i_regs->regmap,rs1[i]);
2368       if(rs1[i]) {
2369         //assert(t>=0);
2370         //assert(s>=0);
2371         if(t>=0) {
2372           if(!((i_regs->isconst>>t)&1)) {
2373             if(s<0) {
2374               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2375               emit_addimm(t,imm[i],t);
2376             }else{
2377               if(!((i_regs->wasconst>>s)&1))
2378                 emit_addimm(s,imm[i],t);
2379               else
2380                 emit_movimm(constmap[i][s]+imm[i],t);
2381             }
2382           }
2383         }
2384       } else {
2385         if(t>=0) {
2386           if(!((i_regs->isconst>>t)&1))
2387             emit_movimm(imm[i],t);
2388         }
2389       }
2390     }
2391   }
2392   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2393     if(rt1[i]) {
2394       signed char sh,sl,th,tl;
2395       th=get_reg(i_regs->regmap,rt1[i]|64);
2396       tl=get_reg(i_regs->regmap,rt1[i]);
2397       sh=get_reg(i_regs->regmap,rs1[i]|64);
2398       sl=get_reg(i_regs->regmap,rs1[i]);
2399       if(tl>=0) {
2400         if(rs1[i]) {
2401           assert(sh>=0);
2402           assert(sl>=0);
2403           if(th>=0) {
2404             emit_addimm64_32(sh,sl,imm[i],th,tl);
2405           }
2406           else {
2407             emit_addimm(sl,imm[i],tl);
2408           }
2409         } else {
2410           emit_movimm(imm[i],tl);
2411           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2412         }
2413       }
2414     }
2415   }
2416   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2417     if(rt1[i]) {
2418       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2419       signed char sh,sl,t;
2420       t=get_reg(i_regs->regmap,rt1[i]);
2421       sh=get_reg(i_regs->regmap,rs1[i]|64);
2422       sl=get_reg(i_regs->regmap,rs1[i]);
2423       //assert(t>=0);
2424       if(t>=0) {
2425         if(rs1[i]>0) {
2426           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2427           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2428             if(opcode[i]==0x0a) { // SLTI
2429               if(sl<0) {
2430                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2431                 emit_slti32(t,imm[i],t);
2432               }else{
2433                 emit_slti32(sl,imm[i],t);
2434               }
2435             }
2436             else { // SLTIU
2437               if(sl<0) {
2438                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2439                 emit_sltiu32(t,imm[i],t);
2440               }else{
2441                 emit_sltiu32(sl,imm[i],t);
2442               }
2443             }
2444           }else{ // 64-bit
2445             assert(sl>=0);
2446             if(opcode[i]==0x0a) // SLTI
2447               emit_slti64_32(sh,sl,imm[i],t);
2448             else // SLTIU
2449               emit_sltiu64_32(sh,sl,imm[i],t);
2450           }
2451         }else{
2452           // SLTI(U) with r0 is just stupid,
2453           // nonetheless examples can be found
2454           if(opcode[i]==0x0a) // SLTI
2455             if(0<imm[i]) emit_movimm(1,t);
2456             else emit_zeroreg(t);
2457           else // SLTIU
2458           {
2459             if(imm[i]) emit_movimm(1,t);
2460             else emit_zeroreg(t);
2461           }
2462         }
2463       }
2464     }
2465   }
2466   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2467     if(rt1[i]) {
2468       signed char sh,sl,th,tl;
2469       th=get_reg(i_regs->regmap,rt1[i]|64);
2470       tl=get_reg(i_regs->regmap,rt1[i]);
2471       sh=get_reg(i_regs->regmap,rs1[i]|64);
2472       sl=get_reg(i_regs->regmap,rs1[i]);
2473       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2474         if(opcode[i]==0x0c) //ANDI
2475         {
2476           if(rs1[i]) {
2477             if(sl<0) {
2478               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2479               emit_andimm(tl,imm[i],tl);
2480             }else{
2481               if(!((i_regs->wasconst>>sl)&1))
2482                 emit_andimm(sl,imm[i],tl);
2483               else
2484                 emit_movimm(constmap[i][sl]&imm[i],tl);
2485             }
2486           }
2487           else
2488             emit_zeroreg(tl);
2489           if(th>=0) emit_zeroreg(th);
2490         }
2491         else
2492         {
2493           if(rs1[i]) {
2494             if(sl<0) {
2495               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2496             }
2497             if(th>=0) {
2498               if(sh<0) {
2499                 emit_loadreg(rs1[i]|64,th);
2500               }else{
2501                 emit_mov(sh,th);
2502               }
2503             }
2504             if(opcode[i]==0x0d) //ORI
2505             if(sl<0) {
2506               emit_orimm(tl,imm[i],tl);
2507             }else{
2508               if(!((i_regs->wasconst>>sl)&1))
2509                 emit_orimm(sl,imm[i],tl);
2510               else
2511                 emit_movimm(constmap[i][sl]|imm[i],tl);
2512             }
2513             if(opcode[i]==0x0e) //XORI
2514             if(sl<0) {
2515               emit_xorimm(tl,imm[i],tl);
2516             }else{
2517               if(!((i_regs->wasconst>>sl)&1))
2518                 emit_xorimm(sl,imm[i],tl);
2519               else
2520                 emit_movimm(constmap[i][sl]^imm[i],tl);
2521             }
2522           }
2523           else {
2524             emit_movimm(imm[i],tl);
2525             if(th>=0) emit_zeroreg(th);
2526           }
2527         }
2528       }
2529     }
2530   }
2531 }
2532
2533 void shiftimm_assemble(int i,struct regstat *i_regs)
2534 {
2535   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2536   {
2537     if(rt1[i]) {
2538       signed char s,t;
2539       t=get_reg(i_regs->regmap,rt1[i]);
2540       s=get_reg(i_regs->regmap,rs1[i]);
2541       //assert(t>=0);
2542       if(t>=0){
2543         if(rs1[i]==0)
2544         {
2545           emit_zeroreg(t);
2546         }
2547         else
2548         {
2549           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2550           if(imm[i]) {
2551             if(opcode2[i]==0) // SLL
2552             {
2553               emit_shlimm(s<0?t:s,imm[i],t);
2554             }
2555             if(opcode2[i]==2) // SRL
2556             {
2557               emit_shrimm(s<0?t:s,imm[i],t);
2558             }
2559             if(opcode2[i]==3) // SRA
2560             {
2561               emit_sarimm(s<0?t:s,imm[i],t);
2562             }
2563           }else{
2564             // Shift by zero
2565             if(s>=0 && s!=t) emit_mov(s,t);
2566           }
2567         }
2568       }
2569       //emit_storereg(rt1[i],t); //DEBUG
2570     }
2571   }
2572   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2573   {
2574     if(rt1[i]) {
2575       signed char sh,sl,th,tl;
2576       th=get_reg(i_regs->regmap,rt1[i]|64);
2577       tl=get_reg(i_regs->regmap,rt1[i]);
2578       sh=get_reg(i_regs->regmap,rs1[i]|64);
2579       sl=get_reg(i_regs->regmap,rs1[i]);
2580       if(tl>=0) {
2581         if(rs1[i]==0)
2582         {
2583           emit_zeroreg(tl);
2584           if(th>=0) emit_zeroreg(th);
2585         }
2586         else
2587         {
2588           assert(sl>=0);
2589           assert(sh>=0);
2590           if(imm[i]) {
2591             if(opcode2[i]==0x38) // DSLL
2592             {
2593               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2594               emit_shlimm(sl,imm[i],tl);
2595             }
2596             if(opcode2[i]==0x3a) // DSRL
2597             {
2598               emit_shrdimm(sl,sh,imm[i],tl);
2599               if(th>=0) emit_shrimm(sh,imm[i],th);
2600             }
2601             if(opcode2[i]==0x3b) // DSRA
2602             {
2603               emit_shrdimm(sl,sh,imm[i],tl);
2604               if(th>=0) emit_sarimm(sh,imm[i],th);
2605             }
2606           }else{
2607             // Shift by zero
2608             if(sl!=tl) emit_mov(sl,tl);
2609             if(th>=0&&sh!=th) emit_mov(sh,th);
2610           }
2611         }
2612       }
2613     }
2614   }
2615   if(opcode2[i]==0x3c) // DSLL32
2616   {
2617     if(rt1[i]) {
2618       signed char sl,tl,th;
2619       tl=get_reg(i_regs->regmap,rt1[i]);
2620       th=get_reg(i_regs->regmap,rt1[i]|64);
2621       sl=get_reg(i_regs->regmap,rs1[i]);
2622       if(th>=0||tl>=0){
2623         assert(tl>=0);
2624         assert(th>=0);
2625         assert(sl>=0);
2626         emit_mov(sl,th);
2627         emit_zeroreg(tl);
2628         if(imm[i]>32)
2629         {
2630           emit_shlimm(th,imm[i]&31,th);
2631         }
2632       }
2633     }
2634   }
2635   if(opcode2[i]==0x3e) // DSRL32
2636   {
2637     if(rt1[i]) {
2638       signed char sh,tl,th;
2639       tl=get_reg(i_regs->regmap,rt1[i]);
2640       th=get_reg(i_regs->regmap,rt1[i]|64);
2641       sh=get_reg(i_regs->regmap,rs1[i]|64);
2642       if(tl>=0){
2643         assert(sh>=0);
2644         emit_mov(sh,tl);
2645         if(th>=0) emit_zeroreg(th);
2646         if(imm[i]>32)
2647         {
2648           emit_shrimm(tl,imm[i]&31,tl);
2649         }
2650       }
2651     }
2652   }
2653   if(opcode2[i]==0x3f) // DSRA32
2654   {
2655     if(rt1[i]) {
2656       signed char sh,tl;
2657       tl=get_reg(i_regs->regmap,rt1[i]);
2658       sh=get_reg(i_regs->regmap,rs1[i]|64);
2659       if(tl>=0){
2660         assert(sh>=0);
2661         emit_mov(sh,tl);
2662         if(imm[i]>32)
2663         {
2664           emit_sarimm(tl,imm[i]&31,tl);
2665         }
2666       }
2667     }
2668   }
2669 }
2670
2671 #ifndef shift_assemble
2672 void shift_assemble(int i,struct regstat *i_regs)
2673 {
2674   printf("Need shift_assemble for this architecture.\n");
2675   exit(1);
2676 }
2677 #endif
2678
2679 void load_assemble(int i,struct regstat *i_regs)
2680 {
2681   int s,th,tl,addr,map=-1;
2682   int offset;
2683   int jaddr=0;
2684   int memtarget,c=0;
2685   u_int hr,reglist=0;
2686   th=get_reg(i_regs->regmap,rt1[i]|64);
2687   tl=get_reg(i_regs->regmap,rt1[i]);
2688   s=get_reg(i_regs->regmap,rs1[i]);
2689   offset=imm[i];
2690   for(hr=0;hr<HOST_REGS;hr++) {
2691     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2692   }
2693   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
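  // If the base register holds a known constant, the effective address is
  // known at compile time: 'c' flags that case and 'memtarget' records
  // whether the constant address lands in directly addressable RDRAM, so the
  // run-time range check (and its stub) can be skipped below.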
2694   if(s>=0) {
2695     c=(i_regs->wasconst>>s)&1;
2696     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2697     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2698   }
2699   if(offset||s<0||c) addr=tl;
2700   else addr=s;
2701   //printf("load_assemble: c=%d\n",c);
2702   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2703   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2704   if(tl>=0) {
2705     //assert(tl>=0);
2706     //assert(rt1[i]);
2707     reglist&=~(1<<tl);
2708     if(th>=0) reglist&=~(1<<th);
2709     if(!using_tlb) {
2710       if(!c) {
2711 //#define R29_HACK 1
2712         #ifdef R29_HACK
2713         // Strmnnrmn's speed hack
2714         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2715         #endif
2716         {
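          // The signed compare against 0x800000 overflows exactly for
          // addresses in 0x80000000-0x807FFFFF (KSEG0 RDRAM), so the
          // "no overflow" branch below is the slow path for everything
          // that is not ordinary RAM.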
2717           emit_cmpimm(addr,0x800000);
2718           jaddr=(int)out;
2719           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2720           // Hint to branch predictor that the branch is unlikely to be taken
2721           if(rs1[i]>=28)
2722             emit_jno_unlikely(0);
2723           else
2724           #endif
2725           emit_jno(0);
2726         }
2727       }
2728     }else{ // using tlb
2729       int x=0;
2730       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2731       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2732       map=get_reg(i_regs->regmap,TLREG);
2733       assert(map>=0);
2734       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2735       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2736     }
2737     if (opcode[i]==0x20) { // LB
2738       if(!c||memtarget) {
2739         #ifdef HOST_IMM_ADDR32
2740         if(c)
2741           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2742         else
2743         #endif
2744         {
2745           //emit_xorimm(addr,3,tl);
2746           //gen_tlb_addr_r(tl,map);
2747           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2748           int x=0;
2749           if(!c) emit_xorimm(addr,3,tl);
2750           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2751           emit_movsbl_indexed_tlb(x,tl,map,tl);
2752         }
2753         if(jaddr)
2754           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2755       }
2756       else
2757         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2758     }
2759     if (opcode[i]==0x21) { // LH
2760       if(!c||memtarget) {
2761         #ifdef HOST_IMM_ADDR32
2762         if(c)
2763           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2764         else
2765         #endif
2766         {
2767           int x=0;
2768           if(!c) emit_xorimm(addr,2,tl);
2769           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2770           //#ifdef
2771           //emit_movswl_indexed_tlb(x,tl,map,tl);
2772           //else
2773           if(map>=0) {
2774             gen_tlb_addr_r(tl,map);
2775             emit_movswl_indexed(x,tl,tl);
2776           }else
2777             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2778         }
2779         if(jaddr)
2780           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2781       }
2782       else
2783         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2784     }
2785     if (opcode[i]==0x23) { // LW
2786       if(!c||memtarget) {
2787         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2788         #ifdef HOST_IMM_ADDR32
2789         if(c)
2790           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2791         else
2792         #endif
2793         emit_readword_indexed_tlb(0,addr,map,tl);
2794         if(jaddr)
2795           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2796       }
2797       else
2798         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2799     }
2800     if (opcode[i]==0x24) { // LBU
2801       if(!c||memtarget) {
2802         #ifdef HOST_IMM_ADDR32
2803         if(c)
2804           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2805         else
2806         #endif
2807         {
2808           //emit_xorimm(addr,3,tl);
2809           //gen_tlb_addr_r(tl,map);
2810           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2811           int x=0;
2812           if(!c) emit_xorimm(addr,3,tl);
2813           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2814           emit_movzbl_indexed_tlb(x,tl,map,tl);
2815         }
2816         if(jaddr)
2817           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2818       }
2819       else
2820         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2821     }
2822     if (opcode[i]==0x25) { // LHU
2823       if(!c||memtarget) {
2824         #ifdef HOST_IMM_ADDR32
2825         if(c)
2826           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2827         else
2828         #endif
2829         {
2830           int x=0;
2831           if(!c) emit_xorimm(addr,2,tl);
2832           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2833           //#ifdef
2834           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2835           //#else
2836           if(map>=0) {
2837             gen_tlb_addr_r(tl,map);
2838             emit_movzwl_indexed(x,tl,tl);
2839           }else
2840             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2841         }
2842         if(jaddr)
2843           add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2844       }
2845       else
2846         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2847     }
2848     if (opcode[i]==0x27) { // LWU
2849       assert(th>=0);
2850       if(!c||memtarget) {
2851         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2852         #ifdef HOST_IMM_ADDR32
2853         if(c)
2854           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2855         else
2856         #endif
2857         emit_readword_indexed_tlb(0,addr,map,tl);
2858         if(jaddr)
2859           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2860       }
2861       else {
2862         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2863       }
2864       emit_zeroreg(th);
2865     }
2866     if (opcode[i]==0x37) { // LD
2867       if(!c||memtarget) {
2868         //gen_tlb_addr_r(tl,map);
2869         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2870         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2871         #ifdef HOST_IMM_ADDR32
2872         if(c)
2873           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2874         else
2875         #endif
2876         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2877         if(jaddr)
2878           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2879       }
2880       else
2881         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2882     }
2883     //emit_storereg(rt1[i],tl); // DEBUG
2884   }
2885   //if(opcode[i]==0x23)
2886   //if(opcode[i]==0x24)
2887   //if(opcode[i]==0x23||opcode[i]==0x24)
2888   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2889   {
2890     //emit_pusha();
2891     save_regs(0x100f);
2892         emit_readword((int)&last_count,ECX);
2893         #ifdef __i386__
2894         if(get_reg(i_regs->regmap,CCREG)<0)
2895           emit_loadreg(CCREG,HOST_CCREG);
2896         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2897         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2898         emit_writeword(HOST_CCREG,(int)&Count);
2899         #endif
2900         #ifdef __arm__
2901         if(get_reg(i_regs->regmap,CCREG)<0)
2902           emit_loadreg(CCREG,0);
2903         else
2904           emit_mov(HOST_CCREG,0);
2905         emit_add(0,ECX,0);
2906         emit_addimm(0,2*ccadj[i],0);
2907         emit_writeword(0,(int)&Count);
2908         #endif
2909     emit_call((int)memdebug);
2910     //emit_popa();
2911     restore_regs(0x100f);
2912   }/**/
2913 }
2914
2915 #ifndef loadlr_assemble
2916 void loadlr_assemble(int i,struct regstat *i_regs)
2917 {
2918   printf("Need loadlr_assemble for this architecture.\n");
2919   exit(1);
2920 }
2921 #endif
2922
2923 void store_assemble(int i,struct regstat *i_regs)
2924 {
2925   int s,th,tl,map=-1;
2926   int addr,temp;
2927   int offset;
2928   int jaddr=0,jaddr2,type;
2929   int memtarget,c=0;
2930   int agr=AGEN1+(i&1);
2931   u_int hr,reglist=0;
2932   th=get_reg(i_regs->regmap,rs2[i]|64);
2933   tl=get_reg(i_regs->regmap,rs2[i]);
2934   s=get_reg(i_regs->regmap,rs1[i]);
2935   temp=get_reg(i_regs->regmap,agr);
2936   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2937   offset=imm[i];
2938   if(s>=0) {
2939     c=(i_regs->wasconst>>s)&1;
2940     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2941     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2942   }
2943   assert(tl>=0);
2944   assert(temp>=0);
2945   for(hr=0;hr<HOST_REGS;hr++) {
2946     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2947   }
2948   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2949   if(offset||s<0||c) addr=temp;
2950   else addr=s;
2951   if(!using_tlb) {
2952     if(!c) {
2953       #ifdef R29_HACK
2954       // Strmnnrmn's speed hack
2955       memtarget=1;
2956       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2957       #endif
2958       emit_cmpimm(addr,0x800000);
2959       #ifdef DESTRUCTIVE_SHIFT
2960       if(s==addr) emit_mov(s,temp);
2961       #endif
2962       #ifdef R29_HACK
2963       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2964       #endif
2965       {
2966         jaddr=(int)out;
2967         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2968         // Hint to branch predictor that the branch is unlikely to be taken
2969         if(rs1[i]>=28)
2970           emit_jno_unlikely(0);
2971         else
2972         #endif
2973         emit_jno(0);
2974       }
2975     }
2976   }else{ // using tlb
2977     int x=0;
2978     if (opcode[i]==0x28) x=3; // SB
2979     if (opcode[i]==0x29) x=2; // SH
2980     map=get_reg(i_regs->regmap,TLREG);
2981     assert(map>=0);
2982     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
2983     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
2984   }
2985
2986   if (opcode[i]==0x28) { // SB
2987     if(!c||memtarget) {
2988       int x=0;
2989       if(!c) emit_xorimm(addr,3,temp);
2990       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2991       //gen_tlb_addr_w(temp,map);
2992       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2993       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
2994     }
2995     type=STOREB_STUB;
2996   }
2997   if (opcode[i]==0x29) { // SH
2998     if(!c||memtarget) {
2999       int x=0;
3000       if(!c) emit_xorimm(addr,2,temp);
3001       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3002       //#ifdef
3003       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3004       //#else
3005       if(map>=0) {
3006         gen_tlb_addr_w(temp,map);
3007         emit_writehword_indexed(tl,x,temp);
3008       }else
3009         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3010     }
3011     type=STOREH_STUB;
3012   }
3013   if (opcode[i]==0x2B) { // SW
3014     if(!c||memtarget)
3015       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3016       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3017     type=STOREW_STUB;
3018   }
3019   if (opcode[i]==0x3F) { // SD
3020     if(!c||memtarget) {
3021       if(rs2[i]) {
3022         assert(th>=0);
3023         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3024         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3025         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3026       }else{
3027         // Store zero
3028         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3029         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3030         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3031       }
3032     }
3033     type=STORED_STUB;
3034   }
3035   if(jaddr) {
3036     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3037   } else if(!memtarget) {
3038     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3039   }
3040   if(!using_tlb) {
3041     if(!c||memtarget) {
3042       #ifdef DESTRUCTIVE_SHIFT
3043       // The x86 shift operation is 'destructive'; it overwrites the
3044       // source register, so we need to make a copy first and use that.
3045       addr=temp;
3046       #endif
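      // After the store, look up invalid_code[addr>>12]; if the page is not
      // already marked invalid it may contain translated code, so branch to
      // INVCODE_STUB to invalidate any blocks compiled from that page
      // (self-modifying code detection).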
3047       #if defined(HOST_IMM8)
3048       int ir=get_reg(i_regs->regmap,INVCP);
3049       assert(ir>=0);
3050       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3051       #else
3052       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3053       #endif
3054       jaddr2=(int)out;
3055       emit_jne(0);
3056       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3057     }
3058   }
3059   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3060   //if(opcode[i]==0x2B || opcode[i]==0x28)
3061   //if(opcode[i]==0x2B || opcode[i]==0x29)
3062   //if(opcode[i]==0x2B)
3063   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3064   {
3065     //emit_pusha();
3066     save_regs(0x100f);
3067         emit_readword((int)&last_count,ECX);
3068         #ifdef __i386__
3069         if(get_reg(i_regs->regmap,CCREG)<0)
3070           emit_loadreg(CCREG,HOST_CCREG);
3071         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3072         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3073         emit_writeword(HOST_CCREG,(int)&Count);
3074         #endif
3075         #ifdef __arm__
3076         if(get_reg(i_regs->regmap,CCREG)<0)
3077           emit_loadreg(CCREG,0);
3078         else
3079           emit_mov(HOST_CCREG,0);
3080         emit_add(0,ECX,0);
3081         emit_addimm(0,2*ccadj[i],0);
3082         emit_writeword(0,(int)&Count);
3083         #endif
3084     emit_call((int)memdebug);
3085     //emit_popa();
3086     restore_regs(0x100f);
3087   }/**/
3088 }
3089
3090 void storelr_assemble(int i,struct regstat *i_regs)
3091 {
3092   int s,th,tl;
3093   int temp;
3094   int temp2;
3095   int offset;
3096   int jaddr=0,jaddr2;
3097   int case1,case2,case3;
3098   int done0,done1,done2;
3099   int memtarget,c=0;
3100   u_int hr,reglist=0;
3101   th=get_reg(i_regs->regmap,rs2[i]|64);
3102   tl=get_reg(i_regs->regmap,rs2[i]);
3103   s=get_reg(i_regs->regmap,rs1[i]);
3104   temp=get_reg(i_regs->regmap,-1);
3105   offset=imm[i];
3106   if(s>=0) {
3107     c=(i_regs->isconst>>s)&1;
3108     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3109     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3110   }
3111   assert(tl>=0);
3112   for(hr=0;hr<HOST_REGS;hr++) {
3113     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3114   }
3115   if(tl>=0) {
3116     assert(temp>=0);
3117     if(!using_tlb) {
3118       if(!c) {
3119         emit_cmpimm(s<0||offset?temp:s,0x800000);
3120         if(!offset&&s!=temp) emit_mov(s,temp);
3121         jaddr=(int)out;
3122         emit_jno(0);
3123       }
3124       else
3125       {
3126         if(!memtarget||!rs1[i]) {
3127           jaddr=(int)out;
3128           emit_jmp(0);
3129         }
3130       }
3131       if((u_int)rdram!=0x80000000) 
3132         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3133     }else{ // using tlb
3134       int map=get_reg(i_regs->regmap,TLREG);
3135       assert(map>=0);
3136       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3137       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3138       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3139       if(!jaddr&&!memtarget) {
3140         jaddr=(int)out;
3141         emit_jmp(0);
3142       }
3143       gen_tlb_addr_w(temp,map);
3144     }
3145
3146     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3147       temp2=get_reg(i_regs->regmap,FTEMP);
3148       if(!rs2[i]) temp2=th=tl;
3149     }
3150
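    // Dispatch on the low two address bits: the tests on bit 1 and bit 0
    // split execution into four cases (byte offset 0-3 within the word),
    // each writing just the bytes that SWL/SWR/SDL/SDR touch at that
    // alignment.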
3151     emit_testimm(temp,2);
3152     case2=(int)out;
3153     emit_jne(0);
3154     emit_testimm(temp,1);
3155     case1=(int)out;
3156     emit_jne(0);
3157     // 0
3158     if (opcode[i]==0x2A) { // SWL
3159       emit_writeword_indexed(tl,0,temp);
3160     }
3161     if (opcode[i]==0x2E) { // SWR
3162       emit_writebyte_indexed(tl,3,temp);
3163     }
3164     if (opcode[i]==0x2C) { // SDL
3165       emit_writeword_indexed(th,0,temp);
3166       if(rs2[i]) emit_mov(tl,temp2);
3167     }
3168     if (opcode[i]==0x2D) { // SDR
3169       emit_writebyte_indexed(tl,3,temp);
3170       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3171     }
3172     done0=(int)out;
3173     emit_jmp(0);
3174     // 1
3175     set_jump_target(case1,(int)out);
3176     if (opcode[i]==0x2A) { // SWL
3177       // Write 3 msb into three least significant bytes
3178       if(rs2[i]) emit_rorimm(tl,8,tl);
3179       emit_writehword_indexed(tl,-1,temp);
3180       if(rs2[i]) emit_rorimm(tl,16,tl);
3181       emit_writebyte_indexed(tl,1,temp);
3182       if(rs2[i]) emit_rorimm(tl,8,tl);
3183     }
3184     if (opcode[i]==0x2E) { // SWR
3185       // Write two lsb into two most significant bytes
3186       emit_writehword_indexed(tl,1,temp);
3187     }
3188     if (opcode[i]==0x2C) { // SDL
3189       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3190       // Write 3 msb into three least significant bytes
3191       if(rs2[i]) emit_rorimm(th,8,th);
3192       emit_writehword_indexed(th,-1,temp);
3193       if(rs2[i]) emit_rorimm(th,16,th);
3194       emit_writebyte_indexed(th,1,temp);
3195       if(rs2[i]) emit_rorimm(th,8,th);
3196     }
3197     if (opcode[i]==0x2D) { // SDR
3198       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3199       // Write two lsb into two most significant bytes
3200       emit_writehword_indexed(tl,1,temp);
3201     }
3202     done1=(int)out;
3203     emit_jmp(0);
3204     // 2
3205     set_jump_target(case2,(int)out);
3206     emit_testimm(temp,1);
3207     case3=(int)out;
3208     emit_jne(0);
3209     if (opcode[i]==0x2A) { // SWL
3210       // Write two msb into two least significant bytes
3211       if(rs2[i]) emit_rorimm(tl,16,tl);
3212       emit_writehword_indexed(tl,-2,temp);
3213       if(rs2[i]) emit_rorimm(tl,16,tl);
3214     }
3215     if (opcode[i]==0x2E) { // SWR
3216       // Write 3 lsb into three most significant bytes
3217       emit_writebyte_indexed(tl,-1,temp);
3218       if(rs2[i]) emit_rorimm(tl,8,tl);
3219       emit_writehword_indexed(tl,0,temp);
3220       if(rs2[i]) emit_rorimm(tl,24,tl);
3221     }
3222     if (opcode[i]==0x2C) { // SDL
3223       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3224       // Write two msb into two least significant bytes
3225       if(rs2[i]) emit_rorimm(th,16,th);
3226       emit_writehword_indexed(th,-2,temp);
3227       if(rs2[i]) emit_rorimm(th,16,th);
3228     }
3229     if (opcode[i]==0x2D) { // SDR
3230       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3231       // Write 3 lsb into three most significant bytes
3232       emit_writebyte_indexed(tl,-1,temp);
3233       if(rs2[i]) emit_rorimm(tl,8,tl);
3234       emit_writehword_indexed(tl,0,temp);
3235       if(rs2[i]) emit_rorimm(tl,24,tl);
3236     }
3237     done2=(int)out;
3238     emit_jmp(0);
3239     // 3
3240     set_jump_target(case3,(int)out);
3241     if (opcode[i]==0x2A) { // SWL
3242       // Write msb into least significant byte
3243       if(rs2[i]) emit_rorimm(tl,24,tl);
3244       emit_writebyte_indexed(tl,-3,temp);
3245       if(rs2[i]) emit_rorimm(tl,8,tl);
3246     }
3247     if (opcode[i]==0x2E) { // SWR
3248       // Write entire word
3249       emit_writeword_indexed(tl,-3,temp);
3250     }
3251     if (opcode[i]==0x2C) { // SDL
3252       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3253       // Write msb into least significant byte
3254       if(rs2[i]) emit_rorimm(th,24,th);
3255       emit_writebyte_indexed(th,-3,temp);
3256       if(rs2[i]) emit_rorimm(th,8,th);
3257     }
3258     if (opcode[i]==0x2D) { // SDR
3259       if(rs2[i]) emit_mov(th,temp2);
3260       // Write entire word
3261       emit_writeword_indexed(tl,-3,temp);
3262     }
3263     set_jump_target(done0,(int)out);
3264     set_jump_target(done1,(int)out);
3265     set_jump_target(done2,(int)out);
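    // For SDL/SDR the other half of the 64-bit value was assembled into
    // temp2 above; depending on which word of the doubleword the address
    // falls in, it still has to be written to the adjacent word
    // (+4 for SDL, -4 for SDR).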
3266     if (opcode[i]==0x2C) { // SDL
3267       emit_testimm(temp,4);
3268       done0=(int)out;
3269       emit_jne(0);
3270       emit_andimm(temp,~3,temp);
3271       emit_writeword_indexed(temp2,4,temp);
3272       set_jump_target(done0,(int)out);
3273     }
3274     if (opcode[i]==0x2D) { // SDR
3275       emit_testimm(temp,4);
3276       done0=(int)out;
3277       emit_jeq(0);
3278       emit_andimm(temp,~3,temp);
3279       emit_writeword_indexed(temp2,-4,temp);
3280       set_jump_target(done0,(int)out);
3281     }
3282     if(!c||!memtarget)
3283       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3284   }
3285   if(!using_tlb) {
3286     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3287     #if defined(HOST_IMM8)
3288     int ir=get_reg(i_regs->regmap,INVCP);
3289     assert(ir>=0);
3290     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3291     #else
3292     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3293     #endif
3294     jaddr2=(int)out;
3295     emit_jne(0);
3296     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3297   }
3298   /*
3299     emit_pusha();
3300     //save_regs(0x100f);
3301         emit_readword((int)&last_count,ECX);
3302         if(get_reg(i_regs->regmap,CCREG)<0)
3303           emit_loadreg(CCREG,HOST_CCREG);
3304         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3305         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3306         emit_writeword(HOST_CCREG,(int)&Count);
3307     emit_call((int)memdebug);
3308     emit_popa();
3309     //restore_regs(0x100f);
3310   /**/
3311 }
3312
3313 void c1ls_assemble(int i,struct regstat *i_regs)
3314 {
3315 #ifndef DISABLE_COP1
3316   int s,th,tl;
3317   int temp,ar;
3318   int map=-1;
3319   int offset;
3320   int c=0;
3321   int jaddr,jaddr2=0,jaddr3,type;
3322   int agr=AGEN1+(i&1);
3323   u_int hr,reglist=0;
3324   th=get_reg(i_regs->regmap,FTEMP|64);
3325   tl=get_reg(i_regs->regmap,FTEMP);
3326   s=get_reg(i_regs->regmap,rs1[i]);
3327   temp=get_reg(i_regs->regmap,agr);
3328   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3329   offset=imm[i];
3330   assert(tl>=0);
3331   assert(rs1[i]>0);
3332   assert(temp>=0);
3333   for(hr=0;hr<HOST_REGS;hr++) {
3334     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3335   }
3336   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3337   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3338   {
3339     // Loads use a temporary register which we need to save
3340     reglist|=1<<temp;
3341   }
3342   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3343     ar=temp;
3344   else // LWC1/LDC1
3345     ar=tl;
3346   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3347   //else c=(i_regs->wasconst>>s)&1;
3348   if(s>=0) c=(i_regs->wasconst>>s)&1;
3349   // Check cop1 unusable
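  // (test the CU1 bit, 0x20000000, of the Status register; if it is clear,
  //  branch to FP_STUB.  cop1_usable records that the check has already been
  //  emitted so it is not repeated.)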
3350   if(!cop1_usable) {
3351     signed char rs=get_reg(i_regs->regmap,CSREG);
3352     assert(rs>=0);
3353     emit_testimm(rs,0x20000000);
3354     jaddr=(int)out;
3355     emit_jeq(0);
3356     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3357     cop1_usable=1;
3358   }
3359   if (opcode[i]==0x39) { // SWC1 (get float address)
3360     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3361   }
3362   if (opcode[i]==0x3D) { // SDC1 (get double address)
3363     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3364   }
3365   // Generate address + offset
3366   if(!using_tlb) {
3367     if(!c)
3368       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3369   }
3370   else
3371   {
3372     map=get_reg(i_regs->regmap,TLREG);
3373     assert(map>=0);
3374     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3375       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3376     }
3377     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3378       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3379     }
3380   }
3381   if (opcode[i]==0x39) { // SWC1 (read float)
3382     emit_readword_indexed(0,tl,tl);
3383   }
3384   if (opcode[i]==0x3D) { // SDC1 (read double)
3385     emit_readword_indexed(4,tl,th);
3386     emit_readword_indexed(0,tl,tl);
3387   }
3388   if (opcode[i]==0x31) { // LWC1 (get target address)
3389     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3390   }
3391   if (opcode[i]==0x35) { // LDC1 (get target address)
3392     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3393   }
3394   if(!using_tlb) {
3395     if(!c) {
3396       jaddr2=(int)out;
3397       emit_jno(0);
3398     }
3399     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3400       jaddr2=(int)out;
3401       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3402     }
3403     #ifdef DESTRUCTIVE_SHIFT
3404     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3405       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3406     }
3407     #endif
3408   }else{
3409     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3410       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3411     }
3412     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3413       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3414     }
3415   }
3416   if (opcode[i]==0x31) { // LWC1
3417     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3418     //gen_tlb_addr_r(ar,map);
3419     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3420     #ifdef HOST_IMM_ADDR32
3421     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3422     else
3423     #endif
3424     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3425     type=LOADW_STUB;
3426   }
3427   if (opcode[i]==0x35) { // LDC1
3428     assert(th>=0);
3429     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3430     //gen_tlb_addr_r(ar,map);
3431     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3432     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3433     #ifdef HOST_IMM_ADDR32
3434     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3435     else
3436     #endif
3437     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3438     type=LOADD_STUB;
3439   }
3440   if (opcode[i]==0x39) { // SWC1
3441     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3442     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3443     type=STOREW_STUB;
3444   }
3445   if (opcode[i]==0x3D) { // SDC1
3446     assert(th>=0);
3447     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3448     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3449     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3450     type=STORED_STUB;
3451   }
3452   if(!using_tlb) {
3453     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3454       #ifndef DESTRUCTIVE_SHIFT
3455       temp=offset||c||s<0?ar:s;
3456       #endif
3457       #if defined(HOST_IMM8)
3458       int ir=get_reg(i_regs->regmap,INVCP);
3459       assert(ir>=0);
3460       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3461       #else
3462       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3463       #endif
3464       jaddr3=(int)out;
3465       emit_jne(0);
3466       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3467     }
3468   }
3469   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3470   if (opcode[i]==0x31) { // LWC1 (write float)
3471     emit_writeword_indexed(tl,0,temp);
3472   }
3473   if (opcode[i]==0x35) { // LDC1 (write double)
3474     emit_writeword_indexed(th,4,temp);
3475     emit_writeword_indexed(tl,0,temp);
3476   }
3477   //if(opcode[i]==0x39)
3478   /*if(opcode[i]==0x39||opcode[i]==0x31)
3479   {
3480     emit_pusha();
3481         emit_readword((int)&last_count,ECX);
3482         if(get_reg(i_regs->regmap,CCREG)<0)
3483           emit_loadreg(CCREG,HOST_CCREG);
3484         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3485         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3486         emit_writeword(HOST_CCREG,(int)&Count);
3487     emit_call((int)memdebug);
3488     emit_popa();
3489   }/**/
3490 #else
3491   cop1_unusable(i, i_regs);
3492 #endif
3493 }
3494
3495 #ifndef multdiv_assemble
3496 void multdiv_assemble(int i,struct regstat *i_regs)
3497 {
3498   printf("Need multdiv_assemble for this architecture.\n");
3499   exit(1);
3500 }
3501 #endif
3502
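// Assemble a HI/LO move (MFHI/MFLO/MTHI/MTLO): copy rs1 into rt1,
// including the upper half when a 64-bit mapping exists.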
3503 void mov_assemble(int i,struct regstat *i_regs)
3504 {
3505   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3506   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3507   assert(rt1[i]>0);
3508   if(rt1[i]) {
3509     signed char sh,sl,th,tl;
3510     th=get_reg(i_regs->regmap,rt1[i]|64);
3511     tl=get_reg(i_regs->regmap,rt1[i]);
3512     //assert(tl>=0);
3513     if(tl>=0) {
3514       sh=get_reg(i_regs->regmap,rs1[i]|64);
3515       sl=get_reg(i_regs->regmap,rs1[i]);
3516       if(sl>=0) emit_mov(sl,tl);
3517       else emit_loadreg(rs1[i],tl);
3518       if(th>=0) {
3519         if(sh>=0) emit_mov(sh,th);
3520         else emit_loadreg(rs1[i]|64,th);
3521       }
3522     }
3523   }
3524 }
3525
3526 #ifndef fconv_assemble
3527 void fconv_assemble(int i,struct regstat *i_regs)
3528 {
3529   printf("Need fconv_assemble for this architecture.\n");
3530   exit(1);
3531 }
3532 #endif
3533
3534 #if 0
3535 void float_assemble(int i,struct regstat *i_regs)
3536 {
3537   printf("Need float_assemble for this architecture.\n");
3538   exit(1);
3539 }
3540 #endif
3541
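// Assemble SYSCALL: load the PC into EAX, update the cycle count and jump to the syscall handler.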
3542 void syscall_assemble(int i,struct regstat *i_regs)
3543 {
3544   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3545   assert(ccreg==HOST_CCREG);
3546   assert(!is_delayslot);
3547   emit_movimm(start+i*4,EAX); // Get PC
3548   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3549   emit_jmp((int)jump_syscall);
3550 }
3551
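// Assemble an instruction in a branch delay slot: dispatch on itype with is_delayslot set.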
3552 void ds_assemble(int i,struct regstat *i_regs)
3553 {
3554   is_delayslot=1;
3555   switch(itype[i]) {
3556     case ALU:
3557       alu_assemble(i,i_regs);break;
3558     case IMM16:
3559       imm16_assemble(i,i_regs);break;
3560     case SHIFT:
3561       shift_assemble(i,i_regs);break;
3562     case SHIFTIMM:
3563       shiftimm_assemble(i,i_regs);break;
3564     case LOAD:
3565       load_assemble(i,i_regs);break;
3566     case LOADLR:
3567       loadlr_assemble(i,i_regs);break;
3568     case STORE:
3569       store_assemble(i,i_regs);break;
3570     case STORELR:
3571       storelr_assemble(i,i_regs);break;
3572     case COP0:
3573       cop0_assemble(i,i_regs);break;
3574     case COP1:
3575       cop1_assemble(i,i_regs);break;
3576     case C1LS:
3577       c1ls_assemble(i,i_regs);break;
3578     case FCONV:
3579       fconv_assemble(i,i_regs);break;
3580     case FLOAT:
3581       float_assemble(i,i_regs);break;
3582     case FCOMP:
3583       fcomp_assemble(i,i_regs);break;
3584     case MULTDIV:
3585       multdiv_assemble(i,i_regs);break;
3586     case MOV:
3587       mov_assemble(i,i_regs);break;
3588     case SYSCALL:
3589     case SPAN:
3590     case UJUMP:
3591     case RJUMP:
3592     case CJUMP:
3593     case SJUMP:
3594     case FJUMP:
3595       printf("Jump in the delay slot.  This is probably a bug.\n");
3596   }
3597   is_delayslot=0;
3598 }
3599
3600 // Is the branch target a valid internal jump?
3601 int internal_branch(uint64_t i_is32,int addr)
3602 {
3603   if(addr&1) return 0; // Indirect (register) jump
3604   if(addr>=start && addr<start+slen*4-4)
3605   {
3606     int t=(addr-start)>>2;
3607     // Delay slots are not valid branch targets
3608     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3609     // 64 -> 32 bit transition requires a recompile
3610     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3611     {
3612       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3613       else printf("optimizable: yes\n");
3614     }*/
3615     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3616     if(requires_32bit[t]&~i_is32) return 0;
3617     else return 1;
3618   }
3619   return 0;
3620 }
3621
3622 #ifndef wb_invalidate
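// Write back registers whose host-register assignment changes between 'pre' and 'entry':
// dirty values that lose their mapping are stored back to memory, while values that only
// move to a different host register are copied without a writeback.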
3623 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3624   uint64_t u,uint64_t uu)
3625 {
3626   int hr;
3627   for(hr=0;hr<HOST_REGS;hr++) {
3628     if(hr!=EXCLUDE_REG) {
3629       if(pre[hr]!=entry[hr]) {
3630         if(pre[hr]>=0) {
3631           if((dirty>>hr)&1) {
3632             if(get_reg(entry,pre[hr])<0) {
3633               if(pre[hr]<64) {
3634                 if(!((u>>pre[hr])&1)) {
3635                   emit_storereg(pre[hr],hr);
3636                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3637                     emit_sarimm(hr,31,hr);
3638                     emit_storereg(pre[hr]|64,hr);
3639                   }
3640                 }
3641               }else{
3642                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3643                   emit_storereg(pre[hr],hr);
3644                 }
3645               }
3646             }
3647           }
3648         }
3649       }
3650     }
3651   }
3652   // Move from one register to another (no writeback)
3653   for(hr=0;hr<HOST_REGS;hr++) {
3654     if(hr!=EXCLUDE_REG) {
3655       if(pre[hr]!=entry[hr]) {
3656         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3657           int nr;
3658           if((nr=get_reg(entry,pre[hr]))>=0) {
3659             emit_mov(hr,nr);
3660           }
3661         }
3662       }
3663     }
3664   }
3665 }
3666 #endif
3667
3668 // Load the specified registers
3669 // This only loads the registers given as arguments because
3670 // we don't want to load things that will be overwritten
3671 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3672 {
3673   int hr;
3674   // Load 32-bit regs
3675   for(hr=0;hr<HOST_REGS;hr++) {
3676     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3677       if(entry[hr]!=regmap[hr]) {
3678         if(regmap[hr]==rs1||regmap[hr]==rs2)
3679         {
3680           if(regmap[hr]==0) {
3681             emit_zeroreg(hr);
3682           }
3683           else
3684           {
3685             emit_loadreg(regmap[hr],hr);
3686           }
3687         }
3688       }
3689     }
3690   }
3691   // Load 64-bit regs
3692   for(hr=0;hr<HOST_REGS;hr++) {
3693     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3694       if(entry[hr]!=regmap[hr]) {
3695         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3696         {
3697           assert(regmap[hr]!=64);
3698           if((is32>>(regmap[hr]&63))&1) {
3699             int lr=get_reg(regmap,regmap[hr]-64);
3700             if(lr>=0)
3701               emit_sarimm(lr,31,hr);
3702             else
3703               emit_loadreg(regmap[hr],hr);
3704           }
3705           else
3706           {
3707             emit_loadreg(regmap[hr],hr);
3708           }
3709         }
3710       }
3711     }
3712   }
3713 }
3714
3715 // Load registers prior to the start of a loop
3716 // so that they are not loaded within the loop
3717 static void loop_preload(signed char pre[],signed char entry[])
3718 {
3719   int hr;
3720   for(hr=0;hr<HOST_REGS;hr++) {
3721     if(hr!=EXCLUDE_REG) {
3722       if(pre[hr]!=entry[hr]) {
3723         if(entry[hr]>=0) {
3724           if(get_reg(pre,entry[hr])<0) {
3725             assem_debug("loop preload:\n");
3726             //printf("loop preload: %d\n",hr);
3727             if(entry[hr]==0) {
3728               emit_zeroreg(hr);
3729             }
3730             else if(entry[hr]<TEMPREG)
3731             {
3732               emit_loadreg(entry[hr],hr);
3733             }
3734             else if(entry[hr]-64<TEMPREG)
3735             {
3736               emit_loadreg(entry[hr],hr);
3737             }
3738           }
3739         }
3740       }
3741     }
3742   }
3743 }
3744
3745 // Generate address for load/store instruction
3746 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3747 {
3748   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3749     int ra;
3750     int agr=AGEN1+(i&1);
3751     int mgr=MGEN1+(i&1);
3752     if(itype[i]==LOAD) {
3753       ra=get_reg(i_regs->regmap,rt1[i]);
3754       //if(rt1[i]) assert(ra>=0);
3755     }
3756     if(itype[i]==LOADLR) {
3757       ra=get_reg(i_regs->regmap,FTEMP);
3758     }
3759     if(itype[i]==STORE||itype[i]==STORELR) {
3760       ra=get_reg(i_regs->regmap,agr);
3761       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3762     }
3763     if(itype[i]==C1LS) {
3764       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3765         ra=get_reg(i_regs->regmap,FTEMP);
3766       else { // SWC1/SDC1
3767         ra=get_reg(i_regs->regmap,agr);
3768         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3769       }
3770     }
3771     int rs=get_reg(i_regs->regmap,rs1[i]);
3772     int rm=get_reg(i_regs->regmap,TLREG);
3773     if(ra>=0) {
3774       int offset=imm[i];
3775       int c=(i_regs->wasconst>>rs)&1;
3776       if(rs1[i]==0) {
3777         // Using r0 as a base address
3778         /*if(rm>=0) {
3779           if(!entry||entry[rm]!=mgr) {
3780             generate_map_const(offset,rm);
3781           } // else did it in the previous cycle
3782         }*/
3783         if(!entry||entry[ra]!=agr) {
3784           if (opcode[i]==0x22||opcode[i]==0x26) {
3785             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3786           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3787             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3788           }else{
3789             emit_movimm(offset,ra);
3790           }
3791         } // else did it in the previous cycle
3792       }
3793       else if(rs<0) {
3794         if(!entry||entry[ra]!=rs1[i])
3795           emit_loadreg(rs1[i],ra);
3796         //if(!entry||entry[ra]!=rs1[i])
3797         //  printf("poor load scheduling!\n");
3798       }
3799       else if(c) {
3800         if(rm>=0) {
3801           if(!entry||entry[rm]!=mgr) {
3802             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3803               // Stores to memory go through the mapper to detect self-modifying
3804               // code, loads don't.
3805               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3806                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3807                 generate_map_const(constmap[i][rs]+offset,rm);
3808             }else{
3809               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3810                 generate_map_const(constmap[i][rs]+offset,rm);
3811             }
3812           }
3813         }
3814         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3815           if(!entry||entry[ra]!=agr) {
3816             if (opcode[i]==0x22||opcode[i]==0x26) {
3817               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3818             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3819               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3820             }else{
3821               #ifdef HOST_IMM_ADDR32
3822               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3823                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3824               #endif
3825               emit_movimm(constmap[i][rs]+offset,ra);
3826             }
3827           } // else did it in the previous cycle
3828         } // else load_consts already did it
3829       }
3830       if(offset&&!c&&rs1[i]) {
3831         if(rs>=0) {
3832           emit_addimm(rs,offset,ra);
3833         }else{
3834           emit_addimm(ra,offset,ra);
3835         }
3836       }
3837     }
3838   }
3839   // Preload constants for next instruction
3840   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3841     int agr,ra;
3842     #ifndef HOST_IMM_ADDR32
3843     // Mapper entry
3844     agr=MGEN1+((i+1)&1);
3845     ra=get_reg(i_regs->regmap,agr);
3846     if(ra>=0) {
3847       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3848       int offset=imm[i+1];
3849       int c=(regs[i+1].wasconst>>rs)&1;
3850       if(c) {
3851         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3852           // Stores to memory go through the mapper to detect self-modifying
3853           // code, loads don't.
3854           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3855              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3856             generate_map_const(constmap[i+1][rs]+offset,ra);
3857         }else{
3858           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3859             generate_map_const(constmap[i+1][rs]+offset,ra);
3860         }
3861       }
3862       /*else if(rs1[i]==0) {
3863         generate_map_const(offset,ra);
3864       }*/
3865     }
3866     #endif
3867     // Actual address
3868     agr=AGEN1+((i+1)&1);
3869     ra=get_reg(i_regs->regmap,agr);
3870     if(ra>=0) {
3871       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3872       int offset=imm[i+1];
3873       int c=(regs[i+1].wasconst>>rs)&1;
3874       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3875         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3876           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3877         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3878           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3879         }else{
3880           #ifdef HOST_IMM_ADDR32
3881           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3882              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3883           #endif
3884           emit_movimm(constmap[i+1][rs]+offset,ra);
3885         }
3886       }
3887       else if(rs1[i+1]==0) {
3888         // Using r0 as a base address
3889         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3890           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3891         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3892           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3893         }else{
3894           emit_movimm(offset,ra);
3895         }
3896       }
3897     }
3898   }
3899 }
3900
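// Find the final constant value that host register hr holds starting at instruction i,
// scanning forward while the mapping stays constant; returns zero when the constant
// does not need to be loaded.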
3901 int get_final_value(int hr, int i, int *value)
3902 {
3903   int reg=regs[i].regmap[hr];
3904   while(i<slen-1) {
3905     if(regs[i+1].regmap[hr]!=reg) break;
3906     if(!((regs[i+1].isconst>>hr)&1)) break;
3907     if(bt[i+1]) break;
3908     i++;
3909   }
3910   if(i<slen-1) {
3911     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3912       *value=constmap[i][hr];
3913       return 1;
3914     }
3915     if(!bt[i+1]) {
3916       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3917         // Load in delay slot, out-of-order execution
3918         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3919         {
3920           #ifdef HOST_IMM_ADDR32
3921           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3922           #endif
3923           // Precompute load address
3924           *value=constmap[i][hr]+imm[i+2];
3925           return 1;
3926         }
3927       }
3928       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3929       {
3930         #ifdef HOST_IMM_ADDR32
3931         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3932         #endif
3933         // Precompute load address
3934         *value=constmap[i][hr]+imm[i+1];
3935         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3936         return 1;
3937       }
3938     }
3939   }
3940   *value=constmap[i][hr];
3941   //printf("c=%x\n",(int)constmap[i][hr]);
3942   if(i==slen-1) return 1;
3943   if(reg<64) {
3944     return !((unneeded_reg[i+1]>>reg)&1);
3945   }else{
3946     return !((unneeded_reg_upper[i+1]>>reg)&1);
3947   }
3948 }
3949
3950 // Load registers with known constants
3951 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3952 {
3953   int hr;
3954   // Load 32-bit regs
3955   for(hr=0;hr<HOST_REGS;hr++) {
3956     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3957       //if(entry[hr]!=regmap[hr]) {
3958       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3959         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3960           int value;
3961           if(get_final_value(hr,i,&value)) {
3962             if(value==0) {
3963               emit_zeroreg(hr);
3964             }
3965             else {
3966               emit_movimm(value,hr);
3967             }
3968           }
3969         }
3970       }
3971     }
3972   }
3973   // Load 64-bit regs
3974   for(hr=0;hr<HOST_REGS;hr++) {
3975     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3976       //if(entry[hr]!=regmap[hr]) {
3977       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3978         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3979           if((is32>>(regmap[hr]&63))&1) {
3980             int lr=get_reg(regmap,regmap[hr]-64);
3981             assert(lr>=0);
3982             emit_sarimm(lr,31,hr);
3983           }
3984           else
3985           {
3986             int value;
3987             if(get_final_value(hr,i,&value)) {
3988               if(value==0) {
3989                 emit_zeroreg(hr);
3990               }
3991               else {
3992                 emit_movimm(value,hr);
3993               }
3994             }
3995           }
3996         }
3997       }
3998     }
3999   }
4000 }
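// Like load_consts, but loads the current constant into every dirty register that holds a known constant.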
4001 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4002 {
4003   int hr;
4004   // Load 32-bit regs
4005   for(hr=0;hr<HOST_REGS;hr++) {
4006     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4007       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4008         int value=constmap[i][hr];
4009         if(value==0) {
4010           emit_zeroreg(hr);
4011         }
4012         else {
4013           emit_movimm(value,hr);
4014         }
4015       }
4016     }
4017   }
4018   // Load 64-bit regs
4019   for(hr=0;hr<HOST_REGS;hr++) {
4020     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4021       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4022         if((is32>>(regmap[hr]&63))&1) {
4023           int lr=get_reg(regmap,regmap[hr]-64);
4024           assert(lr>=0);
4025           emit_sarimm(lr,31,hr);
4026         }
4027         else
4028         {
4029           int value=constmap[i][hr];
4030           if(value==0) {
4031             emit_zeroreg(hr);
4032           }
4033           else {
4034             emit_movimm(value,hr);
4035           }
4036         }
4037       }
4038     }
4039   }
4040 }
4041
4042 // Write out all dirty registers (except cycle count)
4043 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4044 {
4045   int hr;
4046   for(hr=0;hr<HOST_REGS;hr++) {
4047     if(hr!=EXCLUDE_REG) {
4048       if(i_regmap[hr]>0) {
4049         if(i_regmap[hr]!=CCREG) {
4050           if((i_dirty>>hr)&1) {
4051             if(i_regmap[hr]<64) {
4052               emit_storereg(i_regmap[hr],hr);
4053 #ifndef FORCE32
4054               if( ((i_is32>>i_regmap[hr])&1) ) {
4055                 #ifdef DESTRUCTIVE_WRITEBACK
4056                 emit_sarimm(hr,31,hr);
4057                 emit_storereg(i_regmap[hr]|64,hr);
4058                 #else
4059                 emit_sarimm(hr,31,HOST_TEMPREG);
4060                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4061                 #endif
4062               }
4063 #endif
4064             }else{
4065               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4066                 emit_storereg(i_regmap[hr],hr);
4067               }
4068             }
4069           }
4070         }
4071       }
4072     }
4073   }
4074 }
4075 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4076 // This writes the registers not written by store_regs_bt
4077 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4078 {
4079   int hr;
4080   int t=(addr-start)>>2;
4081   for(hr=0;hr<HOST_REGS;hr++) {
4082     if(hr!=EXCLUDE_REG) {
4083       if(i_regmap[hr]>0) {
4084         if(i_regmap[hr]!=CCREG) {
4085           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4086             if((i_dirty>>hr)&1) {
4087               if(i_regmap[hr]<64) {
4088                 emit_storereg(i_regmap[hr],hr);
4089 #ifndef FORCE32
4090                 if( ((i_is32>>i_regmap[hr])&1) ) {
4091                   #ifdef DESTRUCTIVE_WRITEBACK
4092                   emit_sarimm(hr,31,hr);
4093                   emit_storereg(i_regmap[hr]|64,hr);
4094                   #else
4095                   emit_sarimm(hr,31,HOST_TEMPREG);
4096                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4097                   #endif
4098                 }
4099 #endif
4100               }else{
4101                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4102                   emit_storereg(i_regmap[hr],hr);
4103                 }
4104               }
4105             }
4106           }
4107         }
4108       }
4109     }
4110   }
4111 }
4112
4113 // Load all registers (except cycle count)
4114 void load_all_regs(signed char i_regmap[])
4115 {
4116   int hr;
4117   for(hr=0;hr<HOST_REGS;hr++) {
4118     if(hr!=EXCLUDE_REG) {
4119       if(i_regmap[hr]==0) {
4120         emit_zeroreg(hr);
4121       }
4122       else
4123       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4124       {
4125         emit_loadreg(i_regmap[hr],hr);
4126       }
4127     }
4128   }
4129 }
4130
4131 // Load all current registers also needed by next instruction
4132 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4133 {
4134   int hr;
4135   for(hr=0;hr<HOST_REGS;hr++) {
4136     if(hr!=EXCLUDE_REG) {
4137       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4138         if(i_regmap[hr]==0) {
4139           emit_zeroreg(hr);
4140         }
4141         else
4142         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4143         {
4144           emit_loadreg(i_regmap[hr],hr);
4145         }
4146       }
4147     }
4148   }
4149 }
4150
4151 // Load all regs, storing cycle count if necessary
4152 void load_regs_entry(int t)
4153 {
4154   int hr;
4155   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4156   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4157   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4158     emit_storereg(CCREG,HOST_CCREG);
4159   }
4160   // Load 32-bit regs
4161   for(hr=0;hr<HOST_REGS;hr++) {
4162     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4163       if(regs[t].regmap_entry[hr]==0) {
4164         emit_zeroreg(hr);
4165       }
4166       else if(regs[t].regmap_entry[hr]!=CCREG)
4167       {
4168         emit_loadreg(regs[t].regmap_entry[hr],hr);
4169       }
4170     }
4171   }
4172   // Load 64-bit regs
4173   for(hr=0;hr<HOST_REGS;hr++) {
4174     if(regs[t].regmap_entry[hr]>=64) {
4175       assert(regs[t].regmap_entry[hr]!=64);
4176       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4177         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4178         if(lr<0) {
4179           emit_loadreg(regs[t].regmap_entry[hr],hr);
4180         }
4181         else
4182         {
4183           emit_sarimm(lr,31,hr);
4184         }
4185       }
4186       else
4187       {
4188         emit_loadreg(regs[t].regmap_entry[hr],hr);
4189       }
4190     }
4191   }
4192 }
4193
4194 // Store dirty registers prior to branch
4195 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4196 {
4197   if(internal_branch(i_is32,addr))
4198   {
4199     int t=(addr-start)>>2;
4200     int hr;
4201     for(hr=0;hr<HOST_REGS;hr++) {
4202       if(hr!=EXCLUDE_REG) {
4203         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4204           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4205             if((i_dirty>>hr)&1) {
4206               if(i_regmap[hr]<64) {
4207                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4208                   emit_storereg(i_regmap[hr],hr);
4209                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4210                     #ifdef DESTRUCTIVE_WRITEBACK
4211                     emit_sarimm(hr,31,hr);
4212                     emit_storereg(i_regmap[hr]|64,hr);
4213                     #else
4214                     emit_sarimm(hr,31,HOST_TEMPREG);
4215                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4216                     #endif
4217                   }
4218                 }
4219               }else{
4220                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4221                   emit_storereg(i_regmap[hr],hr);
4222                 }
4223               }
4224             }
4225           }
4226         }
4227       }
4228     }
4229   }
4230   else
4231   {
4232     // Branch out of this block, write out all dirty regs
4233     wb_dirtys(i_regmap,i_is32,i_dirty);
4234   }
4235 }
4236
4237 // Load all needed registers for branch target
4238 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4239 {
4240   //if(addr>=start && addr<(start+slen*4))
4241   if(internal_branch(i_is32,addr))
4242   {
4243     int t=(addr-start)>>2;
4244     int hr;
4245     // Store the cycle count before loading something else
4246     if(i_regmap[HOST_CCREG]!=CCREG) {
4247       assert(i_regmap[HOST_CCREG]==-1);
4248     }
4249     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4250       emit_storereg(CCREG,HOST_CCREG);
4251     }
4252     // Load 32-bit regs
4253     for(hr=0;hr<HOST_REGS;hr++) {
4254       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4255         #ifdef DESTRUCTIVE_WRITEBACK
4256         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4257         #else
4258         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4259         #endif
4260           if(regs[t].regmap_entry[hr]==0) {
4261             emit_zeroreg(hr);
4262           }
4263           else if(regs[t].regmap_entry[hr]!=CCREG)
4264           {
4265             emit_loadreg(regs[t].regmap_entry[hr],hr);
4266           }
4267         }
4268       }
4269     }
4270     // Load 64-bit regs
4271     for(hr=0;hr<HOST_REGS;hr++) {
4272       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4273         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4274           assert(regs[t].regmap_entry[hr]!=64);
4275           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4276             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4277             if(lr<0) {
4278               emit_loadreg(regs[t].regmap_entry[hr],hr);
4279             }
4280             else
4281             {
4282               emit_sarimm(lr,31,hr);
4283             }
4284           }
4285           else
4286           {
4287             emit_loadreg(regs[t].regmap_entry[hr],hr);
4288           }
4289         }
4290         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4291           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4292           assert(lr>=0);
4293           emit_sarimm(lr,31,hr);
4294         }
4295       }
4296     }
4297   }
4298 }
4299
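// Check whether the current register state (mapping, dirtiness, 32/64-bit status) is
// compatible with the expected state at the branch target, so the branch can jump there directly.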
4300 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4301 {
4302   if(addr>=start && addr<start+slen*4-4)
4303   {
4304     int t=(addr-start)>>2;
4305     int hr;
4306     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4307     for(hr=0;hr<HOST_REGS;hr++)
4308     {
4309       if(hr!=EXCLUDE_REG)
4310       {
4311         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4312         {
4313           if(regs[t].regmap_entry[hr]!=-1)
4314           {
4315             return 0;
4316           }
4317           else 
4318           if((i_dirty>>hr)&1)
4319           {
4320             if(i_regmap[hr]<64)
4321             {
4322               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4323                 return 0;
4324             }
4325             else
4326             {
4327               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4328                 return 0;
4329             }
4330           }
4331         }
4332         else // Same register but is it 32-bit or dirty?
4333         if(i_regmap[hr]>=0)
4334         {
4335           if(!((regs[t].dirty>>hr)&1))
4336           {
4337             if((i_dirty>>hr)&1)
4338             {
4339               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4340               {
4341                 //printf("%x: dirty no match\n",addr);
4342                 return 0;
4343               }
4344             }
4345           }
4346           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4347           {
4348             //printf("%x: is32 no match\n",addr);
4349             return 0;
4350           }
4351         }
4352       }
4353     }
4354     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4355     if(requires_32bit[t]&~i_is32) return 0;
4356     // Delay slots are not valid branch targets
4357     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4358     // Delay slots require additional processing, so do not match
4359     if(is_ds[t]) return 0;
4360   }
4361   else
4362   {
4363     int hr;
4364     for(hr=0;hr<HOST_REGS;hr++)
4365     {
4366       if(hr!=EXCLUDE_REG)
4367       {
4368         if(i_regmap[hr]>=0)
4369         {
4370           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4371           {
4372             if((i_dirty>>hr)&1)
4373             {
4374               return 0;
4375             }
4376           }
4377         }
4378       }
4379     }
4380   }
4381   return 1;
4382 }
4383
4384 // Used when a branch jumps into the delay slot of another branch
4385 void ds_assemble_entry(int i)
4386 {
4387   int t=(ba[i]-start)>>2;
4388   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4389   assem_debug("Assemble delay slot at %x\n",ba[i]);
4390   assem_debug("<->\n");
4391   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4392     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4393   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4394   address_generation(t,&regs[t],regs[t].regmap_entry);
4395   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4396     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4397   cop1_usable=0;
4398   is_delayslot=0;
4399   switch(itype[t]) {
4400     case ALU:
4401       alu_assemble(t,&regs[t]);break;
4402     case IMM16:
4403       imm16_assemble(t,&regs[t]);break;
4404     case SHIFT:
4405       shift_assemble(t,&regs[t]);break;
4406     case SHIFTIMM:
4407       shiftimm_assemble(t,&regs[t]);break;
4408     case LOAD:
4409       load_assemble(t,&regs[t]);break;
4410     case LOADLR:
4411       loadlr_assemble(t,&regs[t]);break;
4412     case STORE:
4413       store_assemble(t,&regs[t]);break;
4414     case STORELR:
4415       storelr_assemble(t,&regs[t]);break;
4416     case COP0:
4417       cop0_assemble(t,&regs[t]);break;
4418     case COP1:
4419       cop1_assemble(t,&regs[t]);break;
4420     case C1LS:
4421       c1ls_assemble(t,&regs[t]);break;
4422     case FCONV:
4423       fconv_assemble(t,&regs[t]);break;
4424     case FLOAT:
4425       float_assemble(t,&regs[t]);break;
4426     case FCOMP:
4427       fcomp_assemble(t,&regs[t]);break;
4428     case MULTDIV:
4429       multdiv_assemble(t,&regs[t]);break;
4430     case MOV:
4431       mov_assemble(t,&regs[t]);break;
4432     case SYSCALL:
4433     case SPAN:
4434     case UJUMP:
4435     case RJUMP:
4436     case CJUMP:
4437     case SJUMP:
4438     case FJUMP:
4439       printf("Jump in the delay slot.  This is probably a bug.\n");
4440   }
4441   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4442   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4443   if(internal_branch(regs[t].is32,ba[i]+4))
4444     assem_debug("branch: internal\n");
4445   else
4446     assem_debug("branch: external\n");
4447   assert(internal_branch(regs[t].is32,ba[i]+4));
4448   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4449   emit_jmp(0);
4450 }
4451
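// Emit the cycle count test before a branch and register a CC_STUB; also detects idle
// loops and computes the cycle adjustment (*adj) for internal branch targets.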
4452 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4453 {
4454   int count;
4455   int jaddr;
4456   int idle=0;
4457   if(itype[i]==RJUMP)
4458   {
4459     *adj=0;
4460   }
4461   //if(ba[i]>=start && ba[i]<(start+slen*4))
4462   if(internal_branch(branch_regs[i].is32,ba[i]))
4463   {
4464     int t=(ba[i]-start)>>2;
4465     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4466     else *adj=ccadj[t];
4467   }
4468   else
4469   {
4470     *adj=0;
4471   }
4472   count=ccadj[i];
4473   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4474     // Idle loop
4475     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4476     idle=(int)out;
4477     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4478     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4479     jaddr=(int)out;
4480     emit_jmp(0);
4481   }
4482   else if(*adj==0||invert) {
4483     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4484     jaddr=(int)out;
4485     emit_jns(0);
4486   }
4487   else
4488   {
4489     emit_cmpimm(HOST_CCREG,-2*(count+2));
4490     jaddr=(int)out;
4491     emit_jns(0);
4492   }
4493   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4494 }
4495
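// Generate the out-of-line code for a CC_STUB: write back dirty registers, save the return
// PC (which may depend on the branch outcome), call cc_interrupt, then reload registers and return.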
4496 void do_ccstub(int n)
4497 {
4498   literal_pool(256);
4499   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4500   set_jump_target(stubs[n][1],(int)out);
4501   int i=stubs[n][4];
4502   if(stubs[n][6]==NULLDS) {
4503     // Delay slot instruction is nullified ("likely" branch)
4504     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4505   }
4506   else if(stubs[n][6]!=TAKEN) {
4507     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4508   }
4509   else {
4510     if(internal_branch(branch_regs[i].is32,ba[i]))
4511       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4512   }
4513   if(stubs[n][5]!=-1)
4514   {
4515     // Save PC as return address
4516     emit_movimm(stubs[n][5],EAX);
4517     emit_writeword(EAX,(int)&pcaddr);
4518   }
4519   else
4520   {
4521     // Return address depends on which way the branch goes
4522     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4523     {
4524       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4525       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4526       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4527       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4528       if(rs1[i]==0)
4529       {
4530         s1l=s2l;s1h=s2h;
4531         s2l=s2h=-1;
4532       }
4533       else if(rs2[i]==0)
4534       {
4535         s2l=s2h=-1;
4536       }
4537       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4538         s1h=s2h=-1;
4539       }
4540       assert(s1l>=0);
4541       #ifdef DESTRUCTIVE_WRITEBACK
4542       if(rs1[i]) {
4543         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4544           emit_loadreg(rs1[i],s1l);
4545       } 
4546       else {
4547         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4548           emit_loadreg(rs2[i],s1l);
4549       }
4550       if(s2l>=0)
4551         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4552           emit_loadreg(rs2[i],s2l);
4553       #endif
4554       int hr=0;
4555       int addr,alt,ntaddr;
4556       while(hr<HOST_REGS)
4557       {
4558         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4559            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4560            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4561         {
4562           addr=hr++;break;
4563         }
4564         hr++;
4565       }
4566       while(hr<HOST_REGS)
4567       {
4568         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4569            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4570            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4571         {
4572           alt=hr++;break;
4573         }
4574         hr++;
4575       }
4576       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4577       {
4578         while(hr<HOST_REGS)
4579         {
4580           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4581              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4582              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4583           {
4584             ntaddr=hr;break;
4585           }
4586           hr++;
4587         }
4588         assert(hr<HOST_REGS);
4589       }
4590       if((opcode[i]&0x2f)==4) // BEQ
4591       {
4592         #ifdef HAVE_CMOV_IMM
4593         if(s1h<0) {
4594           if(s2l>=0) emit_cmp(s1l,s2l);
4595           else emit_test(s1l,s1l);
4596           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4597         }
4598         else
4599         #endif
4600         {
4601           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4602           if(s1h>=0) {
4603             if(s2h>=0) emit_cmp(s1h,s2h);
4604             else emit_test(s1h,s1h);
4605             emit_cmovne_reg(alt,addr);
4606           }
4607           if(s2l>=0) emit_cmp(s1l,s2l);
4608           else emit_test(s1l,s1l);
4609           emit_cmovne_reg(alt,addr);
4610         }
4611       }
4612       if((opcode[i]&0x2f)==5) // BNE
4613       {
4614         #ifdef HAVE_CMOV_IMM
4615         if(s1h<0) {
4616           if(s2l>=0) emit_cmp(s1l,s2l);
4617           else emit_test(s1l,s1l);
4618           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4619         }
4620         else
4621         #endif
4622         {
4623           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4624           if(s1h>=0) {
4625             if(s2h>=0) emit_cmp(s1h,s2h);
4626             else emit_test(s1h,s1h);
4627             emit_cmovne_reg(alt,addr);
4628           }
4629           if(s2l>=0) emit_cmp(s1l,s2l);
4630           else emit_test(s1l,s1l);
4631           emit_cmovne_reg(alt,addr);
4632         }
4633       }
4634       if((opcode[i]&0x2f)==6) // BLEZ
4635       {
4636         //emit_movimm(ba[i],alt);
4637         //emit_movimm(start+i*4+8,addr);
4638         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4639         emit_cmpimm(s1l,1);
4640         if(s1h>=0) emit_mov(addr,ntaddr);
4641         emit_cmovl_reg(alt,addr);
4642         if(s1h>=0) {
4643           emit_test(s1h,s1h);
4644           emit_cmovne_reg(ntaddr,addr);
4645           emit_cmovs_reg(alt,addr);
4646         }
4647       }
4648       if((opcode[i]&0x2f)==7) // BGTZ
4649       {
4650         //emit_movimm(ba[i],addr);
4651         //emit_movimm(start+i*4+8,ntaddr);
4652         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4653         emit_cmpimm(s1l,1);
4654         if(s1h>=0) emit_mov(addr,alt);
4655         emit_cmovl_reg(ntaddr,addr);
4656         if(s1h>=0) {
4657           emit_test(s1h,s1h);
4658           emit_cmovne_reg(alt,addr);
4659           emit_cmovs_reg(ntaddr,addr);
4660         }
4661       }
4662       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4663       {
4664         //emit_movimm(ba[i],alt);
4665         //emit_movimm(start+i*4+8,addr);
4666         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4667         if(s1h>=0) emit_test(s1h,s1h);
4668         else emit_test(s1l,s1l);
4669         emit_cmovs_reg(alt,addr);
4670       }
4671       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4672       {
4673         //emit_movimm(ba[i],addr);
4674         //emit_movimm(start+i*4+8,alt);
4675         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4676         if(s1h>=0) emit_test(s1h,s1h);
4677         else emit_test(s1l,s1l);
4678         emit_cmovs_reg(alt,addr);
4679       }
4680       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4681         if(source[i]&0x10000) // BC1T
4682         {
4683           //emit_movimm(ba[i],alt);
4684           //emit_movimm(start+i*4+8,addr);
4685           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4686           emit_testimm(s1l,0x800000);
4687           emit_cmovne_reg(alt,addr);
4688         }
4689         else // BC1F
4690         {
4691           //emit_movimm(ba[i],addr);
4692           //emit_movimm(start+i*4+8,alt);
4693           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4694           emit_testimm(s1l,0x800000);
4695           emit_cmovne_reg(alt,addr);
4696         }
4697       }
4698       emit_writeword(addr,(int)&pcaddr);
4699     }
4700     else
4701     if(itype[i]==RJUMP)
4702     {
4703       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4704       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4705         r=get_reg(branch_regs[i].regmap,RTEMP);
4706       }
4707       emit_writeword(r,(int)&pcaddr);
4708     }
4709     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4710   }
4711   // Update cycle count
4712   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4713   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4714   emit_call((int)cc_interrupt);
4715   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4716   if(stubs[n][6]==TAKEN) {
4717     if(internal_branch(branch_regs[i].is32,ba[i]))
4718       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4719     else if(itype[i]==RJUMP) {
4720       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4721         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4722       else
4723         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4724     }
4725   }else if(stubs[n][6]==NOTTAKEN) {
4726     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4727     else load_all_regs(branch_regs[i].regmap);
4728   }else if(stubs[n][6]==NULLDS) {
4729     // Delay slot instruction is nullified ("likely" branch)
4730     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4731     else load_all_regs(regs[i].regmap);
4732   }else{
4733     load_all_regs(branch_regs[i].regmap);
4734   }
4735   emit_jmp(stubs[n][2]); // return address
4736   
4737   /* This works but uses a lot of memory...
4738   emit_readword((int)&last_count,ECX);
4739   emit_add(HOST_CCREG,ECX,EAX);
4740   emit_writeword(EAX,(int)&Count);
4741   emit_call((int)gen_interupt);
4742   emit_readword((int)&Count,HOST_CCREG);
4743   emit_readword((int)&next_interupt,EAX);
4744   emit_readword((int)&pending_exception,EBX);
4745   emit_writeword(EAX,(int)&last_count);
4746   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4747   emit_test(EBX,EBX);
4748   int jne_instr=(int)out;
4749   emit_jne(0);
4750   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4751   load_all_regs(branch_regs[i].regmap);
4752   emit_jmp(stubs[n][2]); // return address
4753   set_jump_target(jne_instr,(int)out);
4754   emit_readword((int)&pcaddr,EAX);
4755   // Call get_addr_ht instead of doing the hash table here.
4756   // This code is executed infrequently and takes up a lot of space
4757   // so smaller is better.
4758   emit_storereg(CCREG,HOST_CCREG);
4759   emit_pushreg(EAX);
4760   emit_call((int)get_addr_ht);
4761   emit_loadreg(CCREG,HOST_CCREG);
4762   emit_addimm(ESP,4,ESP);
4763   emit_jmpreg(EAX);*/
4764 }
4765
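// Record a branch location and its target so it can be patched later, once the target address is known.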
4766 int add_to_linker(int addr,int target,int ext)
4767 {
4768   link_addr[linkcount][0]=addr;
4769   link_addr[linkcount][1]=target;
4770   link_addr[linkcount][2]=ext;  
4771   linkcount++;
4772 }
4773
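// Assemble an unconditional jump (J/JAL): assemble the delay slot first, set the link
// register for JAL, write back/reload registers, then emit the branch (or assemble the
// target's delay slot inline when branching into one).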
4774 void ujump_assemble(int i,struct regstat *i_regs)
4775 {
4776   signed char *i_regmap=i_regs->regmap;
4777   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4778   address_generation(i+1,i_regs,regs[i].regmap_entry);
4779   #ifdef REG_PREFETCH
4780   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4781   if(rt1[i]==31&&temp>=0) 
4782   {
4783     int return_address=start+i*4+8;
4784     if(get_reg(branch_regs[i].regmap,31)>0) 
4785     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4786   }
4787   #endif
4788   ds_assemble(i+1,i_regs);
4789   uint64_t bc_unneeded=branch_regs[i].u;
4790   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4791   bc_unneeded|=1|(1LL<<rt1[i]);
4792   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4793   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4794                 bc_unneeded,bc_unneeded_upper);
4795   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4796   if(rt1[i]==31) {
4797     int rt;
4798     unsigned int return_address;
4799     assert(rt1[i+1]!=31);
4800     assert(rt2[i+1]!=31);
4801     rt=get_reg(branch_regs[i].regmap,31);
4802     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4803     //assert(rt>=0);
4804     return_address=start+i*4+8;
4805     if(rt>=0) {
4806       #ifdef USE_MINI_HT
4807       if(internal_branch(branch_regs[i].is32,return_address)) {
4808         int temp=rt+1;
4809         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4810            branch_regs[i].regmap[temp]>=0)
4811         {
4812           temp=get_reg(branch_regs[i].regmap,-1);
4813         }
4814         #ifdef HOST_TEMPREG
4815         if(temp<0) temp=HOST_TEMPREG;
4816         #endif
4817         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4818         else emit_movimm(return_address,rt);
4819       }
4820       else
4821       #endif
4822       {
4823         #ifdef REG_PREFETCH
4824         if(temp>=0) 
4825         {
4826           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4827         }
4828         #endif
4829         emit_movimm(return_address,rt); // PC into link register
4830         #ifdef IMM_PREFETCH
4831         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4832         #endif
4833       }
4834     }
4835   }
4836   int cc,adj;
4837   cc=get_reg(branch_regs[i].regmap,CCREG);
4838   assert(cc==HOST_CCREG);
4839   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4840   #ifdef REG_PREFETCH
4841   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4842   #endif
4843   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4844   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4845   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4846   if(internal_branch(branch_regs[i].is32,ba[i]))
4847     assem_debug("branch: internal\n");
4848   else
4849     assem_debug("branch: external\n");
4850   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4851     ds_assemble_entry(i);
4852   }
4853   else {
4854     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4855     emit_jmp(0);
4856   }
4857 }
4858
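// Assemble a register jump (JR/JALR): copy the jump register if the delay slot clobbers it,
// assemble the delay slot, set the link register for JALR, then dispatch through jump_vaddr.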
4859 void rjump_assemble(int i,struct regstat *i_regs)
4860 {
4861   signed char *i_regmap=i_regs->regmap;
4862   int temp;
4863   int rs,cc,adj;
4864   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4865   assert(rs>=0);
4866   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4867     // Delay slot abuse, make a copy of the branch address register
4868     temp=get_reg(branch_regs[i].regmap,RTEMP);
4869     assert(temp>=0);
4870     assert(regs[i].regmap[temp]==RTEMP);
4871     emit_mov(rs,temp);
4872     rs=temp;
4873   }
4874   address_generation(i+1,i_regs,regs[i].regmap_entry);
4875   #ifdef REG_PREFETCH
4876   if(rt1[i]==31) 
4877   {
4878     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4879       int return_address=start+i*4+8;
4880       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4881     }
4882   }
4883   #endif
4884   #ifdef USE_MINI_HT
4885   if(rs1[i]==31) {
4886     int rh=get_reg(regs[i].regmap,RHASH);
4887     if(rh>=0) do_preload_rhash(rh);
4888   }
4889   #endif
4890   ds_assemble(i+1,i_regs);
4891   uint64_t bc_unneeded=branch_regs[i].u;
4892   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4893   bc_unneeded|=1|(1LL<<rt1[i]);
4894   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4895   bc_unneeded&=~(1LL<<rs1[i]);
4896   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4897                 bc_unneeded,bc_unneeded_upper);
4898   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4899   if(rt1[i]==31) {
4900     int rt,return_address;
4901     assert(rt1[i+1]!=31);
4902     assert(rt2[i+1]!=31);
4903     rt=get_reg(branch_regs[i].regmap,31);
4904     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4905     assert(rt>=0);
4906     return_address=start+i*4+8;
4907     #ifdef REG_PREFETCH
4908     if(temp>=0) 
4909     {
4910       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4911     }
4912     #endif
4913     emit_movimm(return_address,rt); // PC into link register
4914     #ifdef IMM_PREFETCH
4915     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4916     #endif
4917   }
4918   cc=get_reg(branch_regs[i].regmap,CCREG);
4919   assert(cc==HOST_CCREG);
4920   #ifdef USE_MINI_HT
4921   int rh=get_reg(branch_regs[i].regmap,RHASH);
4922   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4923   if(rs1[i]==31) {
4924     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4925     do_preload_rhtbl(ht);
4926     do_rhash(rs,rh);
4927   }
4928   #endif
4929   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4930   #ifdef DESTRUCTIVE_WRITEBACK
4931   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4932     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4933       emit_loadreg(rs1[i],rs);
4934     }
4935   }
4936   #endif
4937   #ifdef REG_PREFETCH
4938   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4939   #endif
4940   #ifdef USE_MINI_HT
4941   if(rs1[i]==31) {
4942     do_miniht_load(ht,rh);
4943   }
4944   #endif
4945   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4946   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4947   //assert(adj==0);
4948   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
4949   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4950   emit_jns(0);
4951   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4952   #ifdef USE_MINI_HT
4953   if(rs1[i]==31) {
4954     do_miniht_jump(rs,rh,ht);
4955   }
4956   else
4957   #endif
4958   {
4959     //if(rs!=EAX) emit_mov(rs,EAX);
4960     //emit_jmp((int)jump_vaddr_eax);
4961     emit_jmp(jump_vaddr_reg[rs]);
4962   }
4963   /* Check hash table
4964   temp=!rs;
4965   emit_mov(rs,temp);
4966   emit_shrimm(rs,16,rs);
4967   emit_xor(temp,rs,rs);
4968   emit_movzwl_reg(rs,rs);
4969   emit_shlimm(rs,4,rs);
4970   emit_cmpmem_indexed((int)hash_table,rs,temp);
4971   emit_jne((int)out+14);
4972   emit_readword_indexed((int)hash_table+4,rs,rs);
4973   emit_jmpreg(rs);
4974   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4975   emit_addimm_no_flags(8,rs);
4976   emit_jeq((int)out-17);
4977   // No hit on hash table, call compiler
4978   emit_pushreg(temp);
4979 //DEBUG >
4980 #ifdef DEBUG_CYCLE_COUNT
4981   emit_readword((int)&last_count,ECX);
4982   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4983   emit_readword((int)&next_interupt,ECX);
4984   emit_writeword(HOST_CCREG,(int)&Count);
4985   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4986   emit_writeword(ECX,(int)&last_count);
4987 #endif
4988 //DEBUG <
4989   emit_storereg(CCREG,HOST_CCREG);
4990   emit_call((int)get_addr);
4991   emit_loadreg(CCREG,HOST_CCREG);
4992   emit_addimm(ESP,4,ESP);
4993   emit_jmpreg(EAX);*/
4994   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4995   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4996   #endif
4997 }
4998
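     // Assemble a conditional branch (BEQ/BNE/BLEZ/BGTZ and the "likely"
     // variants).  The delay slot is normally assembled first (out of order);
     // if it writes a register the compare reads, or the branch is likely,
     // fall back to in-order: test the condition, then do the delay slot.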
4999 void cjump_assemble(int i,struct regstat *i_regs)
5000 {
5001   signed char *i_regmap=i_regs->regmap;
5002   int cc;
5003   int match;
5004   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5005   assem_debug("match=%d\n",match);
5006   int s1h,s1l,s2h,s2l;
5007   int prev_cop1_usable=cop1_usable;
5008   int unconditional=0,nop=0;
5009   int only32=0;
5010   int ooo=1;
5011   int invert=0;
5012   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5013   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5014   if(likely[i]) ooo=0;
5015   if(!match) invert=1;
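       // No match means the register mapping at the branch target does not
       // agree with the state after the delay slot, so invert the test: the
       // taken path then falls through to writeback/reload code before
       // jumping to the target.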
5016   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5017   if(i>(ba[i]-start)>>2) invert=1;
5018   #endif
5019     
5020   if(ooo)
5021     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5022        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5023   {
5024     // Write-after-read dependency prevents out of order execution
5025     // First test branch condition, then execute delay slot, then branch
5026     ooo=0;
5027   }
5028
5029   if(ooo) {
5030     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5031     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5032     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5033     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5034   }
5035   else {
5036     s1l=get_reg(i_regmap,rs1[i]);
5037     s1h=get_reg(i_regmap,rs1[i]|64);
5038     s2l=get_reg(i_regmap,rs2[i]);
5039     s2h=get_reg(i_regmap,rs2[i]|64);
5040   }
5041   if(rs1[i]==0&&rs2[i]==0)
5042   {
5043     if(opcode[i]&1) nop=1;
5044     else unconditional=1;
5045     //assert(opcode[i]!=5);
5046     //assert(opcode[i]!=7);
5047     //assert(opcode[i]!=0x15);
5048     //assert(opcode[i]!=0x17);
5049   }
5050   else if(rs1[i]==0)
5051   {
5052     s1l=s2l;s1h=s2h;
5053     s2l=s2h=-1;
5054     only32=(regs[i].was32>>rs2[i])&1;
5055   }
5056   else if(rs2[i]==0)
5057   {
5058     s2l=s2h=-1;
5059     only32=(regs[i].was32>>rs1[i])&1;
5060   }
5061   else {
5062     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5063   }
5064
5065   if(ooo) {
5066     // Out of order execution (delay slot first)
5067     //printf("OOOE\n");
5068     address_generation(i+1,i_regs,regs[i].regmap_entry);
5069     ds_assemble(i+1,i_regs);
5070     int adj;
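         // Registers the branch no longer needs: start from the post-branch
         // unneeded set, keep the compare sources (and their upper halves)
         // live, and mark r0 (bit 0) as always unneeded.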
5071     uint64_t bc_unneeded=branch_regs[i].u;
5072     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5073     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5074     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5075     bc_unneeded|=1;
5076     bc_unneeded_upper|=1;
5077     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5078                   bc_unneeded,bc_unneeded_upper);
5079     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5080     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5081     cc=get_reg(branch_regs[i].regmap,CCREG);
5082     assert(cc==HOST_CCREG);
5083     if(unconditional) 
5084       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5085     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5086     //assem_debug("cycle count (adj)\n");
5087     if(unconditional) {
5088       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5089       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5090         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5091         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5092         if(internal)
5093           assem_debug("branch: internal\n");
5094         else
5095           assem_debug("branch: external\n");
5096         if(internal&&is_ds[(ba[i]-start)>>2]) {
5097           ds_assemble_entry(i);
5098         }
5099         else {
5100           add_to_linker((int)out,ba[i],internal);
5101           emit_jmp(0);
5102         }
5103         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5104         if(((u_int)out)&7) emit_addnop(0);
5105         #endif
5106       }
5107     }
5108     else if(nop) {
5109       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5110       int jaddr=(int)out;
5111       emit_jns(0);
5112       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5113     }
5114     else {
5115       int taken=0,nottaken=0,nottaken1=0;
5116       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5117       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5118       if(!only32)
5119       {
5120         assert(s1h>=0);
5121         if(opcode[i]==4) // BEQ
5122         {
5123           if(s2h>=0) emit_cmp(s1h,s2h);
5124           else emit_test(s1h,s1h);
5125           nottaken1=(int)out;
5126           emit_jne(1);
5127         }
5128         if(opcode[i]==5) // BNE
5129         {
5130           if(s2h>=0) emit_cmp(s1h,s2h);
5131           else emit_test(s1h,s1h);
5132           if(invert) taken=(int)out;
5133           else add_to_linker((int)out,ba[i],internal);
5134           emit_jne(0);
5135         }
5136         if(opcode[i]==6) // BLEZ
5137         {
5138           emit_test(s1h,s1h);
5139           if(invert) taken=(int)out;
5140           else add_to_linker((int)out,ba[i],internal);
5141           emit_js(0);
5142           nottaken1=(int)out;
5143           emit_jne(1);
5144         }
5145         if(opcode[i]==7) // BGTZ
5146         {
5147           emit_test(s1h,s1h);
5148           nottaken1=(int)out;
5149           emit_js(1);
5150           if(invert) taken=(int)out;
5151           else add_to_linker((int)out,ba[i],internal);
5152           emit_jne(0);
5153         }
5154       } // if(!only32)
5155           
5156       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5157       assert(s1l>=0);
5158       if(opcode[i]==4) // BEQ
5159       {
5160         if(s2l>=0) emit_cmp(s1l,s2l);
5161         else emit_test(s1l,s1l);
5162         if(invert){
5163           nottaken=(int)out;
5164           emit_jne(1);
5165         }else{
5166           add_to_linker((int)out,ba[i],internal);
5167           emit_jeq(0);
5168         }
5169       }
5170       if(opcode[i]==5) // BNE
5171       {
5172         if(s2l>=0) emit_cmp(s1l,s2l);
5173         else emit_test(s1l,s1l);
5174         if(invert){
5175           nottaken=(int)out;
5176           emit_jeq(1);
5177         }else{
5178           add_to_linker((int)out,ba[i],internal);
5179           emit_jne(0);
5180         }
5181       }
5182       if(opcode[i]==6) // BLEZ
5183       {
5184         emit_cmpimm(s1l,1);
5185         if(invert){
5186           nottaken=(int)out;
5187           emit_jge(1);
5188         }else{
5189           add_to_linker((int)out,ba[i],internal);
5190           emit_jl(0);
5191         }
5192       }
5193       if(opcode[i]==7) // BGTZ
5194       {
5195         emit_cmpimm(s1l,1);
5196         if(invert){
5197           nottaken=(int)out;
5198           emit_jl(1);
5199         }else{
5200           add_to_linker((int)out,ba[i],internal);
5201           emit_jge(0);
5202         }
5203       }
5204       if(invert) {
5205         if(taken) set_jump_target(taken,(int)out);
5206         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5207         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5208           if(adj) {
5209             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5210             add_to_linker((int)out,ba[i],internal);
5211           }else{
5212             emit_addnop(13);
5213             add_to_linker((int)out,ba[i],internal*2);
5214           }
5215           emit_jmp(0);
5216         }else
5217         #endif
5218         {
5219           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5220           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5221           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5222           if(internal)
5223             assem_debug("branch: internal\n");
5224           else
5225             assem_debug("branch: external\n");
5226           if(internal&&is_ds[(ba[i]-start)>>2]) {
5227             ds_assemble_entry(i);
5228           }
5229           else {
5230             add_to_linker((int)out,ba[i],internal);
5231             emit_jmp(0);
5232           }
5233         }
5234         set_jump_target(nottaken,(int)out);
5235       }
5236
5237       if(nottaken1) set_jump_target(nottaken1,(int)out);
5238       if(adj) {
5239         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5240       }
5241     } // (!unconditional)
5242   } // if(ooo)
5243   else
5244   {
5245     // In-order execution (branch first)
5246     //if(likely[i]) printf("IOL\n");
5247     //else
5248     //printf("IOE\n");
5249     int taken=0,nottaken=0,nottaken1=0;
5250     if(!unconditional&&!nop) {
5251       if(!only32)
5252       {
5253         assert(s1h>=0);
5254         if((opcode[i]&0x2f)==4) // BEQ
5255         {
5256           if(s2h>=0) emit_cmp(s1h,s2h);
5257           else emit_test(s1h,s1h);
5258           nottaken1=(int)out;
5259           emit_jne(2);
5260         }
5261         if((opcode[i]&0x2f)==5) // BNE
5262         {
5263           if(s2h>=0) emit_cmp(s1h,s2h);
5264           else emit_test(s1h,s1h);
5265           taken=(int)out;
5266           emit_jne(1);
5267         }
5268         if((opcode[i]&0x2f)==6) // BLEZ
5269         {
5270           emit_test(s1h,s1h);
5271           taken=(int)out;
5272           emit_js(1);
5273           nottaken1=(int)out;
5274           emit_jne(2);
5275         }
5276         if((opcode[i]&0x2f)==7) // BGTZ
5277         {
5278           emit_test(s1h,s1h);
5279           nottaken1=(int)out;
5280           emit_js(2);
5281           taken=(int)out;
5282           emit_jne(1);
5283         }
5284       } // if(!only32)
5285           
5286       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5287       assert(s1l>=0);
5288       if((opcode[i]&0x2f)==4) // BEQ
5289       {
5290         if(s2l>=0) emit_cmp(s1l,s2l);
5291         else emit_test(s1l,s1l);
5292         nottaken=(int)out;
5293         emit_jne(2);
5294       }
5295       if((opcode[i]&0x2f)==5) // BNE
5296       {
5297         if(s2l>=0) emit_cmp(s1l,s2l);
5298         else emit_test(s1l,s1l);
5299         nottaken=(int)out;
5300         emit_jeq(2);
5301       }
5302       if((opcode[i]&0x2f)==6) // BLEZ
5303       {
5304         emit_cmpimm(s1l,1);
5305         nottaken=(int)out;
5306         emit_jge(2);
5307       }
5308       if((opcode[i]&0x2f)==7) // BGTZ
5309       {
5310         emit_cmpimm(s1l,1);
5311         nottaken=(int)out;
5312         emit_jl(2);
5313       }
5314     } // if(!unconditional)
5315     int adj;
5316     uint64_t ds_unneeded=branch_regs[i].u;
5317     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5318     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5319     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5320     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5321     ds_unneeded|=1;
5322     ds_unneeded_upper|=1;
5323     // branch taken
5324     if(!nop) {
5325       if(taken) set_jump_target(taken,(int)out);
5326       assem_debug("1:\n");
5327       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5328                     ds_unneeded,ds_unneeded_upper);
5329       // load regs
5330       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5331       address_generation(i+1,&branch_regs[i],0);
5332       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5333       ds_assemble(i+1,&branch_regs[i]);
5334       cc=get_reg(branch_regs[i].regmap,CCREG);
5335       if(cc==-1) {
5336         emit_loadreg(CCREG,cc=HOST_CCREG);
5337         // CHECK: Is the following instruction (fall thru) allocated ok?
5338       }
5339       assert(cc==HOST_CCREG);
5340       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5341       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5342       assem_debug("cycle count (adj)\n");
5343       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5344       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5345       if(internal)
5346         assem_debug("branch: internal\n");
5347       else
5348         assem_debug("branch: external\n");
5349       if(internal&&is_ds[(ba[i]-start)>>2]) {
5350         ds_assemble_entry(i);
5351       }
5352       else {
5353         add_to_linker((int)out,ba[i],internal);
5354         emit_jmp(0);
5355       }
5356     }
5357     // branch not taken
5358     cop1_usable=prev_cop1_usable;
5359     if(!unconditional) {
5360       if(nottaken1) set_jump_target(nottaken1,(int)out);
5361       set_jump_target(nottaken,(int)out);
5362       assem_debug("2:\n");
5363       if(!likely[i]) {
5364         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5365                       ds_unneeded,ds_unneeded_upper);
5366         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5367         address_generation(i+1,&branch_regs[i],0);
5368         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5369         ds_assemble(i+1,&branch_regs[i]);
5370       }
5371       cc=get_reg(branch_regs[i].regmap,CCREG);
5372       if(cc==-1&&!likely[i]) {
5373         // Cycle count isn't in a register, temporarily load it then write it out
5374         emit_loadreg(CCREG,HOST_CCREG);
5375         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5376         int jaddr=(int)out;
5377         emit_jns(0);
5378         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5379         emit_storereg(CCREG,HOST_CCREG);
5380       }
5381       else{
5382         cc=get_reg(i_regmap,CCREG);
5383         assert(cc==HOST_CCREG);
5384         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5385         int jaddr=(int)out;
5386         emit_jns(0);
5387         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5388       }
5389     }
5390   }
5391 }
5392
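     // Assemble a REGIMM branch (BLTZ/BGEZ and the likely/link variants).
     // The sign test uses the upper half of the register for 64-bit values;
     // the link forms write the return address (PC+8) into r31.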
5393 void sjump_assemble(int i,struct regstat *i_regs)
5394 {
5395   signed char *i_regmap=i_regs->regmap;
5396   int cc;
5397   int match;
5398   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5399   assem_debug("smatch=%d\n",match);
5400   int s1h,s1l;
5401   int prev_cop1_usable=cop1_usable;
5402   int unconditional=0,nevertaken=0;
5403   int only32=0;
5404   int ooo=1;
5405   int invert=0;
5406   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5407   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5408   if(likely[i]) ooo=0;
5409   if(!match) invert=1;
5410   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5411   if(i>(ba[i]-start)>>2) invert=1;
5412   #endif
5413
5414   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5415   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5416
5417   if(ooo)
5418     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5419   {
5420     // Write-after-read dependency prevents out of order execution
5421     // First test branch condition, then execute delay slot, then branch
5422     ooo=0;
5423   }
5424   // TODO: Conditional branches w/link must execute in-order so that
5425   // condition test and write to r31 occur before cycle count test
5426
5427   if(ooo) {
5428     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5429     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5430   }
5431   else {
5432     s1l=get_reg(i_regmap,rs1[i]);
5433     s1h=get_reg(i_regmap,rs1[i]|64);
5434   }
5435   if(rs1[i]==0)
5436   {
5437     if(opcode2[i]&1) unconditional=1;
5438     else nevertaken=1;
5439     // These are never taken (r0 is never less than zero)
5440     //assert(opcode2[i]!=0);
5441     //assert(opcode2[i]!=2);
5442     //assert(opcode2[i]!=0x10);
5443     //assert(opcode2[i]!=0x12);
5444   }
5445   else {
5446     only32=(regs[i].was32>>rs1[i])&1;
5447   }
5448
5449   if(ooo) {
5450     // Out of order execution (delay slot first)
5451     //printf("OOOE\n");
5452     address_generation(i+1,i_regs,regs[i].regmap_entry);
5453     ds_assemble(i+1,i_regs);
5454     int adj;
5455     uint64_t bc_unneeded=branch_regs[i].u;
5456     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5457     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5458     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5459     bc_unneeded|=1;
5460     bc_unneeded_upper|=1;
5461     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5462                   bc_unneeded,bc_unneeded_upper);
5463     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5464     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5465     if(rt1[i]==31) {
5466       int rt,return_address;
5467       assert(rt1[i+1]!=31);
5468       assert(rt2[i+1]!=31);
5469       rt=get_reg(branch_regs[i].regmap,31);
5470       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5471       if(rt>=0) {
5472         // Save the PC even if the branch is not taken
5473         return_address=start+i*4+8;
5474         emit_movimm(return_address,rt); // PC into link register
5475         #ifdef IMM_PREFETCH
5476         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5477         #endif
5478       }
5479     }
5480     cc=get_reg(branch_regs[i].regmap,CCREG);
5481     assert(cc==HOST_CCREG);
5482     if(unconditional) 
5483       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5484     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5485     assem_debug("cycle count (adj)\n");
5486     if(unconditional) {
5487       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5488       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5489         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5490         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5491         if(internal)
5492           assem_debug("branch: internal\n");
5493         else
5494           assem_debug("branch: external\n");
5495         if(internal&&is_ds[(ba[i]-start)>>2]) {
5496           ds_assemble_entry(i);
5497         }
5498         else {
5499           add_to_linker((int)out,ba[i],internal);
5500           emit_jmp(0);
5501         }
5502         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5503         if(((u_int)out)&7) emit_addnop(0);
5504         #endif
5505       }
5506     }
5507     else if(nevertaken) {
5508       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5509       int jaddr=(int)out;
5510       emit_jns(0);
5511       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5512     }
5513     else {
5514       int nottaken=0;
5515       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5516       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5517       if(!only32)
5518       {
5519         assert(s1h>=0);
5520         if(opcode2[i]==0) // BLTZ
5521         {
5522           emit_test(s1h,s1h);
5523           if(invert){
5524             nottaken=(int)out;
5525             emit_jns(1);
5526           }else{
5527             add_to_linker((int)out,ba[i],internal);
5528             emit_js(0);
5529           }
5530         }
5531         if(opcode2[i]==1) // BGEZ
5532         {
5533           emit_test(s1h,s1h);
5534           if(invert){
5535             nottaken=(int)out;
5536             emit_js(1);
5537           }else{
5538             add_to_linker((int)out,ba[i],internal);
5539             emit_jns(0);
5540           }
5541         }
5542       } // if(!only32)
5543       else
5544       {
5545         assert(s1l>=0);
5546         if(opcode2[i]==0) // BLTZ
5547         {
5548           emit_test(s1l,s1l);
5549           if(invert){
5550             nottaken=(int)out;
5551             emit_jns(1);
5552           }else{
5553             add_to_linker((int)out,ba[i],internal);
5554             emit_js(0);
5555           }
5556         }
5557         if(opcode2[i]==1) // BGEZ
5558         {
5559           emit_test(s1l,s1l);
5560           if(invert){
5561             nottaken=(int)out;
5562             emit_js(1);
5563           }else{
5564             add_to_linker((int)out,ba[i],internal);
5565             emit_jns(0);
5566           }
5567         }
5568       } // else (only32)
5569           
5570       if(invert) {
5571         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5572         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5573           if(adj) {
5574             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5575             add_to_linker((int)out,ba[i],internal);
5576           }else{
5577             emit_addnop(13);
5578             add_to_linker((int)out,ba[i],internal*2);
5579           }
5580           emit_jmp(0);
5581         }else
5582         #endif
5583         {
5584           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5585           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5586           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5587           if(internal)
5588             assem_debug("branch: internal\n");
5589           else
5590             assem_debug("branch: external\n");
5591           if(internal&&is_ds[(ba[i]-start)>>2]) {
5592             ds_assemble_entry(i);
5593           }
5594           else {
5595             add_to_linker((int)out,ba[i],internal);
5596             emit_jmp(0);
5597           }
5598         }
5599         set_jump_target(nottaken,(int)out);
5600       }
5601
5602       if(adj) {
5603         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5604       }
5605     } // (!unconditional)
5606   } // if(ooo)
5607   else
5608   {
5609     // In-order execution (branch first)
5610     //printf("IOE\n");
5611     int nottaken=0;
5612     if(!unconditional) {
5613       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5614       if(!only32)
5615       {
5616         assert(s1h>=0);
5617         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5618         {
5619           emit_test(s1h,s1h);
5620           nottaken=(int)out;
5621           emit_jns(1);
5622         }
5623         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5624         {
5625           emit_test(s1h,s1h);
5626           nottaken=(int)out;
5627           emit_js(1);
5628         }
5629       } // if(!only32)
5630       else
5631       {
5632         assert(s1l>=0);
5633         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5634         {
5635           emit_test(s1l,s1l);
5636           nottaken=(int)out;
5637           emit_jns(1);
5638         }
5639         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5640         {
5641           emit_test(s1l,s1l);
5642           nottaken=(int)out;
5643           emit_js(1);
5644         }
5645       }
5646     } // if(!unconditional)
5647     int adj;
5648     uint64_t ds_unneeded=branch_regs[i].u;
5649     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5650     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5651     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5652     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5653     ds_unneeded|=1;
5654     ds_unneeded_upper|=1;
5655     // branch taken
5656     if(!nevertaken) {
5657       //assem_debug("1:\n");
5658       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5659                     ds_unneeded,ds_unneeded_upper);
5660       // load regs
5661       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5662       address_generation(i+1,&branch_regs[i],0);
5663       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5664       ds_assemble(i+1,&branch_regs[i]);
5665       cc=get_reg(branch_regs[i].regmap,CCREG);
5666       if(cc==-1) {
5667         emit_loadreg(CCREG,cc=HOST_CCREG);
5668         // CHECK: Is the following instruction (fall thru) allocated ok?
5669       }
5670       assert(cc==HOST_CCREG);
5671       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5672       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5673       assem_debug("cycle count (adj)\n");
5674       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5675       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5676       if(internal)
5677         assem_debug("branch: internal\n");
5678       else
5679         assem_debug("branch: external\n");
5680       if(internal&&is_ds[(ba[i]-start)>>2]) {
5681         ds_assemble_entry(i);
5682       }
5683       else {
5684         add_to_linker((int)out,ba[i],internal);
5685         emit_jmp(0);
5686       }
5687     }
5688     // branch not taken
5689     cop1_usable=prev_cop1_usable;
5690     if(!unconditional) {
5691       set_jump_target(nottaken,(int)out);
5692       assem_debug("1:\n");
5693       if(!likely[i]) {
5694         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5695                       ds_unneeded,ds_unneeded_upper);
5696         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5697         address_generation(i+1,&branch_regs[i],0);
5698         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5699         ds_assemble(i+1,&branch_regs[i]);
5700       }
5701       cc=get_reg(branch_regs[i].regmap,CCREG);
5702       if(cc==-1&&!likely[i]) {
5703         // Cycle count isn't in a register, temporarily load it then write it out
5704         emit_loadreg(CCREG,HOST_CCREG);
5705         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5706         int jaddr=(int)out;
5707         emit_jns(0);
5708         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5709         emit_storereg(CCREG,HOST_CCREG);
5710       }
5711       else{
5712         cc=get_reg(i_regmap,CCREG);
5713         assert(cc==HOST_CCREG);
5714         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5715         int jaddr=(int)out;
5716         emit_jns(0);
5717         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5718       }
5719     }
5720   }
5721 }
5722
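     // Assemble a COP1 branch (BC1F/BC1T and the likely forms): test bit 23
     // (the FP condition flag) of the value cached in FSREG, emitting the
     // coprocessor-unusable check first if COP1 hasn't been tested yet.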
5723 void fjump_assemble(int i,struct regstat *i_regs)
5724 {
5725   signed char *i_regmap=i_regs->regmap;
5726   int cc;
5727   int match;
5728   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5729   assem_debug("fmatch=%d\n",match);
5730   int fs,cs;
5731   int eaddr;
5732   int ooo=1;
5733   int invert=0;
5734   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5735   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5736   if(likely[i]) ooo=0;
5737   if(!match) invert=1;
5738   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5739   if(i>(ba[i]-start)>>2) invert=1;
5740   #endif
5741
5742   if(ooo)
5743     if(itype[i+1]==FCOMP)
5744   {
5745     // Write-after-read dependency prevents out of order execution
5746     // First test branch condition, then execute delay slot, then branch
5747     ooo=0;
5748   }
5749
5750   if(ooo) {
5751     fs=get_reg(branch_regs[i].regmap,FSREG);
5752     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5753   }
5754   else {
5755     fs=get_reg(i_regmap,FSREG);
5756   }
5757
5758   // Check cop1 unusable
5759   if(!cop1_usable) {
5760     cs=get_reg(i_regmap,CSREG);
5761     assert(cs>=0);
5762     emit_testimm(cs,0x20000000);
5763     eaddr=(int)out;
5764     emit_jeq(0);
5765     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5766     cop1_usable=1;
5767   }
5768
5769   if(ooo) {
5770     // Out of order execution (delay slot first)
5771     //printf("OOOE\n");
5772     ds_assemble(i+1,i_regs);
5773     int adj;
5774     uint64_t bc_unneeded=branch_regs[i].u;
5775     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5776     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5777     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5778     bc_unneeded|=1;
5779     bc_unneeded_upper|=1;
5780     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5781                   bc_unneeded,bc_unneeded_upper);
5782     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5783     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5784     cc=get_reg(branch_regs[i].regmap,CCREG);
5785     assert(cc==HOST_CCREG);
5786     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5787     assem_debug("cycle count (adj)\n");
5788     if(1) {
5789       int nottaken=0;
5790       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5791       if(1) {
5792         assert(fs>=0);
5793         emit_testimm(fs,0x800000);
5794         if(source[i]&0x10000) // BC1T
5795         {
5796           if(invert){
5797             nottaken=(int)out;
5798             emit_jeq(1);
5799           }else{
5800             add_to_linker((int)out,ba[i],internal);
5801             emit_jne(0);
5802           }
5803         }
5804         else // BC1F
5805         {
5806           if(invert){
5807             nottaken=(int)out;
5808             emit_jne(1);
5809           }else{
5810             add_to_linker((int)out,ba[i],internal);
5811             emit_jeq(0);
5812           }
5813         }
5814       } // if(!only32)
5815           
5816       if(invert) {
5817         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5818         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5819         else if(match) emit_addnop(13);
5820         #endif
5821         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5822         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5823         if(internal)
5824           assem_debug("branch: internal\n");
5825         else
5826           assem_debug("branch: external\n");
5827         if(internal&&is_ds[(ba[i]-start)>>2]) {
5828           ds_assemble_entry(i);
5829         }
5830         else {
5831           add_to_linker((int)out,ba[i],internal);
5832           emit_jmp(0);
5833         }
5834         set_jump_target(nottaken,(int)out);
5835       }
5836
5837       if(adj) {
5838         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5839       }
5840     } // (!unconditional)
5841   } // if(ooo)
5842   else
5843   {
5844     // In-order execution (branch first)
5845     //printf("IOE\n");
5846     int nottaken=0;
5847     if(1) {
5848       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5849       if(1) {
5850         assert(fs>=0);
5851         emit_testimm(fs,0x800000);
5852         if(source[i]&0x10000) // BC1T
5853         {
5854           nottaken=(int)out;
5855           emit_jeq(1);
5856         }
5857         else // BC1F
5858         {
5859           nottaken=(int)out;
5860           emit_jne(1);
5861         }
5862       }
5863     } // if(!unconditional)
5864     int adj;
5865     uint64_t ds_unneeded=branch_regs[i].u;
5866     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5867     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5868     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5869     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5870     ds_unneeded|=1;
5871     ds_unneeded_upper|=1;
5872     // branch taken
5873     //assem_debug("1:\n");
5874     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5875                   ds_unneeded,ds_unneeded_upper);
5876     // load regs
5877     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5878     address_generation(i+1,&branch_regs[i],0);
5879     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5880     ds_assemble(i+1,&branch_regs[i]);
5881     cc=get_reg(branch_regs[i].regmap,CCREG);
5882     if(cc==-1) {
5883       emit_loadreg(CCREG,cc=HOST_CCREG);
5884       // CHECK: Is the following instruction (fall thru) allocated ok?
5885     }
5886     assert(cc==HOST_CCREG);
5887     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5888     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5889     assem_debug("cycle count (adj)\n");
5890     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5891     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5892     if(internal)
5893       assem_debug("branch: internal\n");
5894     else
5895       assem_debug("branch: external\n");
5896     if(internal&&is_ds[(ba[i]-start)>>2]) {
5897       ds_assemble_entry(i);
5898     }
5899     else {
5900       add_to_linker((int)out,ba[i],internal);
5901       emit_jmp(0);
5902     }
5903
5904     // branch not taken
5905     if(1) { // <- FIXME (don't need this)
5906       set_jump_target(nottaken,(int)out);
5907       assem_debug("1:\n");
5908       if(!likely[i]) {
5909         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5910                       ds_unneeded,ds_unneeded_upper);
5911         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5912         address_generation(i+1,&branch_regs[i],0);
5913         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5914         ds_assemble(i+1,&branch_regs[i]);
5915       }
5916       cc=get_reg(branch_regs[i].regmap,CCREG);
5917       if(cc==-1&&!likely[i]) {
5918         // Cycle count isn't in a register, temporarily load it then write it out
5919         emit_loadreg(CCREG,HOST_CCREG);
5920         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5921         int jaddr=(int)out;
5922         emit_jns(0);
5923         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5924         emit_storereg(CCREG,HOST_CCREG);
5925       }
5926       else{
5927         cc=get_reg(i_regmap,CCREG);
5928         assert(cc==HOST_CCREG);
5929         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5930         int jaddr=(int)out;
5931         emit_jns(0);
5932         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5933       }
5934     }
5935   }
5936 }
5937
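     // Assemble a branch whose delay slot falls on the next virtual page.
     // The chosen target (taken or fall-through) is placed in HOST_BTREG and
     // both exits leave through external jump stubs, since the next page may
     // not be compiled yet.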
5938 static void pagespan_assemble(int i,struct regstat *i_regs)
5939 {
5940   int s1l=get_reg(i_regs->regmap,rs1[i]);
5941   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5942   int s2l=get_reg(i_regs->regmap,rs2[i]);
5943   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5944   void *nt_branch=NULL;
5945   int taken=0;
5946   int nottaken=0;
5947   int unconditional=0;
5948   if(rs1[i]==0)
5949   {
5950     s1l=s2l;s1h=s2h;
5951     s2l=s2h=-1;
5952   }
5953   else if(rs2[i]==0)
5954   {
5955     s2l=s2h=-1;
5956   }
5957   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5958     s1h=s2h=-1;
5959   }
5960   int hr=0;
5961   int addr,alt,ntaddr;
5962   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5963   else {
5964     while(hr<HOST_REGS)
5965     {
5966       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5967          (i_regs->regmap[hr]&63)!=rs1[i] &&
5968          (i_regs->regmap[hr]&63)!=rs2[i] )
5969       {
5970         addr=hr++;break;
5971       }
5972       hr++;
5973     }
5974   }
5975   while(hr<HOST_REGS)
5976   {
5977     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5978        (i_regs->regmap[hr]&63)!=rs1[i] &&
5979        (i_regs->regmap[hr]&63)!=rs2[i] )
5980     {
5981       alt=hr++;break;
5982     }
5983     hr++;
5984   }
5985   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5986   {
5987     while(hr<HOST_REGS)
5988     {
5989       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5990          (i_regs->regmap[hr]&63)!=rs1[i] &&
5991          (i_regs->regmap[hr]&63)!=rs2[i] )
5992       {
5993         ntaddr=hr;break;
5994       }
5995       hr++;
5996     }
5997   }
5998   assert(hr<HOST_REGS);
5999   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6000     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6001   }
6002   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6003   if(opcode[i]==2) // J
6004   {
6005     unconditional=1;
6006   }
6007   if(opcode[i]==3) // JAL
6008   {
6009     // TODO: mini_ht
6010     int rt=get_reg(i_regs->regmap,31);
6011     emit_movimm(start+i*4+8,rt);
6012     unconditional=1;
6013   }
6014   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6015   {
6016     emit_mov(s1l,addr);
6017     if(opcode2[i]==9) // JALR
6018     {
6019       int rt=get_reg(i_regs->regmap,31);
6020       emit_movimm(start+i*4+8,rt);
6021     }
6022   }
6023   if((opcode[i]&0x3f)==4) // BEQ
6024   {
6025     if(rs1[i]==rs2[i])
6026     {
6027       unconditional=1;
6028     }
6029     else
6030     #ifdef HAVE_CMOV_IMM
6031     if(s1h<0) {
6032       if(s2l>=0) emit_cmp(s1l,s2l);
6033       else emit_test(s1l,s1l);
6034       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6035     }
6036     else
6037     #endif
6038     {
6039       assert(s1l>=0);
6040       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6041       if(s1h>=0) {
6042         if(s2h>=0) emit_cmp(s1h,s2h);
6043         else emit_test(s1h,s1h);
6044         emit_cmovne_reg(alt,addr);
6045       }
6046       if(s2l>=0) emit_cmp(s1l,s2l);
6047       else emit_test(s1l,s1l);
6048       emit_cmovne_reg(alt,addr);
6049     }
6050   }
6051   if((opcode[i]&0x3f)==5) // BNE
6052   {
6053     #ifdef HAVE_CMOV_IMM
6054     if(s1h<0) {
6055       if(s2l>=0) emit_cmp(s1l,s2l);
6056       else emit_test(s1l,s1l);
6057       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6058     }
6059     else
6060     #endif
6061     {
6062       assert(s1l>=0);
6063       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6064       if(s1h>=0) {
6065         if(s2h>=0) emit_cmp(s1h,s2h);
6066         else emit_test(s1h,s1h);
6067         emit_cmovne_reg(alt,addr);
6068       }
6069       if(s2l>=0) emit_cmp(s1l,s2l);
6070       else emit_test(s1l,s1l);
6071       emit_cmovne_reg(alt,addr);
6072     }
6073   }
6074   if((opcode[i]&0x3f)==0x14) // BEQL
6075   {
6076     if(s1h>=0) {
6077       if(s2h>=0) emit_cmp(s1h,s2h);
6078       else emit_test(s1h,s1h);
6079       nottaken=(int)out;
6080       emit_jne(0);
6081     }
6082     if(s2l>=0) emit_cmp(s1l,s2l);
6083     else emit_test(s1l,s1l);
6084     if(nottaken) set_jump_target(nottaken,(int)out);
6085     nottaken=(int)out;
6086     emit_jne(0);
6087   }
6088   if((opcode[i]&0x3f)==0x15) // BNEL
6089   {
6090     if(s1h>=0) {
6091       if(s2h>=0) emit_cmp(s1h,s2h);
6092       else emit_test(s1h,s1h);
6093       taken=(int)out;
6094       emit_jne(0);
6095     }
6096     if(s2l>=0) emit_cmp(s1l,s2l);
6097     else emit_test(s1l,s1l);
6098     nottaken=(int)out;
6099     emit_jeq(0);
6100     if(taken) set_jump_target(taken,(int)out);
6101   }
6102   if((opcode[i]&0x3f)==6) // BLEZ
6103   {
6104     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6105     emit_cmpimm(s1l,1);
6106     if(s1h>=0) emit_mov(addr,ntaddr);
6107     emit_cmovl_reg(alt,addr);
6108     if(s1h>=0) {
6109       emit_test(s1h,s1h);
6110       emit_cmovne_reg(ntaddr,addr);
6111       emit_cmovs_reg(alt,addr);
6112     }
6113   }
6114   if((opcode[i]&0x3f)==7) // BGTZ
6115   {
6116     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6117     emit_cmpimm(s1l,1);
6118     if(s1h>=0) emit_mov(addr,alt);
6119     emit_cmovl_reg(ntaddr,addr);
6120     if(s1h>=0) {
6121       emit_test(s1h,s1h);
6122       emit_cmovne_reg(alt,addr);
6123       emit_cmovs_reg(ntaddr,addr);
6124     }
6125   }
6126   if((opcode[i]&0x3f)==0x16) // BLEZL
6127   {
6128     assert((opcode[i]&0x3f)!=0x16);
6129   }
6130   if((opcode[i]&0x3f)==0x17) // BGTZL
6131   {
6132     assert((opcode[i]&0x3f)!=0x17);
6133   }
6134   assert(opcode[i]!=1); // BLTZ/BGEZ
6135
6136   //FIXME: Check CSREG
6137   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6138     if((source[i]&0x30000)==0) // BC1F
6139     {
6140       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6141       emit_testimm(s1l,0x800000);
6142       emit_cmovne_reg(alt,addr);
6143     }
6144     if((source[i]&0x30000)==0x10000) // BC1T
6145     {
6146       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6147       emit_testimm(s1l,0x800000);
6148       emit_cmovne_reg(alt,addr);
6149     }
6150     if((source[i]&0x30000)==0x20000) // BC1FL
6151     {
6152       emit_testimm(s1l,0x800000);
6153       nottaken=(int)out;
6154       emit_jne(0);
6155     }
6156     if((source[i]&0x30000)==0x30000) // BC1TL
6157     {
6158       emit_testimm(s1l,0x800000);
6159       nottaken=(int)out;
6160       emit_jeq(0);
6161     }
6162   }
6163
6164   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6165   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6166   if(likely[i]||unconditional)
6167   {
6168     emit_movimm(ba[i],HOST_BTREG);
6169   }
6170   else if(addr!=HOST_BTREG)
6171   {
6172     emit_mov(addr,HOST_BTREG);
6173   }
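       // Emit the page-spanning exit: if the target is already compiled, patch
       // the jump straight to it and record the link; otherwise jump to the
       // extjump stub, which resolves the target at run time.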
6174   void *branch_addr=out;
6175   emit_jmp(0);
6176   int target_addr=start+i*4+5;
6177   void *stub=out;
6178   void *compiled_target_addr=check_addr(target_addr);
6179   emit_extjump_ds((int)branch_addr,target_addr);
6180   if(compiled_target_addr) {
6181     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6182     add_link(target_addr,stub);
6183   }
6184   else set_jump_target((int)branch_addr,(int)stub);
6185   if(likely[i]) {
6186     // Not-taken path
6187     set_jump_target((int)nottaken,(int)out);
6188     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6189     void *branch_addr=out;
6190     emit_jmp(0);
6191     int target_addr=start+i*4+8;
6192     void *stub=out;
6193     void *compiled_target_addr=check_addr(target_addr);
6194     emit_extjump_ds((int)branch_addr,target_addr);
6195     if(compiled_target_addr) {
6196       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6197       add_link(target_addr,stub);
6198     }
6199     else set_jump_target((int)branch_addr,(int)stub);
6200   }
6201 }
6202
6203 // Assemble the delay slot for the above
6204 static void pagespan_ds()
6205 {
6206   assem_debug("initial delay slot:\n");
6207   u_int vaddr=start+1;
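       // start+1 is an address no MIPS instruction can occupy, so it serves as
       // a distinct lookup key for this page-spanning delay-slot entry.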
6208   u_int page=get_page(vaddr);
6209   u_int vpage=get_vpage(vaddr);
6210   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6211   do_dirty_stub_ds();
6212   ll_add(jump_in+page,vaddr,(void *)out);
6213   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6214   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6215     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6216   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6217     emit_writeword(HOST_BTREG,(int)&branch_target);
6218   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6219   address_generation(0,&regs[0],regs[0].regmap_entry);
6220   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6221     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6222   cop1_usable=0;
6223   is_delayslot=0;
6224   switch(itype[0]) {
6225     case ALU:
6226       alu_assemble(0,&regs[0]);break;
6227     case IMM16:
6228       imm16_assemble(0,&regs[0]);break;
6229     case SHIFT:
6230       shift_assemble(0,&regs[0]);break;
6231     case SHIFTIMM:
6232       shiftimm_assemble(0,&regs[0]);break;
6233     case LOAD:
6234       load_assemble(0,&regs[0]);break;
6235     case LOADLR:
6236       loadlr_assemble(0,&regs[0]);break;
6237     case STORE:
6238       store_assemble(0,&regs[0]);break;
6239     case STORELR:
6240       storelr_assemble(0,&regs[0]);break;
6241     case COP0:
6242       cop0_assemble(0,&regs[0]);break;
6243     case COP1:
6244       cop1_assemble(0,&regs[0]);break;
6245     case C1LS:
6246       c1ls_assemble(0,&regs[0]);break;
6247     case FCONV:
6248       fconv_assemble(0,&regs[0]);break;
6249     case FLOAT:
6250       float_assemble(0,&regs[0]);break;
6251     case FCOMP:
6252       fcomp_assemble(0,&regs[0]);break;
6253     case MULTDIV:
6254       multdiv_assemble(0,&regs[0]);break;
6255     case MOV:
6256       mov_assemble(0,&regs[0]);break;
6257     case SYSCALL:
6258     case SPAN:
6259     case UJUMP:
6260     case RJUMP:
6261     case CJUMP:
6262     case SJUMP:
6263     case FJUMP:
6264       printf("Jump in the delay slot.  This is probably a bug.\n");
6265   }
6266   int btaddr=get_reg(regs[0].regmap,BTREG);
6267   if(btaddr<0) {
6268     btaddr=get_reg(regs[0].regmap,-1);
6269     emit_readword((int)&branch_target,btaddr);
6270   }
6271   assert(btaddr!=HOST_CCREG);
6272   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6273 #ifdef HOST_IMM8
6274   emit_movimm(start+4,HOST_TEMPREG);
6275   emit_cmp(btaddr,HOST_TEMPREG);
6276 #else
6277   emit_cmpimm(btaddr,start+4);
6278 #endif
6279   int branch=(int)out;
6280   emit_jeq(0);
6281   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6282   emit_jmp(jump_vaddr_reg[btaddr]);
6283   set_jump_target(branch,(int)out);
6284   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6285   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6286 }
6287
6288 // Basic liveness analysis for MIPS registers
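     // Scans the block backwards computing, per instruction, a bitmask of MIPS
     // registers whose value is dead (unneeded_reg) and whose upper 32 bits
     // are dead (unneeded_reg_upper).  A written register becomes unneeded
     // above the write, a read register becomes needed, and r0 (bit 0) is
     // always unneeded.  For example "addu $t0,$a0,$a1" sets bit 8 ($t0) and
     // clears bits 4 and 5 ($a0/$a1).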
6289 void unneeded_registers(int istart,int iend,int r)
6290 {
6291   int i;
6292   uint64_t u,uu,b,bu;
6293   uint64_t temp_u,temp_uu;
6294   uint64_t tdep;
6295   if(iend==slen-1) {
6296     u=1;uu=1;
6297   }else{
6298     u=unneeded_reg[iend+1];
6299     uu=unneeded_reg_upper[iend+1];
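         // The next line conservatively treats everything except r0 as needed
         // at the block boundary, so the two loads above are effectively dead.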
6300     u=1;uu=1;
6301   }
6302   for (i=iend;i>=istart;i--)
6303   {
6304     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6305     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6306     {
6307       // If subroutine call, flag return address as a possible branch target
6308       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6309       
6310       if(ba[i]<start || ba[i]>=(start+slen*4))
6311       {
6312         // Branch out of this block, flush all regs
6313         u=1;
6314         uu=1;
6315         /* Hexagon hack 
6316         if(itype[i]==UJUMP&&rt1[i]==31)
6317         {
6318           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6319         }
6320         if(itype[i]==RJUMP&&rs1[i]==31)
6321         {
6322           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6323         }
6324         if(start>0x80000400&&start<0x80800000) {
6325           if(itype[i]==UJUMP&&rt1[i]==31)
6326           {
6327             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6328             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6329           }
6330           if(itype[i]==RJUMP&&rs1[i]==31)
6331           {
6332             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6333             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6334           }
6335         }*/
6336         branch_unneeded_reg[i]=u;
6337         branch_unneeded_reg_upper[i]=uu;
6338         // Merge in delay slot
6339         tdep=(~uu>>rt1[i+1])&1;
6340         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6341         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6342         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6343         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6344         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6345         u|=1;uu|=1;
6346         // If branch is "likely" (and conditional)
6347         // then we skip the delay slot on the fall-thru path
6348         if(likely[i]) {
6349           if(i<slen-1) {
6350             u&=unneeded_reg[i+2];
6351             uu&=unneeded_reg_upper[i+2];
6352           }
6353           else
6354           {
6355             u=1;
6356             uu=1;
6357           }
6358         }
6359       }
6360       else
6361       {
6362         // Internal branch, flag target
6363         bt[(ba[i]-start)>>2]=1;
6364         if(ba[i]<=start+i*4) {
6365           // Backward branch
6366           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6367           {
6368             // Unconditional branch
6369             temp_u=1;temp_uu=1;
6370           } else {
6371             // Conditional branch (not taken case)
6372             temp_u=unneeded_reg[i+2];
6373             temp_uu=unneeded_reg_upper[i+2];
6374           }
6375           // Merge in delay slot
6376           tdep=(~temp_uu>>rt1[i+1])&1;
6377           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6378           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6379           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6380           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6381           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6382           temp_u|=1;temp_uu|=1;
6383           // If branch is "likely" (and conditional)
6384           // then we skip the delay slot on the fall-thru path
6385           if(likely[i]) {
6386             if(i<slen-1) {
6387               temp_u&=unneeded_reg[i+2];
6388               temp_uu&=unneeded_reg_upper[i+2];
6389             }
6390             else
6391             {
6392               temp_u=1;
6393               temp_uu=1;
6394             }
6395           }
6396           tdep=(~temp_uu>>rt1[i])&1;
6397           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6398           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6399           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6400           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6401           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6402           temp_u|=1;temp_uu|=1;
6403           unneeded_reg[i]=temp_u;
6404           unneeded_reg_upper[i]=temp_uu;
6405           // Only go three levels deep.  This recursion can take an
6406           // excessive amount of time if there are a lot of nested loops.
6407           if(r<2) {
6408             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6409           }else{
6410             unneeded_reg[(ba[i]-start)>>2]=1;
6411             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6412           }
6413         } /*else*/ if(1) {
6414           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6415           {
6416             // Unconditional branch
6417             u=unneeded_reg[(ba[i]-start)>>2];
6418             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6419             branch_unneeded_reg[i]=u;
6420             branch_unneeded_reg_upper[i]=uu;
6421         //u=1;
6422         //uu=1;
6423         //branch_unneeded_reg[i]=u;
6424         //branch_unneeded_reg_upper[i]=uu;
6425             // Merge in delay slot
6426             tdep=(~uu>>rt1[i+1])&1;
6427             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6428             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6429             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6430             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6431             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6432             u|=1;uu|=1;
6433           } else {
6434             // Conditional branch
6435             b=unneeded_reg[(ba[i]-start)>>2];
6436             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6437             branch_unneeded_reg[i]=b;
6438             branch_unneeded_reg_upper[i]=bu;
6439         //b=1;
6440         //bu=1;
6441         //branch_unneeded_reg[i]=b;
6442         //branch_unneeded_reg_upper[i]=bu;
6443             // Branch delay slot
6444             tdep=(~uu>>rt1[i+1])&1;
6445             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6446             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6447             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6448             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6449             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6450             b|=1;bu|=1;
6451             // If branch is "likely" then we skip the
6452             // delay slot on the fall-thru path
6453             if(likely[i]) {
6454               u=b;
6455               uu=bu;
6456               if(i<slen-1) {
6457                 u&=unneeded_reg[i+2];
6458                 uu&=unneeded_reg_upper[i+2];
6459         //u=1;
6460         //uu=1;
6461               }
6462             } else {
6463               u&=b;
6464               uu&=bu;
6465         //u=1;
6466         //uu=1;
6467             }
6468             if(i<slen-1) {
6469               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6470               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6471         //branch_unneeded_reg[i]=1;
6472         //branch_unneeded_reg_upper[i]=1;
6473             } else {
6474               branch_unneeded_reg[i]=1;
6475               branch_unneeded_reg_upper[i]=1;
6476             }
6477           }
6478         }
6479       }
6480     }
6481     else if(itype[i]==SYSCALL)
6482     {
6483       // SYSCALL instruction (software interrupt)
6484       u=1;
6485       uu=1;
6486     }
6487     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6488     {
6489       // ERET instruction (return from interrupt)
6490       u=1;
6491       uu=1;
6492     }
6493     //u=uu=1; // DEBUG
6494     tdep=(~uu>>rt1[i])&1;
6495     // Written registers are unneeded
6496     u|=1LL<<rt1[i];
6497     u|=1LL<<rt2[i];
6498     uu|=1LL<<rt1[i];
6499     uu|=1LL<<rt2[i];
6500     // Accessed registers are needed
6501     u&=~(1LL<<rs1[i]);
6502     u&=~(1LL<<rs2[i]);
6503     uu&=~(1LL<<us1[i]);
6504     uu&=~(1LL<<us2[i]);
6505     // Source-target dependencies
6506     uu&=~(tdep<<dep1[i]);
6507     uu&=~(tdep<<dep2[i]);
6508     // R0 is always unneeded
6509     u|=1;uu|=1;
6510     // Save it
6511     unneeded_reg[i]=u;
6512     unneeded_reg_upper[i]=uu;
6513 #ifdef FORCE32
6514     unneeded_reg_upper[i]=-1LL;
6515 #endif
6516     /*
6517     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6518     printf("U:");
6519     int r;
6520     for(r=1;r<=CCREG;r++) {
6521       if((unneeded_reg[i]>>r)&1) {
6522         if(r==HIREG) printf(" HI");
6523         else if(r==LOREG) printf(" LO");
6524         else printf(" r%d",r);
6525       }
6526     }
6527     printf(" UU:");
6528     for(r=1;r<=CCREG;r++) {
6529       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6530         if(r==HIREG) printf(" HI");
6531         else if(r==LOREG) printf(" LO");
6532         else printf(" r%d",r);
6533       }
6534     }
6535     printf("\n");*/
6536   }
6537 }
6538
6539 // Identify registers which are likely to contain 32-bit values
6540 // This is used to predict whether any branches will jump to a
6541 // location with 64-bit values in registers.
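     // Bit r of is32 set means register r is expected to hold a sign-extended
     // 32-bit value; bit 0 stays set because r0 is always zero.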
6542 static void provisional_32bit()
6543 {
6544   int i,j;
6545   uint64_t is32=1;
6546   uint64_t lastbranch=1;
6547   
6548   for(i=0;i<slen;i++)
6549   {
6550     if(i>0) {
6551       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6552         if(i>1) is32=lastbranch;
6553         else is32=1;
6554       }
6555     }
6556     if(i>1)
6557     {
6558       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6559         if(likely[i-2]) {
6560           if(i>2) is32=lastbranch;
6561           else is32=1;
6562         }
6563       }
6564       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6565       {
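               // Falling through a BNE/BNEL against $zero implies the tested
               // register compared equal to zero, hence a 32-bit value.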
6566         if(rs1[i-2]==0||rs2[i-2]==0)
6567         {
6568           if(rs1[i-2]) {
6569             is32|=1LL<<rs1[i-2];
6570           }
6571           if(rs2[i-2]) {
6572             is32|=1LL<<rs2[i-2];
6573           }
6574         }
6575       }
6576     }
6577     // If something jumps here with 64-bit values
6578     // then promote those registers to 64 bits
6579     if(bt[i])
6580     {
6581       uint64_t temp_is32=is32;
6582       for(j=i-1;j>=0;j--)
6583       {
6584         if(ba[j]==start+i*4) 
6585           //temp_is32&=branch_regs[j].is32;
6586           temp_is32&=p32[j];
6587       }
6588       for(j=i;j<slen;j++)
6589       {
6590         if(ba[j]==start+i*4) 
6591           temp_is32=1;
6592       }
6593       is32=temp_is32;
6594     }
6595     int type=itype[i];
6596     int op=opcode[i];
6597     int op2=opcode2[i];
6598     int rt=rt1[i];
6599     int s1=rs1[i];
6600     int s2=rs2[i];
6601     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6602       // Branches don't write registers, consider the delay slot instead.
6603       type=itype[i+1];
6604       op=opcode[i+1];
6605       op2=opcode2[i+1];
6606       rt=rt1[i+1];
6607       s1=rs1[i+1];
6608       s2=rs2[i+1];
6609       lastbranch=is32;
6610     }
6611     switch(type) {
6612       case LOAD:
6613         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6614            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6615           is32&=~(1LL<<rt);
6616         else
6617           is32|=1LL<<rt;
6618         break;
6619       case STORE:
6620       case STORELR:
6621         break;
6622       case LOADLR:
6623         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6624         if(op==0x22) is32|=1LL<<rt; // LWL
6625         break;
6626       case IMM16:
6627         if (op==0x08||op==0x09|| // ADDI/ADDIU
6628             op==0x0a||op==0x0b|| // SLTI/SLTIU
6629             op==0x0c|| // ANDI
6630             op==0x0f)  // LUI
6631         {
6632           is32|=1LL<<rt;
6633         }
6634         if(op==0x18||op==0x19) { // DADDI/DADDIU
6635           is32&=~(1LL<<rt);
6636           //if(imm[i]==0)
6637           //  is32|=((is32>>s1)&1LL)<<rt;
6638         }
6639         if(op==0x0d||op==0x0e) { // ORI/XORI
6640           uint64_t sr=((is32>>s1)&1LL);
6641           is32&=~(1LL<<rt);
6642           is32|=sr<<rt;
6643         }
6644         break;
6645       case UJUMP:
6646         break;
6647       case RJUMP:
6648         break;
6649       case CJUMP:
6650         break;
6651       case SJUMP:
6652         break;
6653       case FJUMP:
6654         break;
6655       case ALU:
6656         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6657           is32|=1LL<<rt;
6658         }
6659         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6660           is32|=1LL<<rt;
6661         }
6662         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6663           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6664           is32&=~(1LL<<rt);
6665           is32|=sr<<rt;
6666         }
6667         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6668           if(s1==0&&s2==0) {
6669             is32|=1LL<<rt;
6670           }
6671           else if(s2==0) {
6672             uint64_t sr=((is32>>s1)&1LL);
6673             is32&=~(1LL<<rt);
6674             is32|=sr<<rt;
6675           }
6676           else if(s1==0) {
6677             uint64_t sr=((is32>>s2)&1LL);
6678             is32&=~(1LL<<rt);
6679             is32|=sr<<rt;
6680           }
6681           else {
6682             is32&=~(1LL<<rt);
6683           }
6684         }
6685         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6686           if(s1==0&&s2==0) {
6687             is32|=1LL<<rt;
6688           }
6689           else if(s2==0) {
6690             uint64_t sr=((is32>>s1)&1LL);
6691             is32&=~(1LL<<rt);
6692             is32|=sr<<rt;
6693           }
6694           else {
6695             is32&=~(1LL<<rt);
6696           }
6697         }
6698         break;
6699       case MULTDIV:
6700         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6701           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6702         }
6703         else {
6704           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6705         }
6706         break;
6707       case MOV:
6708         {
6709           uint64_t sr=((is32>>s1)&1LL);
6710           is32&=~(1LL<<rt);
6711           is32|=sr<<rt;
6712         }
6713         break;
6714       case SHIFT:
6715         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6716         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6717         break;
6718       case SHIFTIMM:
6719         is32|=1LL<<rt;
6720         // DSLL/DSRL/DSRA/DSLL32/DSRL32 produce a 64-bit result, but DSRA32 does not
6721         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6722         break;
6723       case COP0:
6724         if(op2==0) is32|=1LL<<rt; // MFC0
6725         break;
6726       case COP1:
6727         if(op2==0) is32|=1LL<<rt; // MFC1
6728         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6729         if(op2==2) is32|=1LL<<rt; // CFC1
6730         break;
6731       case C1LS:
6732         break;
6733       case FLOAT:
6734       case FCONV:
6735         break;
6736       case FCOMP:
6737         break;
6738       case SYSCALL:
6739         break;
6740       default:
6741         break;
6742     }
6743     is32|=1;
6744     p32[i]=is32;
6745
6746     if(i>0)
6747     {
6748       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6749       {
6750         if(rt1[i-1]==31) // JAL/JALR
6751         {
6752           // Subroutine call will return here, don't alloc any registers
6753           is32=1;
6754         }
6755         else if(i+1<slen)
6756         {
6757           // Internal branch will jump here, match registers to caller
6758           is32=0x3FFFFFFFFLL;
6759         }
6760       }
6761     }
6762   }
6763 }
6764
6765 // Identify registers which may be assumed to contain 32-bit values
6766 // and where optimizations will rely on this.
6767 // This is used to determine whether backward branches can safely
6768 // jump to a location with 64-bit values in registers.
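// Works backwards through the block and stores the result in pr32[], the
// provisional counterpart of requires_32bit[] (see the commented-out
// assignments below).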
6769 static void provisional_r32()
6770 {
6771   u_int r32=0;
6772   int i;
6773   
6774   for (i=slen-1;i>=0;i--)
6775   {
6776     int hr;
6777     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6778     {
6779       if(ba[i]<start || ba[i]>=(start+slen*4))
6780       {
6781         // Branch out of this block, don't need anything
6782         r32=0;
6783       }
6784       else
6785       {
6786         // Internal branch
6787         // Need whatever matches the target
6788         // (and doesn't get overwritten by the delay slot instruction)
6789         r32=0;
6790         int t=(ba[i]-start)>>2;
6791         if(ba[i]>start+i*4) {
6792           // Forward branch
6793           //if(!(requires_32bit[t]&~regs[i].was32))
6794           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6795           if(!(pr32[t]&~regs[i].was32))
6796             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6797         }else{
6798           // Backward branch
6799           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6800             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6801         }
6802       }
6803       // Conditional branch may need registers for following instructions
6804       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6805       {
6806         if(i<slen-2) {
6807           //r32|=requires_32bit[i+2];
6808           r32|=pr32[i+2];
6809           r32&=regs[i].was32;
6810           // Mark this address as a branch target since it may be called
6811           // upon return from interrupt
6812           //bt[i+2]=1;
6813         }
6814       }
6815       // Merge in delay slot
6816       if(!likely[i]) {
6817         // These are overwritten unless the branch is "likely"
6818         // and the delay slot is nullified if not taken
6819         r32&=~(1LL<<rt1[i+1]);
6820         r32&=~(1LL<<rt2[i+1]);
6821       }
6822       // Assume these are needed (delay slot)
6823       if(us1[i+1]>0)
6824       {
6825         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6826       }
6827       if(us2[i+1]>0)
6828       {
6829         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6830       }
6831       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6832       {
6833         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6834       }
6835       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6836       {
6837         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6838       }
6839     }
6840     else if(itype[i]==SYSCALL)
6841     {
6842       // SYSCALL instruction (software interrupt)
6843       r32=0;
6844     }
6845     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6846     {
6847       // ERET instruction (return from interrupt)
6848       r32=0;
6849     }
6850     // Check 32 bits
6851     r32&=~(1LL<<rt1[i]);
6852     r32&=~(1LL<<rt2[i]);
6853     if(us1[i]>0)
6854     {
6855       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6856     }
6857     if(us2[i]>0)
6858     {
6859       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6860     }
6861     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6862     {
6863       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6864     }
6865     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6866     {
6867       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6868     }
6869     //requires_32bit[i]=r32;
6870     pr32[i]=r32;
6871     
6872     // Dirty registers which are 32-bit require 32-bit input,
6873     // as they will be written as 32-bit values
6874     for(hr=0;hr<HOST_REGS;hr++)
6875     {
6876       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6877         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6878           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6879               pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6880           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6881         }
6882       }
6883     }
6884   }
6885 }
6886
6887 // Write back dirty registers as soon as we will no longer modify them,
6888 // so that we don't end up with lots of writes at the branches.
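// will_dirty[]/wont_dirty[] are per-instruction bitmasks over host registers,
// filled in by a backward pass over [istart,iend].  When wr is zero the pass
// only computes the masks (this is how it is called when recursing into branch
// targets); when wr is nonzero they are also merged into the regs[] and
// branch_regs[] dirty/wasdirty state.  regmap values are MIPS register
// numbers; the &63 masks strip the flag bit used for mappings of the upper
// half of a 64-bit register.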
6889 void clean_registers(int istart,int iend,int wr)
6890 {
6891   int i;
6892   int r;
6893   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6894   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6895   if(iend==slen-1) {
6896     will_dirty_i=will_dirty_next=0;
6897     wont_dirty_i=wont_dirty_next=0;
6898   }else{
6899     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6900     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6901   }
6902   for (i=iend;i>=istart;i--)
6903   {
6904     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6905     {
6906       if(ba[i]<start || ba[i]>=(start+slen*4))
6907       {
6908         // Branch out of this block, flush all regs
6909         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6910         {
6911           // Unconditional branch
6912           will_dirty_i=0;
6913           wont_dirty_i=0;
6914           // Merge in delay slot (will dirty)
6915           for(r=0;r<HOST_REGS;r++) {
6916             if(r!=EXCLUDE_REG) {
6917               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6918               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6919               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6920               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6921               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6922               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6923               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6924               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6925               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6926               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6927               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6928               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6929               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6930               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6931             }
6932           }
6933         }
6934         else
6935         {
6936           // Conditional branch
6937           will_dirty_i=0;
6938           wont_dirty_i=wont_dirty_next;
6939           // Merge in delay slot (will dirty)
6940           for(r=0;r<HOST_REGS;r++) {
6941             if(r!=EXCLUDE_REG) {
6942               if(!likely[i]) {
6943                 // Might not dirty if likely branch is not taken
6944                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6945                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6946                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6947                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6948                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6949                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6950                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6951                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6952                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6953                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6954                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6955                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6956                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6957                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6958               }
6959             }
6960           }
6961         }
6962         // Merge in delay slot (wont dirty)
6963         for(r=0;r<HOST_REGS;r++) {
6964           if(r!=EXCLUDE_REG) {
6965             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6966             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6967             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6968             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6969             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6970             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6971             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6972             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6973             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6974             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6975           }
6976         }
6977         if(wr) {
6978           #ifndef DESTRUCTIVE_WRITEBACK
6979           branch_regs[i].dirty&=wont_dirty_i;
6980           #endif
6981           branch_regs[i].dirty|=will_dirty_i;
6982         }
6983       }
6984       else
6985       {
6986         // Internal branch
6987         if(ba[i]<=start+i*4) {
6988           // Backward branch
6989           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6990           {
6991             // Unconditional branch
6992             temp_will_dirty=0;
6993             temp_wont_dirty=0;
6994             // Merge in delay slot (will dirty)
6995             for(r=0;r<HOST_REGS;r++) {
6996               if(r!=EXCLUDE_REG) {
6997                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6998                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6999                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7000                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7001                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7002                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7003                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7004                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7005                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7006                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7007                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7008                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7009                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7010                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7011               }
7012             }
7013           } else {
7014             // Conditional branch (not taken case)
7015             temp_will_dirty=will_dirty_next;
7016             temp_wont_dirty=wont_dirty_next;
7017             // Merge in delay slot (will dirty)
7018             for(r=0;r<HOST_REGS;r++) {
7019               if(r!=EXCLUDE_REG) {
7020                 if(!likely[i]) {
7021                   // Will not dirty if likely branch is not taken
7022                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7023                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7024                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7025                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7026                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7027                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7028                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7029                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7030                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7031                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7032                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7033                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7034                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7035                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7036                 }
7037               }
7038             }
7039           }
7040           // Merge in delay slot (wont dirty)
7041           for(r=0;r<HOST_REGS;r++) {
7042             if(r!=EXCLUDE_REG) {
7043               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7044               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7045               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7046               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7047               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7048               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7049               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7050               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7051               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7052               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7053             }
7054           }
7055           // Deal with changed mappings
7056           if(i<iend) {
7057             for(r=0;r<HOST_REGS;r++) {
7058               if(r!=EXCLUDE_REG) {
7059                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7060                   temp_will_dirty&=~(1<<r);
7061                   temp_wont_dirty&=~(1<<r);
7062                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7063                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7064                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7065                   } else {
7066                     temp_will_dirty|=1<<r;
7067                     temp_wont_dirty|=1<<r;
7068                   }
7069                 }
7070               }
7071             }
7072           }
7073           if(wr) {
7074             will_dirty[i]=temp_will_dirty;
7075             wont_dirty[i]=temp_wont_dirty;
7076             clean_registers((ba[i]-start)>>2,i-1,0);
7077           }else{
7078             // Limit recursion.  It can take an excessive amount
7079             // of time if there are a lot of nested loops.
7080             will_dirty[(ba[i]-start)>>2]=0;
7081             wont_dirty[(ba[i]-start)>>2]=-1;
7082           }
7083         }
7084         /*else*/ if(1)
7085         {
7086           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7087           {
7088             // Unconditional branch
7089             will_dirty_i=0;
7090             wont_dirty_i=0;
7091           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7092             for(r=0;r<HOST_REGS;r++) {
7093               if(r!=EXCLUDE_REG) {
7094                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7095                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7096                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7097                 }
7098               }
7099             }
7100           //}
7101             // Merge in delay slot
7102             for(r=0;r<HOST_REGS;r++) {
7103               if(r!=EXCLUDE_REG) {
7104                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7105                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7106                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7107                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7108                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7109                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7110                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7111                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7112                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7113                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7114                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7115                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7116                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7117                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7118               }
7119             }
7120           } else {
7121             // Conditional branch
7122             will_dirty_i=will_dirty_next;
7123             wont_dirty_i=wont_dirty_next;
7124           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7125             for(r=0;r<HOST_REGS;r++) {
7126               if(r!=EXCLUDE_REG) {
7127                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7128                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7129                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7130                 }
7131                 else
7132                 {
7133                   will_dirty_i&=~(1<<r);
7134                 }
7135                 // Treat delay slot as part of branch too
7136                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7137                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7138                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7139                 }
7140                 else
7141                 {
7142                   will_dirty[i+1]&=~(1<<r);
7143                 }*/
7144               }
7145             }
7146           //}
7147             // Merge in delay slot
7148             for(r=0;r<HOST_REGS;r++) {
7149               if(r!=EXCLUDE_REG) {
7150                 if(!likely[i]) {
7151                   // Might not dirty if likely branch is not taken
7152                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7153                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7154                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7155                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7156                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7157                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7158                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7159                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7160                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7161                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7162                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7163                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7164                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7165                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7166                 }
7167               }
7168             }
7169           }
7170           // Merge in delay slot
7171           for(r=0;r<HOST_REGS;r++) {
7172             if(r!=EXCLUDE_REG) {
7173               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7174               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7175               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7176               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7177               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7178               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7179               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7180               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7181               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7182               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7183             }
7184           }
7185           if(wr) {
7186             #ifndef DESTRUCTIVE_WRITEBACK
7187             branch_regs[i].dirty&=wont_dirty_i;
7188             #endif
7189             branch_regs[i].dirty|=will_dirty_i;
7190           }
7191         }
7192       }
7193     }
7194     else if(itype[i]==SYSCALL)
7195     {
7196       // SYSCALL instruction (software interrupt)
7197       will_dirty_i=0;
7198       wont_dirty_i=0;
7199     }
7200     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7201     {
7202       // ERET instruction (return from interrupt)
7203       will_dirty_i=0;
7204       wont_dirty_i=0;
7205     }
7206     will_dirty_next=will_dirty_i;
7207     wont_dirty_next=wont_dirty_i;
7208     for(r=0;r<HOST_REGS;r++) {
7209       if(r!=EXCLUDE_REG) {
7210         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7211         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7212         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7213         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7214         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7215         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7216         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7217         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7218         if(i>istart) {
7219           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7220           {
7221             // Don't store a register immediately after writing it,
7222             // as this may prevent dual-issue.
7223             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7224             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7225           }
7226         }
7227       }
7228     }
7229     // Save it
7230     will_dirty[i]=will_dirty_i;
7231     wont_dirty[i]=wont_dirty_i;
7232     // Mark registers that won't be dirtied as not dirty
7233     if(wr) {
7234       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7235       for(r=0;r<HOST_REGS;r++) {
7236         if((will_dirty_i>>r)&1) {
7237           printf(" r%d",r);
7238         }
7239       }
7240       printf("\n");*/
7241
7242       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7243         regs[i].dirty|=will_dirty_i;
7244         #ifndef DESTRUCTIVE_WRITEBACK
7245         regs[i].dirty&=wont_dirty_i;
7246         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7247         {
7248           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7249             for(r=0;r<HOST_REGS;r++) {
7250               if(r!=EXCLUDE_REG) {
7251                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7252                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7253                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7254               }
7255             }
7256           }
7257         }
7258         else
7259         {
7260           if(i<iend) {
7261             for(r=0;r<HOST_REGS;r++) {
7262               if(r!=EXCLUDE_REG) {
7263                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7264                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7265                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7266               }
7267             }
7268           }
7269         }
7270         #endif
7271       //}
7272     }
7273     // Deal with changed mappings
7274     temp_will_dirty=will_dirty_i;
7275     temp_wont_dirty=wont_dirty_i;
7276     for(r=0;r<HOST_REGS;r++) {
7277       if(r!=EXCLUDE_REG) {
7278         int nr;
7279         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7280           if(wr) {
7281             #ifndef DESTRUCTIVE_WRITEBACK
7282             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7283             #endif
7284             regs[i].wasdirty|=will_dirty_i&(1<<r);
7285           }
7286         }
7287         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7288           // Register moved to a different register
7289           will_dirty_i&=~(1<<r);
7290           wont_dirty_i&=~(1<<r);
7291           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7292           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7293           if(wr) {
7294             #ifndef DESTRUCTIVE_WRITEBACK
7295             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7296             #endif
7297             regs[i].wasdirty|=will_dirty_i&(1<<r);
7298           }
7299         }
7300         else {
7301           will_dirty_i&=~(1<<r);
7302           wont_dirty_i&=~(1<<r);
7303           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7304             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7305             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7306           } else {
7307             wont_dirty_i|=1<<r;
7308             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*//*assert(!((will_dirty>>r)&1));*/
7309           }
7310         }
7311       }
7312     }
7313   }
7314 }
7315
7316   /* disassembly */
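// Print one decoded instruction using the pass-1 tables (insn[], itype[],
// rs1/rt1/imm); a leading '*' marks instructions that are branch targets.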
7317 void disassemble_inst(int i)
7318 {
7319     if (bt[i]) printf("*"); else printf(" ");
7320     switch(itype[i]) {
7321       case UJUMP:
7322         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7323       case CJUMP:
7324         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7325       case SJUMP:
7326         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7327       case FJUMP:
7328         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7329       case RJUMP:
7330         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7331       case SPAN:
7332         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7333       case IMM16:
7334         if(opcode[i]==0xf) //LUI
7335           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7336         else
7337           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7338         break;
7339       case LOAD:
7340       case LOADLR:
7341         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7342         break;
7343       case STORE:
7344       case STORELR:
7345         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7346         break;
7347       case ALU:
7348       case SHIFT:
7349         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7350         break;
7351       case MULTDIV:
7352         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7353         break;
7354       case SHIFTIMM:
7355         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7356         break;
7357       case MOV:
7358         if((opcode2[i]&0x1d)==0x10)
7359           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7360         else if((opcode2[i]&0x1d)==0x11)
7361           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7362         else
7363           printf (" %x: %s\n",start+i*4,insn[i]);
7364         break;
7365       case COP0:
7366         if(opcode2[i]==0)
7367           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7368         else if(opcode2[i]==4)
7369           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7370         else printf (" %x: %s\n",start+i*4,insn[i]);
7371         break;
7372       case COP1:
7373         if(opcode2[i]<3)
7374           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1/DMFC1/CFC1
7375         else if(opcode2[i]>3)
7376           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1/DMTC1/CTC1
7377         else printf (" %x: %s\n",start+i*4,insn[i]);
7378         break;
7379       case C1LS:
7380         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7381         break;
7382       default:
7383         //printf (" %s %8x\n",insn[i],source[i]);
7384         printf (" %x: %s\n",start+i*4,insn[i]);
7385     }
7386 }
7387
7388 void new_dynarec_init()
7389 {
7390   printf("Init new dynarec\n");
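  // Map the translation cache: a fixed read/write/execute region of
  // 1<<TARGET_SIZE_2 bytes at BASE_ADDR; generated code is emitted at 'out'.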
7391   out=(u_char *)BASE_ADDR;
7392   if (mmap (out, 1<<TARGET_SIZE_2,
7393             PROT_READ | PROT_WRITE | PROT_EXEC,
7394             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7395             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7396 #ifdef MUPEN64
7397   rdword=&readmem_dword;
7398   fake_pc.f.r.rs=&readmem_dword;
7399   fake_pc.f.r.rt=&readmem_dword;
7400   fake_pc.f.r.rd=&readmem_dword;
7401 #endif
7402   int n;
7403   for(n=0x80000;n<0x80800;n++)
7404     invalid_code[n]=1;
7405   for(n=0;n<65536;n++)
7406     hash_table[n][0]=hash_table[n][2]=-1;
7407   memset(mini_ht,-1,sizeof(mini_ht));
7408   memset(restore_candidate,0,sizeof(restore_candidate));
7409   copy=shadow;
7410   expirep=16384; // Expiry pointer, +2 blocks
7411   pending_exception=0;
7412   literalcount=0;
7413 #ifdef HOST_IMM8
7414   // Copy this into local area so we don't have to put it in every literal pool
7415   invc_ptr=invalid_code;
7416 #endif
7417   stop_after_jal=0;
7418   // TLB
7419   using_tlb=0;
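  // memory_map[] has one entry per 4KB virtual page: -1 means unmapped,
  // otherwise (host_base - guest_page_base)>>2, so that
  // guest_addr + (memory_map[addr>>12]<<2) yields the host address.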
7420   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7421     memory_map[n]=-1;
7422   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7423     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7424   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7425     memory_map[n]=-1;
7426 #ifdef MUPEN64
7427   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7428     writemem[n] = write_nomem_new;
7429     writememb[n] = write_nomemb_new;
7430     writememh[n] = write_nomemh_new;
7431 #ifndef FORCE32
7432     writememd[n] = write_nomemd_new;
7433 #endif
7434     readmem[n] = read_nomem_new;
7435     readmemb[n] = read_nomemb_new;
7436     readmemh[n] = read_nomemh_new;
7437 #ifndef FORCE32
7438     readmemd[n] = read_nomemd_new;
7439 #endif
7440   }
7441   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7442     writemem[n] = write_rdram_new;
7443     writememb[n] = write_rdramb_new;
7444     writememh[n] = write_rdramh_new;
7445 #ifndef FORCE32
7446     writememd[n] = write_rdramd_new;
7447 #endif
7448   }
7449   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7450     writemem[n] = write_nomem_new;
7451     writememb[n] = write_nomemb_new;
7452     writememh[n] = write_nomemh_new;
7453 #ifndef FORCE32
7454     writememd[n] = write_nomemd_new;
7455 #endif
7456     readmem[n] = read_nomem_new;
7457     readmemb[n] = read_nomemb_new;
7458     readmemh[n] = read_nomemh_new;
7459 #ifndef FORCE32
7460     readmemd[n] = read_nomemd_new;
7461 #endif
7462   }
7463 #endif
7464   tlb_hacks();
7465   arch_init();
7466 }
7467
7468 void new_dynarec_cleanup()
7469 {
7470   int n;
7471   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7472   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7473   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7474   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7475   #ifdef ROM_COPY
7476   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7477   #endif
7478 }
7479
7480 int new_recompile_block(int addr)
7481 {
7482 /*
7483   if(addr==0x800cd050) {
7484     int block;
7485     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7486     int n;
7487     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7488   }
7489 */
7490   //if(Count==365117028) tracedebug=1;
7491   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7492   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7493   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7494   //if(debug) 
7495   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7496   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7497   /*if(Count>=312978186) {
7498     rlist();
7499   }*/
7500   //rlist();
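  // Locate the source instructions: SP_DMEM for boot code (MUPEN64 builds),
  // RDRAM directly for 0x80000000-0x807FFFFF, or through memory_map[] for
  // TLB-mapped addresses; pagelimit bounds how far this block may be read.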
7501   start = (u_int)addr&~3;
7502   //assert(((u_int)addr&1)==0);
7503 #ifdef MUPEN64
7504   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7505     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7506     pagelimit = 0xa4001000;
7507   }
7508   else
7509 #endif
7510   if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7511     source = (u_int *)((u_int)rdram+start-0x80000000);
7512     pagelimit = 0x80800000;
7513   }
7514   else if ((signed int)addr >= (signed int)0xC0000000) {
7515     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7516     //if(tlb_LUT_r[start>>12])
7517       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7518     if((signed int)memory_map[start>>12]>=0) {
7519       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7520       pagelimit=(start+4096)&0xFFFFF000;
7521       int map=memory_map[start>>12];
7522       int i;
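      // Allow the block to span up to five further pages whose map entry
      // matches the first page's (the 0xBFFFFFFF mask ignores a flag bit
      // in the entry), i.e. pages that are mapped contiguously.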
7523       for(i=0;i<5;i++) {
7524         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7525         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7526       }
7527       assem_debug("pagelimit=%x\n",pagelimit);
7528       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7529     }
7530     else {
7531       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7532       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7533       return 1; // Caller will invoke exception handler
7534     }
7535     //printf("source= %x\n",(int)source);
7536   }
7537   else {
7538     printf("Compile at bogus memory address: %x \n", (int)addr);
7539     exit(1);
7540   }
7541
7542   /* Pass 1: disassemble */
7543   /* Pass 2: register dependencies, branch targets */
7544   /* Pass 3: register allocation */
7545   /* Pass 4: branch dependencies */
7546   /* Pass 5: pre-alloc */
7547   /* Pass 6: optimize clean/dirty state */
7548   /* Pass 7: flag 32-bit registers */
7549   /* Pass 8: assembly */
7550   /* Pass 9: linker */
7551   /* Pass 10: garbage collection / free memory */
7552
7553   int i,j;
7554   int done=0;
7555   unsigned int type,op,op2;
7556
7557   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7558   
7559   /* Pass 1 disassembly */
7560
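  // Pass 1: decode each instruction word, classify it (itype[]/opcode[]) and
  // record its mnemonic in insn[] for the disassembler; the loop runs until
  // 'done' is set later in this pass.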
7561   for(i=0;!done;i++) {
7562     bt[i]=0;likely[i]=0;op2=0;
7563     opcode[i]=op=source[i]>>26;
7564     switch(op)
7565     {
7566       case 0x00: strcpy(insn[i],"special"); type=NI;
7567         op2=source[i]&0x3f;
7568         switch(op2)
7569         {
7570           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7571           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7572           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7573           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7574           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7575           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7576           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7577           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7578           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7579           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7580           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7581           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7582           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7583           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7584           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7585           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7586           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7587           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7588           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7589           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7590           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7591           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7592           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7593           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7594           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7595           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7596           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7597           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7598           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7599           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7600           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7601           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7602           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7603           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7604           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7605           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7606           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7607           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7608           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7609           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7610           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7611           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7612           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7613           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7614           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7615           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7616           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7617           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7618           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7619           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7620           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7621           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7622         }
7623         break;
7624       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7625         op2=(source[i]>>16)&0x1f;
7626         switch(op2)
7627         {
7628           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7629           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7630           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7631           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7632           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7633           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7634           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7635           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7636           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7637           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7638           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7639           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7640           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7641           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7642         }
7643         break;
7644       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7645       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7646       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7647       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7648       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7649       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7650       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7651       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7652       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7653       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7654       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7655       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7656       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7657       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7658       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7659         op2=(source[i]>>21)&0x1f;
7660         switch(op2)
7661         {
7662           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7663           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7664           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7665           switch(source[i]&0x3f)
7666           {
7667             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7668             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7669             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7670             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7671             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7672           }
7673         }
7674         break;
7675       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7676         op2=(source[i]>>21)&0x1f;
7677         switch(op2)
7678         {
7679           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7680           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7681           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7682           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7683           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7684           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7685           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7686           switch((source[i]>>16)&0x3)
7687           {
7688             case 0x00: strcpy(insn[i],"BC1F"); break;
7689             case 0x01: strcpy(insn[i],"BC1T"); break;
7690             case 0x02: strcpy(insn[i],"BC1FL"); break;
7691             case 0x03: strcpy(insn[i],"BC1TL"); break;
7692           }
7693           break;
7694           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7695           switch(source[i]&0x3f)
7696           {
7697             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7698             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7699             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7700             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7701             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7702             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7703             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7704             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7705             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7706             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7707             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7708             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7709             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7710             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7711             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7712             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7713             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7714             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7715             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7716             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7717             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7718             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7719             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7720             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7721             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7722             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7723             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7724             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7725             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7726             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7727             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7728             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7729             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7730             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7731             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7732           }
7733           break;
7734           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7735           switch(source[i]&0x3f)
7736           {
7737             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7738             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7739             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7740             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7741             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7742             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7743             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7744             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7745             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7746             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7747             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7748             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7749             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7750             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7751             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7752             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7753             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7754             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7755             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7756             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7757             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7758             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7759             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7760             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7761             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7762             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7763             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7764             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7765             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7766             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7767             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7768             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7769             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7770             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7771             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7772           }
7773           break;
7774           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7775           switch(source[i]&0x3f)
7776           {
7777             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7778             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7779           }
7780           break;
7781           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7782           switch(source[i]&0x3f)
7783           {
7784             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7785             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7786           }
7787           break;
7788         }
7789         break;
7790       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7791       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7792       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7793       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7794       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7795       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7796       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7797       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7798       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7799       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7800       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7801       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7802       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7803       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7804       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7805       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7806       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7807       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7808       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7809       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7810       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7811       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7812       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7813       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7814       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7815       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7816       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7817       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7818       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7819       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7820       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7821       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7822       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7823       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7824       default: strcpy(insn[i],"???"); type=NI; break;
7825     }
7826     itype[i]=type;
7827     opcode2[i]=op2;
7828     /* Get registers/immediates */
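         /* A note on the operand arrays filled in below (inferred from how they
            are used later): rs1/rs2 hold source GPR numbers and rt1/rt2 the
            destinations for this instruction; us1/us2 flag sources whose full
            64-bit value is required, and dep1/dep2 record sources whose upper
            half the result's upper half depends on (32/64-bit tracking). */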
7829     lt1[i]=0;
7830     us1[i]=0;
7831     us2[i]=0;
7832     dep1[i]=0;
7833     dep2[i]=0;
7834     switch(type) {
7835       case LOAD:
7836         rs1[i]=(source[i]>>21)&0x1f;
7837         rs2[i]=0;
7838         rt1[i]=(source[i]>>16)&0x1f;
7839         rt2[i]=0;
7840         imm[i]=(short)source[i];
7841         break;
7842       case STORE:
7843       case STORELR:
7844         rs1[i]=(source[i]>>21)&0x1f;
7845         rs2[i]=(source[i]>>16)&0x1f;
7846         rt1[i]=0;
7847         rt2[i]=0;
7848         imm[i]=(short)source[i];
7849         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7850         break;
7851       case LOADLR:
7852         // LWL/LWR only load part of the register,
7853         // therefore the target register must be treated as a source too
7854         rs1[i]=(source[i]>>21)&0x1f;
7855         rs2[i]=(source[i]>>16)&0x1f;
7856         rt1[i]=(source[i]>>16)&0x1f;
7857         rt2[i]=0;
7858         imm[i]=(short)source[i];
7859         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7860         if(op==0x26) dep1[i]=rt1[i]; // LWR
7861         break;
7862       case IMM16:
7863         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7864         else rs1[i]=(source[i]>>21)&0x1f;
7865         rs2[i]=0;
7866         rt1[i]=(source[i]>>16)&0x1f;
7867         rt2[i]=0;
7868         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7869           imm[i]=(unsigned short)source[i];
7870         }else{
7871           imm[i]=(short)source[i];
7872         }
7873         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7874         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7875         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7876         break;
7877       case UJUMP:
7878         rs1[i]=0;
7879         rs2[i]=0;
7880         rt1[i]=0;
7881         rt2[i]=0;
7882         // The JAL instruction writes to r31.
7883         if (op&1) {
7884           rt1[i]=31;
7885         }
7886         rs2[i]=CCREG;
7887         break;
7888       case RJUMP:
7889         rs1[i]=(source[i]>>21)&0x1f;
7890         rs2[i]=0;
7891         rt1[i]=0;
7892         rt2[i]=0;
7893         // JALR writes the return address to rd, which is almost always r31; assume r31 here.
7894         if (op2&1) {
7895           rt1[i]=31;   
7896         }
7897         rs2[i]=CCREG;
7898         break;
7899       case CJUMP:
7900         rs1[i]=(source[i]>>21)&0x1f;
7901         rs2[i]=(source[i]>>16)&0x1f;
7902         rt1[i]=0;
7903         rt2[i]=0;
7904         if(op&2) { // BGTZ/BLEZ
7905           rs2[i]=0;
7906         }
7907         us1[i]=rs1[i];
7908         us2[i]=rs2[i];
7909         likely[i]=op>>4;
7910         break;
7911       case SJUMP:
7912         rs1[i]=(source[i]>>21)&0x1f;
7913         rs2[i]=CCREG;
7914         rt1[i]=0;
7915         rt2[i]=0;
7916         us1[i]=rs1[i];
7917         if(op2&0x10) { // BxxAL
7918           rt1[i]=31;
7919           // NOTE: If the branch is not taken, r31 is still overwritten
7920         }
7921         likely[i]=(op2&2)>>1;
7922         break;
7923       case FJUMP:
7924         rs1[i]=FSREG;
7925         rs2[i]=CSREG;
7926         rt1[i]=0;
7927         rt2[i]=0;
7928         likely[i]=((source[i])>>17)&1;
7929         break;
7930       case ALU:
7931         rs1[i]=(source[i]>>21)&0x1f; // source
7932         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7933         rt1[i]=(source[i]>>11)&0x1f; // destination
7934         rt2[i]=0;
7935         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7936           us1[i]=rs1[i];us2[i]=rs2[i];
7937         }
7938         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7939           dep1[i]=rs1[i];dep2[i]=rs2[i];
7940         }
7941         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7942           dep1[i]=rs1[i];dep2[i]=rs2[i];
7943         }
7944         break;
7945       case MULTDIV:
7946         rs1[i]=(source[i]>>21)&0x1f; // source
7947         rs2[i]=(source[i]>>16)&0x1f; // divisor
7948         rt1[i]=HIREG;
7949         rt2[i]=LOREG;
7950         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7951           us1[i]=rs1[i];us2[i]=rs2[i];
7952         }
7953         break;
7954       case MOV:
7955         rs1[i]=0;
7956         rs2[i]=0;
7957         rt1[i]=0;
7958         rt2[i]=0;
7959         if(op2==0x10) rs1[i]=HIREG; // MFHI
7960         if(op2==0x11) rt1[i]=HIREG; // MTHI
7961         if(op2==0x12) rs1[i]=LOREG; // MFLO
7962         if(op2==0x13) rt1[i]=LOREG; // MTLO
7963         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7964         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7965         dep1[i]=rs1[i];
7966         break;
7967       case SHIFT:
7968         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7969         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7970         rt1[i]=(source[i]>>11)&0x1f; // destination
7971         rt2[i]=0;
7972         // DSLLV/DSRLV/DSRAV are 64-bit
7973         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7974         break;
7975       case SHIFTIMM:
7976         rs1[i]=(source[i]>>16)&0x1f;
7977         rs2[i]=0;
7978         rt1[i]=(source[i]>>11)&0x1f;
7979         rt2[i]=0;
7980         imm[i]=(source[i]>>6)&0x1f;
7981         // DSxx32 instructions
7982         if(op2>=0x3c) imm[i]|=0x20;
7983         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7984         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7985         break;
7986       case COP0:
7987         rs1[i]=0;
7988         rs2[i]=0;
7989         rt1[i]=0;
7990         rt2[i]=0;
7991         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7992         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7993         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7994         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7995         break;
7996       case COP1:
7997         rs1[i]=0;
7998         rs2[i]=0;
7999         rt1[i]=0;
8000         rt2[i]=0;
8001         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8002         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8003         if(op2==5) us1[i]=rs1[i]; // DMTC1
8004         rs2[i]=CSREG;
8005         break;
8006       case C1LS:
8007         rs1[i]=(source[i]>>21)&0x1F;
8008         rs2[i]=CSREG;
8009         rt1[i]=0;
8010         rt2[i]=0;
8011         imm[i]=(short)source[i];
8012         break;
8013       case FLOAT:
8014       case FCONV:
8015         rs1[i]=0;
8016         rs2[i]=CSREG;
8017         rt1[i]=0;
8018         rt2[i]=0;
8019         break;
8020       case FCOMP:
8021         rs1[i]=FSREG;
8022         rs2[i]=CSREG;
8023         rt1[i]=FSREG;
8024         rt2[i]=0;
8025         break;
8026       case SYSCALL:
8027         rs1[i]=CCREG;
8028         rs2[i]=0;
8029         rt1[i]=0;
8030         rt2[i]=0;
8031         break;
8032       default:
8033         rs1[i]=0;
8034         rs2[i]=0;
8035         rt1[i]=0;
8036         rt2[i]=0;
8037     }
8038     /* Calculate branch target addresses */
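         /* J/JAL keep the top 4 bits of PC+4 and take the low 28 bits from the
            26-bit instruction index shifted left by 2 (e.g. a J to 0x80010000
            from 0x8000a000 gives 0x80000000|0x00010000); conditional branches
            sign-extend the 16-bit offset, shift it left by 2 and add it to
            PC+4.  Non-branches get ba[i]=-1. */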
8039     if(type==UJUMP)
8040       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8041     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8042       ba[i]=start+i*4+8; // Ignore never taken branch
8043     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8044       ba[i]=start+i*4+8; // Ignore never taken branch
8045     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8046       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8047     else ba[i]=-1;
8048     /* Is this the end of the block? */
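         /* A block normally ends with the delay slot of an unconditional jump;
            (source[i-1]>>16)==0x1000 matches "beq $zero,$zero", an
            unconditional branch encoded as BEQ. */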
8049     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8050       if(rt1[i-1]!=31) { // Not a subroutine call; blocks continue past JAL/JALR since they return here
8051         done=1;
8052         // Does the block continue due to a branch?
8053         for(j=i-1;j>=0;j--)
8054         {
8055           if(ba[j]==start+i*4+4) done=j=0;
8056           if(ba[j]==start+i*4+8) done=j=0;
8057         }
8058       }
8059       else {
8060         if(stop_after_jal) done=1;
8061         // Stop on BREAK
8062         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8063       }
8064       // Don't recompile stuff that's already compiled
8065       if(check_addr(start+i*4+4)) done=1;
8066       // Don't get too close to the limit
8067       if(i>MAXBLOCK/2) done=1;
8068     }
8069     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8070     assert(i<MAXBLOCK-1);
8071     if(start+i*4==pagelimit-4) done=1;
8072     assert(start+i*4<pagelimit);
8073     if (i==MAXBLOCK-1) done=1;
8074     // Stop if we're compiling junk
8075     if(itype[i]==NI&&opcode[i]==0x11) {
8076       done=stop_after_jal=1;
8077       printf("Disabled speculative precompilation\n");
8078     }
8079   }
8080   slen=i;
8081   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8082     if(start+i*4==pagelimit) {
8083       itype[i-1]=SPAN;
8084     }
8085   }
8086   assert(slen>0);
8087
8088   /* Pass 2 - Register dependencies and branch targets */
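       /* unneeded_registers() marks, for every instruction, the guest registers
          (and their upper halves) whose values are dead from that point on;
          pass 3 uses this to avoid allocating or preserving them. */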
8089
8090   unneeded_registers(0,slen-1,0);
8091   
8092   /* Pass 3 - Register allocation */
8093
8094   struct regstat current; // Current register allocations/status
8095   current.is32=1;
8096   current.dirty=0;
8097   current.u=unneeded_reg[0];
8098   current.uu=unneeded_reg_upper[0];
8099   clear_all_regs(current.regmap);
8100   alloc_reg(&current,0,CCREG);
8101   dirty_reg(&current,CCREG);
8102   current.isconst=0;
8103   current.wasconst=0;
8104   int ds=0;
8105   int cc=0;
8106   int hr;
8107   
8108   provisional_32bit();
8109   
8110   if((u_int)addr&1) {
8111     // First instruction is delay slot
8112     cc=-1;
8113     bt[1]=1;
8114     ds=1;
8115     unneeded_reg[0]=1;
8116     unneeded_reg_upper[0]=1;
8117     current.regmap[HOST_BTREG]=BTREG;
8118   }
8119   
8120   for(i=0;i<slen;i++)
8121   {
8122     if(bt[i])
8123     {
8124       int hr;
8125       for(hr=0;hr<HOST_REGS;hr++)
8126       {
8127         // Is this really necessary?
8128         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8129       }
8130       current.isconst=0;
8131     }
8132     if(i>1)
8133     {
8134       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8135       {
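             /* Falling through a BNE/BNEL against $zero two instructions ago
                implies the compared register held zero, so it can be treated
                as 32-bit and its upper-half host register released. */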
8136         if(rs1[i-2]==0||rs2[i-2]==0)
8137         {
8138           if(rs1[i-2]) {
8139             current.is32|=1LL<<rs1[i-2];
8140             int hr=get_reg(current.regmap,rs1[i-2]|64);
8141             if(hr>=0) current.regmap[hr]=-1;
8142           }
8143           if(rs2[i-2]) {
8144             current.is32|=1LL<<rs2[i-2];
8145             int hr=get_reg(current.regmap,rs2[i-2]|64);
8146             if(hr>=0) current.regmap[hr]=-1;
8147           }
8148         }
8149       }
8150     }
8151     // If something jumps here with 64-bit values
8152     // then promote those registers to 64 bits
8153     if(bt[i])
8154     {
8155       uint64_t temp_is32=current.is32;
8156       for(j=i-1;j>=0;j--)
8157       {
8158         if(ba[j]==start+i*4) 
8159           temp_is32&=branch_regs[j].is32;
8160       }
8161       for(j=i;j<slen;j++)
8162       {
8163         if(ba[j]==start+i*4) 
8164           //temp_is32=1;
8165           temp_is32&=p32[j];
8166       }
8167       if(temp_is32!=current.is32) {
8168         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8169         #ifdef DESTRUCTIVE_WRITEBACK
8170         for(hr=0;hr<HOST_REGS;hr++)
8171         {
8172           int r=current.regmap[hr];
8173           if(r>0&&r<64)
8174           {
8175             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8176               temp_is32|=1LL<<r;
8177               //printf("restore %d\n",r);
8178             }
8179           }
8180         }
8181         #endif
8182         current.is32=temp_is32;
8183       }
8184     }
8185 #ifdef FORCE32
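         /* The PSX's R3000A has no 64-bit GPRs, so all registers are simply
            forced to 32-bit, making the width tracking above a no-op. */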
8186     memset(p32, 0xff, sizeof(p32));
8187     current.is32=-1LL;
8188 #endif
8189
8190     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8191     regs[i].wasconst=current.isconst;
8192     regs[i].was32=current.is32;
8193     regs[i].wasdirty=current.dirty;
8194     #ifdef DESTRUCTIVE_WRITEBACK
8195     // To change a dirty register from 32 to 64 bits, we must write
8196     // it out during the previous cycle (for branches, 2 cycles)
8197     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8198     {
8199       uint64_t temp_is32=current.is32;
8200       for(j=i-1;j>=0;j--)
8201       {
8202         if(ba[j]==start+i*4+4) 
8203           temp_is32&=branch_regs[j].is32;
8204       }
8205       for(j=i;j<slen;j++)
8206       {
8207         if(ba[j]==start+i*4+4) 
8208           //temp_is32=1;
8209           temp_is32&=p32[j];
8210       }
8211       if(temp_is32!=current.is32) {
8212         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8213         for(hr=0;hr<HOST_REGS;hr++)
8214         {
8215           int r=current.regmap[hr];
8216           if(r>0)
8217           {
8218             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8219               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8220               {
8221                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8222                 {
8223                   //printf("dump %d/r%d\n",hr,r);
8224                   current.regmap[hr]=-1;
8225                   if(get_reg(current.regmap,r|64)>=0) 
8226                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8227                 }
8228               }
8229             }
8230           }
8231         }
8232       }
8233     }
8234     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8235     {
8236       uint64_t temp_is32=current.is32;
8237       for(j=i-1;j>=0;j--)
8238       {
8239         if(ba[j]==start+i*4+8) 
8240           temp_is32&=branch_regs[j].is32;
8241       }
8242       for(j=i;j<slen;j++)
8243       {
8244         if(ba[j]==start+i*4+8) 
8245           //temp_is32=1;
8246           temp_is32&=p32[j];
8247       }
8248       if(temp_is32!=current.is32) {
8249         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8250         for(hr=0;hr<HOST_REGS;hr++)
8251         {
8252           int r=current.regmap[hr];
8253           if(r>0)
8254           {
8255             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8256               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8257               {
8258                 //printf("dump %d/r%d\n",hr,r);
8259                 current.regmap[hr]=-1;
8260                 if(get_reg(current.regmap,r|64)>=0) 
8261                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8262               }
8263             }
8264           }
8265         }
8266       }
8267     }
8268     #endif
8269     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8270       if(i+1<slen) {
8271         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8272         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8273         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8274         current.u|=1;
8275         current.uu|=1;
8276       } else {
8277         current.u=1;
8278         current.uu=1;
8279       }
8280     } else {
8281       if(i+1<slen) {
8282         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8283         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8284         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8285         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8286         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8287         current.u|=1;
8288         current.uu|=1;
8289       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8290     }
8291     is_ds[i]=ds;
8292     if(ds) {
8293       ds=0; // Skip delay slot, already allocated as part of branch
8294       // ...but we need to alloc it in case something jumps here
8295       if(i+1<slen) {
8296         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8297         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8298       }else{
8299         current.u=branch_unneeded_reg[i-1];
8300         current.uu=branch_unneeded_reg_upper[i-1];
8301       }
8302       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8303       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8304       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8305       current.u|=1;
8306       current.uu|=1;
8307       struct regstat temp;
8308       memcpy(&temp,&current,sizeof(current));
8309       temp.wasdirty=temp.dirty;
8310       temp.was32=temp.is32;
8311       // TODO: Take into account unconditional branches, as below
8312       delayslot_alloc(&temp,i);
8313       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8314       regs[i].wasdirty=temp.wasdirty;
8315       regs[i].was32=temp.was32;
8316       regs[i].dirty=temp.dirty;
8317       regs[i].is32=temp.is32;
8318       regs[i].isconst=0;
8319       regs[i].wasconst=0;
8320       current.isconst=0;
8321       // Create entry (branch target) regmap
8322       for(hr=0;hr<HOST_REGS;hr++)
8323       {
8324         int r=temp.regmap[hr];
8325         if(r>=0) {
8326           if(r!=regmap_pre[i][hr]) {
8327             regs[i].regmap_entry[hr]=-1;
8328           }
8329           else
8330           {
8331             if(r<64){
8332               if((current.u>>r)&1) {
8333                 regs[i].regmap_entry[hr]=-1;
8334                 regs[i].regmap[hr]=-1;
8335                 //Don't clear regs in the delay slot as the branch might need them
8336                 //current.regmap[hr]=-1;
8337               }else
8338                 regs[i].regmap_entry[hr]=r;
8339             }
8340             else {
8341               if((current.uu>>(r&63))&1) {
8342                 regs[i].regmap_entry[hr]=-1;
8343                 regs[i].regmap[hr]=-1;
8344                 //Don't clear regs in the delay slot as the branch might need them
8345                 //current.regmap[hr]=-1;
8346               }else
8347                 regs[i].regmap_entry[hr]=r;
8348             }
8349           }
8350         } else {
8351           // First instruction expects CCREG to be allocated
8352           if(i==0&&hr==HOST_CCREG) 
8353             regs[i].regmap_entry[hr]=CCREG;
8354           else
8355             regs[i].regmap_entry[hr]=-1;
8356         }
8357       }
8358     }
8359     else { // Not delay slot
8360       switch(itype[i]) {
8361         case UJUMP:
8362           //current.isconst=0; // DEBUG
8363           //current.wasconst=0; // DEBUG
8364           //regs[i].wasconst=0; // DEBUG
8365           clear_const(&current,rt1[i]);
8366           alloc_cc(&current,i);
8367           dirty_reg(&current,CCREG);
8368           if (rt1[i]==31) {
8369             alloc_reg(&current,i,31);
8370             dirty_reg(&current,31);
8371             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8372             #ifdef REG_PREFETCH
8373             alloc_reg(&current,i,PTEMP);
8374             #endif
8375             //current.is32|=1LL<<rt1[i];
8376           }
8377           delayslot_alloc(&current,i+1);
8378           //current.isconst=0; // DEBUG
8379           ds=1;
8380           //printf("i=%d, isconst=%x\n",i,current.isconst);
8381           break;
8382         case RJUMP:
8383           //current.isconst=0;
8384           //current.wasconst=0;
8385           //regs[i].wasconst=0;
8386           clear_const(&current,rs1[i]);
8387           clear_const(&current,rt1[i]);
8388           alloc_cc(&current,i);
8389           dirty_reg(&current,CCREG);
8390           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8391             alloc_reg(&current,i,rs1[i]);
8392             if (rt1[i]==31) {
8393               alloc_reg(&current,i,31);
8394               dirty_reg(&current,31);
8395               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8396               #ifdef REG_PREFETCH
8397               alloc_reg(&current,i,PTEMP);
8398               #endif
8399             }
8400             #ifdef USE_MINI_HT
8401             if(rs1[i]==31) { // JALR
8402               alloc_reg(&current,i,RHASH);
8403               #ifndef HOST_IMM_ADDR32
8404               alloc_reg(&current,i,RHTBL);
8405               #endif
8406             }
8407             #endif
8408             delayslot_alloc(&current,i+1);
8409           } else {
8410             // The delay slot overwrites our source register,
8411             // allocate a temporary register to hold the old value.
8412             current.isconst=0;
8413             current.wasconst=0;
8414             regs[i].wasconst=0;
8415             delayslot_alloc(&current,i+1);
8416             current.isconst=0;
8417             alloc_reg(&current,i,RTEMP);
8418           }
8419           //current.isconst=0; // DEBUG
8420           ds=1;
8421           break;
8422         case CJUMP:
8423           //current.isconst=0;
8424           //current.wasconst=0;
8425           //regs[i].wasconst=0;
8426           clear_const(&current,rs1[i]);
8427           clear_const(&current,rs2[i]);
8428           if((opcode[i]&0x3E)==4) // BEQ/BNE
8429           {
8430             alloc_cc(&current,i);
8431             dirty_reg(&current,CCREG);
8432             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8433             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8434             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8435             {
8436               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8437               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8438             }
8439             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8440                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8441               // The delay slot overwrites one of our conditions.
8442               // Allocate the branch condition registers instead.
8443               // Note that such a sequence of instructions could
8444             // be considered a bug since the branch cannot be
8445               // re-executed if an exception occurs.
8446               current.isconst=0;
8447               current.wasconst=0;
8448               regs[i].wasconst=0;
8449               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8450               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8451               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8452               {
8453                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8454                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8455               }
8456             }
8457             else delayslot_alloc(&current,i+1);
8458           }
8459           else
8460           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8461           {
8462             alloc_cc(&current,i);
8463             dirty_reg(&current,CCREG);
8464             alloc_reg(&current,i,rs1[i]);
8465             if(!(current.is32>>rs1[i]&1))
8466             {
8467               alloc_reg64(&current,i,rs1[i]);
8468             }
8469             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8470               // The delay slot overwrites one of our conditions.
8471               // Allocate the branch condition registers instead.
8472               // Note that such a sequence of instructions could
8473             // be considered a bug since the branch cannot be
8474               // re-executed if an exception occurs.
8475               current.isconst=0;
8476               current.wasconst=0;
8477               regs[i].wasconst=0;
8478               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8479               if(!((current.is32>>rs1[i])&1))
8480               {
8481                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8482               }
8483             }
8484             else delayslot_alloc(&current,i+1);
8485           }
8486           else
8487           // Don't alloc the delay slot yet because we might not execute it
8488           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8489           {
8490             current.isconst=0;
8491             current.wasconst=0;
8492             regs[i].wasconst=0;
8493             alloc_cc(&current,i);
8494             dirty_reg(&current,CCREG);
8495             alloc_reg(&current,i,rs1[i]);
8496             alloc_reg(&current,i,rs2[i]);
8497             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8498             {
8499               alloc_reg64(&current,i,rs1[i]);
8500               alloc_reg64(&current,i,rs2[i]);
8501             }
8502           }
8503           else
8504           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8505           {
8506             current.isconst=0;
8507             current.wasconst=0;
8508             regs[i].wasconst=0;
8509             alloc_cc(&current,i);
8510             dirty_reg(&current,CCREG);
8511             alloc_reg(&current,i,rs1[i]);
8512             if(!(current.is32>>rs1[i]&1))
8513             {
8514               alloc_reg64(&current,i,rs1[i]);
8515             }
8516           }
8517           ds=1;
8518           //current.isconst=0;
8519           break;
8520         case SJUMP:
8521           //current.isconst=0;
8522           //current.wasconst=0;
8523           //regs[i].wasconst=0;
8524           clear_const(&current,rs1[i]);
8525           clear_const(&current,rt1[i]);
8526           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8527           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8528           {
8529             alloc_cc(&current,i);
8530             dirty_reg(&current,CCREG);
8531             alloc_reg(&current,i,rs1[i]);
8532             if(!(current.is32>>rs1[i]&1))
8533             {
8534               alloc_reg64(&current,i,rs1[i]);
8535             }
8536             if (rt1[i]==31) { // BLTZAL/BGEZAL
8537               alloc_reg(&current,i,31);
8538               dirty_reg(&current,31);
8539               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8540               //#ifdef REG_PREFETCH
8541               //alloc_reg(&current,i,PTEMP);
8542               //#endif
8543               //current.is32|=1LL<<rt1[i];
8544             }
8545             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8546               // The delay slot overwrites the branch condition.
8547               // Allocate the branch condition registers instead.
8548               // Note that such a sequence of instructions could
8549             // be considered a bug since the branch cannot be
8550               // re-executed if an exception occurs.
8551               current.isconst=0;
8552               current.wasconst=0;
8553               regs[i].wasconst=0;
8554               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8555               if(!((current.is32>>rs1[i])&1))
8556               {
8557                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8558               }
8559             }
8560             else delayslot_alloc(&current,i+1);
8561           }
8562           else
8563           // Don't alloc the delay slot yet because we might not execute it
8564           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8565           {
8566             current.isconst=0;
8567             current.wasconst=0;
8568             regs[i].wasconst=0;
8569             alloc_cc(&current,i);
8570             dirty_reg(&current,CCREG);
8571             alloc_reg(&current,i,rs1[i]);
8572             if(!(current.is32>>rs1[i]&1))
8573             {
8574               alloc_reg64(&current,i,rs1[i]);
8575             }
8576           }
8577           ds=1;
8578           //current.isconst=0;
8579           break;
8580         case FJUMP:
8581           current.isconst=0;
8582           current.wasconst=0;
8583           regs[i].wasconst=0;
8584           if(likely[i]==0) // BC1F/BC1T
8585           {
8586             // TODO: Theoretically we can run out of registers here on x86.
8587             // The delay slot can allocate up to six, and we need to check
8588             // CSREG before executing the delay slot.  Possibly we can drop
8589             // the cycle count and then reload it after checking that the
8590             // FPU is in a usable state, or don't do out-of-order execution.
8591             alloc_cc(&current,i);
8592             dirty_reg(&current,CCREG);
8593             alloc_reg(&current,i,FSREG);
8594             alloc_reg(&current,i,CSREG);
8595             if(itype[i+1]==FCOMP) {
8596               // The delay slot overwrites the branch condition.
8597               // Allocate the branch condition registers instead.
8598               // Note that such a sequence of instructions could
8599             // be considered a bug since the branch cannot be
8600               // re-executed if an exception occurs.
8601               alloc_cc(&current,i);
8602               dirty_reg(&current,CCREG);
8603               alloc_reg(&current,i,CSREG);
8604               alloc_reg(&current,i,FSREG);
8605             }
8606             else {
8607               delayslot_alloc(&current,i+1);
8608               alloc_reg(&current,i+1,CSREG);
8609             }
8610           }
8611           else
8612           // Don't alloc the delay slot yet because we might not execute it
8613           if(likely[i]) // BC1FL/BC1TL
8614           {
8615             alloc_cc(&current,i);
8616             dirty_reg(&current,CCREG);
8617             alloc_reg(&current,i,CSREG);
8618             alloc_reg(&current,i,FSREG);
8619           }
8620           ds=1;
8621           current.isconst=0;
8622           break;
8623         case IMM16:
8624           imm16_alloc(&current,i);
8625           break;
8626         case LOAD:
8627         case LOADLR:
8628           load_alloc(&current,i);
8629           break;
8630         case STORE:
8631         case STORELR:
8632           store_alloc(&current,i);
8633           break;
8634         case ALU:
8635           alu_alloc(&current,i);
8636           break;
8637         case SHIFT:
8638           shift_alloc(&current,i);
8639           break;
8640         case MULTDIV:
8641           multdiv_alloc(&current,i);
8642           break;
8643         case SHIFTIMM:
8644           shiftimm_alloc(&current,i);
8645           break;
8646         case MOV:
8647           mov_alloc(&current,i);
8648           break;
8649         case COP0:
8650           cop0_alloc(&current,i);
8651           break;
8652         case COP1:
8653           cop1_alloc(&current,i);
8654           break;
8655         case C1LS:
8656           c1ls_alloc(&current,i);
8657           break;
8658         case FCONV:
8659           fconv_alloc(&current,i);
8660           break;
8661         case FLOAT:
8662           float_alloc(&current,i);
8663           break;
8664         case FCOMP:
8665           fcomp_alloc(&current,i);
8666           break;
8667         case SYSCALL:
8668           syscall_alloc(&current,i);
8669           break;
8670         case SPAN:
8671           pagespan_alloc(&current,i);
8672           break;
8673       }
8674       
8675       // Drop the upper half of registers that have become 32-bit
8676       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8677       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8678         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8679         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8680         current.uu|=1;
8681       } else {
8682         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8683         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8684         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8685         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8686         current.uu|=1;
8687       }
8688
8689       // Create entry (branch target) regmap
8690       for(hr=0;hr<HOST_REGS;hr++)
8691       {
8692         int r,or,er;
8693         r=current.regmap[hr];
8694         if(r>=0) {
8695           if(r!=regmap_pre[i][hr]) {
8696             // TODO: delay slot (?)
8697             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8698             if(or<0||(r&63)>=TEMPREG){
8699               regs[i].regmap_entry[hr]=-1;
8700             }
8701             else
8702             {
8703               // Just move it to a different register
8704               regs[i].regmap_entry[hr]=r;
8705               // If it was dirty before, it's still dirty
8706               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8707             }
8708           }
8709           else
8710           {
8711             // Unneeded
8712             if(r==0){
8713               regs[i].regmap_entry[hr]=0;
8714             }
8715             else
8716             if(r<64){
8717               if((current.u>>r)&1) {
8718                 regs[i].regmap_entry[hr]=-1;
8719                 //regs[i].regmap[hr]=-1;
8720                 current.regmap[hr]=-1;
8721               }else
8722                 regs[i].regmap_entry[hr]=r;
8723             }
8724             else {
8725               if((current.uu>>(r&63))&1) {
8726                 regs[i].regmap_entry[hr]=-1;
8727                 //regs[i].regmap[hr]=-1;
8728                 current.regmap[hr]=-1;
8729               }else
8730                 regs[i].regmap_entry[hr]=r;
8731             }
8732           }
8733         } else {
8734           // Branches expect CCREG to be allocated at the target
8735           if(regmap_pre[i][hr]==CCREG) 
8736             regs[i].regmap_entry[hr]=CCREG;
8737           else
8738             regs[i].regmap_entry[hr]=-1;
8739         }
8740       }
8741       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8742     }
8743     /* Branch post-alloc */
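         /* branch_regs[i-1] holds the register state used on the taken path
            of the branch at i-1 (after its delay slot has executed); internal
            branch targets are matched against it later. */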
8744     if(i>0)
8745     {
8746       current.was32=current.is32;
8747       current.wasdirty=current.dirty;
8748       switch(itype[i-1]) {
8749         case UJUMP:
8750           memcpy(&branch_regs[i-1],&current,sizeof(current));
8751           branch_regs[i-1].isconst=0;
8752           branch_regs[i-1].wasconst=0;
8753           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8754           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8755           alloc_cc(&branch_regs[i-1],i-1);
8756           dirty_reg(&branch_regs[i-1],CCREG);
8757           if(rt1[i-1]==31) { // JAL
8758             alloc_reg(&branch_regs[i-1],i-1,31);
8759             dirty_reg(&branch_regs[i-1],31);
8760             branch_regs[i-1].is32|=1LL<<31;
8761           }
8762           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8763           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8764           break;
8765         case RJUMP:
8766           memcpy(&branch_regs[i-1],&current,sizeof(current));
8767           branch_regs[i-1].isconst=0;
8768           branch_regs[i-1].wasconst=0;
8769           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8770           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8771           alloc_cc(&branch_regs[i-1],i-1);
8772           dirty_reg(&branch_regs[i-1],CCREG);
8773           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8774           if(rt1[i-1]==31) { // JALR
8775             alloc_reg(&branch_regs[i-1],i-1,31);
8776             dirty_reg(&branch_regs[i-1],31);
8777             branch_regs[i-1].is32|=1LL<<31;
8778           }
8779           #ifdef USE_MINI_HT
8780           if(rs1[i-1]==31) { // JALR
8781             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8782             #ifndef HOST_IMM_ADDR32
8783             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8784             #endif
8785           }
8786           #endif
8787           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8788           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8789           break;
8790         case CJUMP:
8791           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8792           {
8793             alloc_cc(&current,i-1);
8794             dirty_reg(&current,CCREG);
8795             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8796                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8797               // The delay slot overwrote one of our conditions
8798               // Delay slot goes after the test (in order)
8799               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8800               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8801               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8802               current.u|=1;
8803               current.uu|=1;
8804               delayslot_alloc(&current,i);
8805               current.isconst=0;
8806             }
8807             else
8808             {
8809               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8810               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8811               // Alloc the branch condition registers
8812               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8813               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8814               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8815               {
8816                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8817                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8818               }
8819             }
8820             memcpy(&branch_regs[i-1],&current,sizeof(current));
8821             branch_regs[i-1].isconst=0;
8822             branch_regs[i-1].wasconst=0;
8823             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8824             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8825           }
8826           else
8827           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8828           {
8829             alloc_cc(&current,i-1);
8830             dirty_reg(&current,CCREG);
8831             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8832               // The delay slot overwrote the branch condition
8833               // Delay slot goes after the test (in order)
8834               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8835               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8836               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8837               current.u|=1;
8838               current.uu|=1;
8839               delayslot_alloc(&current,i);
8840               current.isconst=0;
8841             }
8842             else
8843             {
8844               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8845               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8846               // Alloc the branch condition register
8847               alloc_reg(&current,i-1,rs1[i-1]);
8848               if(!(current.is32>>rs1[i-1]&1))
8849               {
8850                 alloc_reg64(&current,i-1,rs1[i-1]);
8851               }
8852             }
8853             memcpy(&branch_regs[i-1],&current,sizeof(current));
8854             branch_regs[i-1].isconst=0;
8855             branch_regs[i-1].wasconst=0;
8856             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8857             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8858           }
8859           else
8860           // Alloc the delay slot in case the branch is taken
8861           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8862           {
8863             memcpy(&branch_regs[i-1],&current,sizeof(current));
8864             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8865             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8866             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8867             alloc_cc(&branch_regs[i-1],i);
8868             dirty_reg(&branch_regs[i-1],CCREG);
8869             delayslot_alloc(&branch_regs[i-1],i);
8870             branch_regs[i-1].isconst=0;
8871             alloc_reg(&current,i,CCREG); // Not taken path
8872             dirty_reg(&current,CCREG);
8873             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8874           }
8875           else
8876           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8877           {
8878             memcpy(&branch_regs[i-1],&current,sizeof(current));
8879             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8880             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8881             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8882             alloc_cc(&branch_regs[i-1],i);
8883             dirty_reg(&branch_regs[i-1],CCREG);
8884             delayslot_alloc(&branch_regs[i-1],i);
8885             branch_regs[i-1].isconst=0;
8886             alloc_reg(&current,i,CCREG); // Not taken path
8887             dirty_reg(&current,CCREG);
8888             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8889           }
8890           break;
8891         case SJUMP:
8892           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8893           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8894           {
8895             alloc_cc(&current,i-1);
8896             dirty_reg(&current,CCREG);
8897             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8898               // The delay slot overwrote the branch condition
8899               // Delay slot goes after the test (in order)
8900               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8901               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8902               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8903               current.u|=1;
8904               current.uu|=1;
8905               delayslot_alloc(&current,i);
8906               current.isconst=0;
8907             }
8908             else
8909             {
8910               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8911               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8912               // Alloc the branch condition register
8913               alloc_reg(&current,i-1,rs1[i-1]);
8914               if(!(current.is32>>rs1[i-1]&1))
8915               {
8916                 alloc_reg64(&current,i-1,rs1[i-1]);
8917               }
8918             }
8919             memcpy(&branch_regs[i-1],&current,sizeof(current));
8920             branch_regs[i-1].isconst=0;
8921             branch_regs[i-1].wasconst=0;
8922             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8923             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8924           }
8925           else
8926           // Alloc the delay slot in case the branch is taken
8927           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8928           {
8929             memcpy(&branch_regs[i-1],&current,sizeof(current));
8930             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8931             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8932             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8933             alloc_cc(&branch_regs[i-1],i);
8934             dirty_reg(&branch_regs[i-1],CCREG);
8935             delayslot_alloc(&branch_regs[i-1],i);
8936             branch_regs[i-1].isconst=0;
8937             alloc_reg(&current,i,CCREG); // Not taken path
8938             dirty_reg(&current,CCREG);
8939             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8940           }
8941           // FIXME: BLTZAL/BGEZAL
8942           if(opcode2[i-1]&0x10) { // BxxZAL
8943             alloc_reg(&branch_regs[i-1],i-1,31);
8944             dirty_reg(&branch_regs[i-1],31);
8945             branch_regs[i-1].is32|=1LL<<31;
8946           }
8947           break;
8948         case FJUMP:
8949           if(likely[i-1]==0) // BC1F/BC1T
8950           {
8951             alloc_cc(&current,i-1);
8952             dirty_reg(&current,CCREG);
8953             if(itype[i]==FCOMP) {
8954               // The delay slot overwrote the branch condition
8955               // Delay slot goes after the test (in order)
8956               delayslot_alloc(&current,i);
8957               current.isconst=0;
8958             }
8959             else
8960             {
8961               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8962               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8963               // Alloc the branch condition register
8964               alloc_reg(&current,i-1,FSREG);
8965             }
8966             memcpy(&branch_regs[i-1],&current,sizeof(current));
8967             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8968           }
8969           else // BC1FL/BC1TL
8970           {
8971             // Alloc the delay slot in case the branch is taken
8972             memcpy(&branch_regs[i-1],&current,sizeof(current));
8973             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8974             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8975             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8976             alloc_cc(&branch_regs[i-1],i);
8977             dirty_reg(&branch_regs[i-1],CCREG);
8978             delayslot_alloc(&branch_regs[i-1],i);
8979             branch_regs[i-1].isconst=0;
8980             alloc_reg(&current,i,CCREG); // Not taken path
8981             dirty_reg(&current,CCREG);
8982             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8983           }
8984           break;
8985       }
8986
8987       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8988       {
8989         if(rt1[i-1]==31) // JAL/JALR
8990         {
8991           // Subroutine call will return here, don't alloc any registers
8992           current.is32=1;
8993           current.dirty=0;
8994           clear_all_regs(current.regmap);
8995           alloc_reg(&current,i,CCREG);
8996           dirty_reg(&current,CCREG);
8997         }
8998         else if(i+1<slen)
8999         {
9000           // Internal branch will jump here, match registers to the branch source
9001           current.is32=0x3FFFFFFFFLL;
9002           current.dirty=0;
9003           clear_all_regs(current.regmap);
9004           alloc_reg(&current,i,CCREG);
9005           dirty_reg(&current,CCREG);
9006           for(j=i-1;j>=0;j--)
9007           {
9008             if(ba[j]==start+i*4+4) {
9009               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9010               current.is32=branch_regs[j].is32;
9011               current.dirty=branch_regs[j].dirty;
9012               break;
9013             }
9014           }
9015           while(j>=0) {
9016             if(ba[j]==start+i*4+4) {
9017               for(hr=0;hr<HOST_REGS;hr++) {
9018                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9019                   current.regmap[hr]=-1;
9020                 }
9021                 current.is32&=branch_regs[j].is32;
9022                 current.dirty&=branch_regs[j].dirty;
9023               }
9024             }
9025             j--;
9026           }
9027         }
9028       }
9029     }
9030
9031     // Count cycles in between branches
9032     ccadj[i]=cc;
9033     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL))
9034     {
9035       cc=0;
9036     }
9037     else
9038     {
9039       cc++;
9040     }
9041
9042     flush_dirty_uppers(&current);
9043     if(!is_ds[i]) {
9044       regs[i].is32=current.is32;
9045       regs[i].dirty=current.dirty;
9046       regs[i].isconst=current.isconst;
9047       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9048     }
9049     for(hr=0;hr<HOST_REGS;hr++) {
9050       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9051         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9052           regs[i].wasconst&=~(1<<hr);
9053         }
9054       }
9055     }
9056     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9057   }
9058   
9059   /* Pass 4 - Cull unused host registers */
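       /* Walk the block backwards, keeping in nr a bitmask of host registers
          whose contents are still needed; mappings outside the mask are
          dropped so later passes don't have to preserve them. */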
9060   
9061   uint64_t nr=0;
9062   
9063   for (i=slen-1;i>=0;i--)
9064   {
9065     int hr;
9066     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9067     {
9068       if(ba[i]<start || ba[i]>=(start+slen*4))
9069       {
9070         // Branch out of this block, don't need anything
9071         nr=0;
9072       }
9073       else
9074       {
9075         // Internal branch
9076         // Need whatever matches the target
9077         nr=0;
9078         int t=(ba[i]-start)>>2;
9079         for(hr=0;hr<HOST_REGS;hr++)
9080         {
9081           if(regs[i].regmap_entry[hr]>=0) {
9082             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9083           }
9084         }
9085       }
9086       // Conditional branch may need registers for following instructions
9087       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9088       {
9089         if(i<slen-2) {
9090           nr|=needed_reg[i+2];
9091           for(hr=0;hr<HOST_REGS;hr++)
9092           {
9093             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9094             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9095           }
9096         }
9097       }
9098       // Merge in delay slot
9099       for(hr=0;hr<HOST_REGS;hr++)
9100       {
9101         // Don't need stuff which is overwritten
9102         if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9103         if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9104         if(!likely[i]) {
9105           // These are overwritten unless the branch is "likely"
9106           // and the delay slot is nullified if not taken
9107           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9108           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9109         }
9110         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9111         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9112         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9113         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9114         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9115         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9116         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9117         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9118         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9119           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9120           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9121         }
9122         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9123           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9124           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9125         }
9126         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9127           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9128           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9129         }
9130       }
9131     }
9132     else if(itype[i]==SYSCALL)
9133     {
9134       // SYSCALL instruction (software interrupt)
9135       nr=0;
9136     }
9137     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9138     {
9139       // ERET instruction (return from interrupt)
9140       nr=0;
9141     }
9142     else // Non-branch
9143     {
9144       if(i<slen-1) {
9145         for(hr=0;hr<HOST_REGS;hr++) {
9146           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9147           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9148           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9149           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9150         }
9151       }
9152     }
9153     for(hr=0;hr<HOST_REGS;hr++)
9154     {
9155       // Overwritten registers are not needed
9156       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9157       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9158       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9159       // Source registers are needed
9160       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9161       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9162       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9163       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9164       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9165       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9166       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9167       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9168       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9169         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9170         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9171       }
9172       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9173         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9174         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9175       }
9176       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9177         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9178         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9179       }
9180       // Don't store a register immediately after writing it,
9181       // as doing so may prevent dual-issue.
9182       // But do so if this is a branch target, otherwise we
9183       // might have to load the register before the branch.
9184       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9185         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9186            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9187           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9188           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9189         }
9190         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9191            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9192           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9193           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9194         }
9195       }
9196     }
9197     // Cycle count is needed at branches.  Assume it is needed at the target too.
9198     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9199       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9200       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9201     }
9202     // Save it
9203     needed_reg[i]=nr;
9204     
9205     // Deallocate unneeded registers
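         /* regmap_entry is cleared for anything not in nr (CCREG excepted);
            the in-flight mapping is only dropped when the register is not an
            operand of this instruction or its delay slot and not one of the
            special temporaries (CCREG, PTEMP, FTEMP, ...) still expected by
            the branch code. */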
9206     for(hr=0;hr<HOST_REGS;hr++)
9207     {
9208       if(!((nr>>hr)&1)) {
9209         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9210         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9211            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9212            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9213         {
9214           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9215           {
9216             if(likely[i]) {
9217               regs[i].regmap[hr]=-1;
9218               regs[i].isconst&=~(1<<hr);
9219               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9220             }
9221           }
9222         }
9223         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9224         {
9225           int d1=0,d2=0,map=0,temp=0;
9226           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9227           {
9228             d1=dep1[i+1];
9229             d2=dep2[i+1];
9230           }
9231           if(using_tlb) {
9232             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9233                itype[i+1]==STORE || itype[i+1]==STORELR ||
9234                itype[i+1]==C1LS )
9235             map=TLREG;
9236           } else
9237           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9238             map=INVCP;
9239           }
9240           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9241              itype[i+1]==C1LS )
9242             temp=FTEMP;
9243           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9244              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9245              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9246              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9247              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9248              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9249              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9250              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9251              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9252              regs[i].regmap[hr]!=map )
9253           {
9254             regs[i].regmap[hr]=-1;
9255             regs[i].isconst&=~(1<<hr);
9256             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9257                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9258                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9259                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9260                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9261                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9262                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9263                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9264                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9265                branch_regs[i].regmap[hr]!=map)
9266             {
9267               branch_regs[i].regmap[hr]=-1;
9268               branch_regs[i].regmap_entry[hr]=-1;
9269               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9270               {
9271                 if(!likely[i]&&i<slen-2) {
9272                   regmap_pre[i+2][hr]=-1;
9273                 }
9274               }
9275             }
9276           }
9277         }
9278         else
9279         {
9280           // Non-branch
9281           if(i>0)
9282           {
9283             int d1=0,d2=0,map=-1,temp=-1;
9284             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9285             {
9286               d1=dep1[i];
9287               d2=dep2[i];
9288             }
9289             if(using_tlb) {
9290               if(itype[i]==LOAD || itype[i]==LOADLR ||
9291                  itype[i]==STORE || itype[i]==STORELR ||
9292                  itype[i]==C1LS )
9293               map=TLREG;
9294             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9295               map=INVCP;
9296             }
9297             if(itype[i]==LOADLR || itype[i]==STORELR ||
9298                itype[i]==C1LS )
9299               temp=FTEMP;
9300             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9301                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9302                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9303                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9304                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9305                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9306             {
9307               if(i<slen-1&&!is_ds[i]) {
9308                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9309                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9310                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9311                 {
9312                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9313                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9314                 }
9315                 regmap_pre[i+1][hr]=-1;
9316                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9317               }
9318               regs[i].regmap[hr]=-1;
9319               regs[i].isconst&=~(1<<hr);
9320             }
9321           }
9322         }
9323       }
9324     }
9325   }
9326   
9327   /* Pass 5 - Pre-allocate registers */
9328   
9329   // If a register is allocated during a loop, try to allocate it for the
9330   // entire loop, if possible.  This avoids loading/storing registers
9331   // inside of the loop.
9332
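  // Illustration of the idea: for a backward branch at instruction i whose
  // target t=(ba[i]-start)>>2 lies earlier in this block,
  //
  //     t: ...loop body...     <- try to keep f_regmap[hr] mapped from here
  //     i: branch back to t
  //
  // the scan below walks j from t towards i and, when nothing conflicts,
  // rewrites regs[j].regmap / regmap_pre so the cached value survives the
  // whole loop instead of being reloaded on every iteration.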
9333   signed char f_regmap[HOST_REGS];
9334   clear_all_regs(f_regmap);
9335   for(i=0;i<slen-1;i++)
9336   {
9337     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9338     {
9339       if(ba[i]>=start && ba[i]<(start+i*4)) 
9340       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9341       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9342       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9343       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9344       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9345       {
9346         int t=(ba[i]-start)>>2;
9347         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9348         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9349         for(hr=0;hr<HOST_REGS;hr++)
9350         {
9351           if(regs[i].regmap[hr]>64) {
9352             if(!((regs[i].dirty>>hr)&1))
9353               f_regmap[hr]=regs[i].regmap[hr];
9354             else f_regmap[hr]=-1;
9355           }
9356           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9357           if(branch_regs[i].regmap[hr]>64) {
9358             if(!((branch_regs[i].dirty>>hr)&1))
9359               f_regmap[hr]=branch_regs[i].regmap[hr];
9360             else f_regmap[hr]=-1;
9361           }
9362           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9363           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9364           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9365           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9366           {
9367             // Test both in case the delay slot is out-of-order (ooo);
9368             // this could be done better...
9369             if(count_free_regs(branch_regs[i].regmap)<2
9370              ||count_free_regs(regs[i].regmap)<2) 
9371               f_regmap[hr]=branch_regs[i].regmap[hr];
9372           }
9373           // Avoid dirty->clean transition
9374           // #ifdef DESTRUCTIVE_WRITEBACK here?
9375           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9376           if(f_regmap[hr]>0) {
9377             if(regs[t].regmap_entry[hr]<0) {
9378               int r=f_regmap[hr];
9379               for(j=t;j<=i;j++)
9380               {
9381                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9382                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9383                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9384                 if(r>63) {
9385                   // NB This can exclude the case where the upper-half
9386                   // register is lower numbered than the lower-half
9387                   // register.  Not sure if it's worth fixing...
9388                   if(get_reg(regs[j].regmap,r&63)<0) break;
9389                   if(regs[j].is32&(1LL<<(r&63))) break;
9390                 }
9391                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9392                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9393                   int k;
9394                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9395                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9396                     if(r>63) {
9397                       if(get_reg(regs[i].regmap,r&63)<0) break;
9398                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9399                     }
9400                     k=i;
9401                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9402                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9403                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9404                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9405                       ||itype[k-1]==FCOMP) {
9406                         if(count_free_regs(regs[k-1].regmap)<2) {
9407                           //printf("no free regs for store %x\n",start+(k-1)*4);
9408                           break;
9409                         }
9410                       }
9411                       else
9412                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9413                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9414                         //printf("no-match due to different register\n");
9415                         break;
9416                       }
9417                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9418                         //printf("no-match due to branch\n");
9419                         break;
9420                       }
9421                       // call/ret fast path assumes no registers allocated
9422                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9423                         break;
9424                       }
9425                       if(r>63) {
9426                         // NB This can exclude the case where the upper-half
9427                         // register is lower numbered than the lower-half
9428                         // register.  Not sure if it's worth fixing...
9429                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9430                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9431                       }
9432                       k--;
9433                     }
9434                     if(i<slen-1) {
9435                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9436                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9437                         //printf("bad match after branch\n");
9438                         break;
9439                       }
9440                     }
9441                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9442                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9443                       while(k<i) {
9444                         regs[k].regmap_entry[hr]=f_regmap[hr];
9445                         regs[k].regmap[hr]=f_regmap[hr];
9446                         regmap_pre[k+1][hr]=f_regmap[hr];
9447                         regs[k].wasdirty&=~(1<<hr);
9448                         regs[k].dirty&=~(1<<hr);
9449                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9450                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9451                         regs[k].wasconst&=~(1<<hr);
9452                         regs[k].isconst&=~(1<<hr);
9453                         k++;
9454                       }
9455                     }
9456                     else {
9457                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9458                       break;
9459                     }
9460                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9461                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9462                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9463                       regs[i].regmap_entry[hr]=f_regmap[hr];
9464                       regs[i].regmap[hr]=f_regmap[hr];
9465                       regs[i].wasdirty&=~(1<<hr);
9466                       regs[i].dirty&=~(1<<hr);
9467                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9468                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9469                       regs[i].wasconst&=~(1<<hr);
9470                       regs[i].isconst&=~(1<<hr);
9471                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9472                       branch_regs[i].wasdirty&=~(1<<hr);
9473                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9474                       branch_regs[i].regmap[hr]=f_regmap[hr];
9475                       branch_regs[i].dirty&=~(1<<hr);
9476                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9477                       branch_regs[i].wasconst&=~(1<<hr);
9478                       branch_regs[i].isconst&=~(1<<hr);
9479                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9480                         regmap_pre[i+2][hr]=f_regmap[hr];
9481                         regs[i+2].wasdirty&=~(1<<hr);
9482                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9483                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9484                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9485                       }
9486                     }
9487                   }
9488                   for(k=t;k<j;k++) {
9489                     regs[k].regmap_entry[hr]=f_regmap[hr];
9490                     regs[k].regmap[hr]=f_regmap[hr];
9491                     regmap_pre[k+1][hr]=f_regmap[hr];
9492                     regs[k+1].wasdirty&=~(1<<hr);
9493                     regs[k].dirty&=~(1<<hr);
9494                     regs[k].wasconst&=~(1<<hr);
9495                     regs[k].isconst&=~(1<<hr);
9496                   }
9497                   if(regs[j].regmap[hr]==f_regmap[hr])
9498                     regs[j].regmap_entry[hr]=f_regmap[hr];
9499                   break;
9500                 }
9501                 if(j==i) break;
9502                 if(regs[j].regmap[hr]>=0)
9503                   break;
9504                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9505                   //printf("no-match due to different register\n");
9506                   break;
9507                 }
9508                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9509                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9510                   break;
9511                 }
9512                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9513                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9514                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9515                   if(count_free_regs(regs[j].regmap)<2) {
9516                     //printf("No free regs for store %x\n",start+j*4);
9517                     break;
9518                   }
9519                 }
9520                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9521                 if(f_regmap[hr]>=64) {
9522                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9523                     break;
9524                   }
9525                   else
9526                   {
9527                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9528                       break;
9529                     }
9530                   }
9531                 }
9532               }
9533             }
9534           }
9535         }
9536       }
9537     }else{
9538       int count=0;
9539       for(hr=0;hr<HOST_REGS;hr++)
9540       {
9541         if(hr!=EXCLUDE_REG) {
9542           if(regs[i].regmap[hr]>64) {
9543             if(!((regs[i].dirty>>hr)&1))
9544               f_regmap[hr]=regs[i].regmap[hr];
9545           }
9546           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9547           else if(regs[i].regmap[hr]<0) count++;
9548         }
9549       }
9550       // Try to restore cycle count at branch targets
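      // In outline: first scan forward from the branch target for the next
      // instruction where HOST_CCREG is mapped; if it holds CCREG there,
      // fill the gap so the cycle counter stays in its register.  Then,
      // where CCREG was already live on entry, extend the allocation
      // backwards across simple (non-store, register-light) instructions too.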
9551       if(bt[i]) {
9552         for(j=i;j<slen-1;j++) {
9553           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9554           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9555           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9556           ||itype[j]==FCOMP||itype[j]==FCONV) {
9557             if(count_free_regs(regs[j].regmap)<2) {
9558               //printf("no free regs for store %x\n",start+j*4);
9559               break;
9560             }
9561           }
9562           else
9563           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9564         }
9565         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9566           int k=i;
9567           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9568           while(k<j) {
9569             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9570             regs[k].regmap[HOST_CCREG]=CCREG;
9571             regmap_pre[k+1][HOST_CCREG]=CCREG;
9572             regs[k+1].wasdirty|=1<<HOST_CCREG;
9573             regs[k].dirty|=1<<HOST_CCREG;
9574             regs[k].wasconst&=~(1<<HOST_CCREG);
9575             regs[k].isconst&=~(1<<HOST_CCREG);
9576             k++;
9577           }
9578           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9579         }
9580         // Work backwards from the branch target
9581         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9582         {
9583           //printf("Extend backwards\n");
9584           int k;
9585           k=i;
9586           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9587             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9588             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9589             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9590               if(count_free_regs(regs[k-1].regmap)<2) {
9591                 //printf("no free regs for store %x\n",start+(k-1)*4);
9592                 break;
9593               }
9594             }
9595             else
9596             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9597             k--;
9598           }
9599           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9600             //printf("Extend CC, %x ->\n",start+k*4);
9601             while(k<=i) {
9602               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9603               regs[k].regmap[HOST_CCREG]=CCREG;
9604               regmap_pre[k+1][HOST_CCREG]=CCREG;
9605               regs[k+1].wasdirty|=1<<HOST_CCREG;
9606               regs[k].dirty|=1<<HOST_CCREG;
9607               regs[k].wasconst&=~(1<<HOST_CCREG);
9608               regs[k].isconst&=~(1<<HOST_CCREG);
9609               k++;
9610             }
9611           }
9612           else {
9613             //printf("Fail Extend CC, %x ->\n",start+k*4);
9614           }
9615         }
9616       }
9617       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9618          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9619          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9620          itype[i]!=FCONV&&itype[i]!=FCOMP)
9621       {
9622         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9623       }
9624     }
9625   }
9626   
9627   // This allocates registers (if possible) one instruction prior
9628   // to use, which can avoid a load-use penalty on certain CPUs.
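  // For example (illustrative only): if instruction i+1 is "lw r4,0(r3)",
  // the pass below tries to give r3 a host register already at instruction i,
  // so the address register is available one instruction early and the load
  // does not stall waiting for it.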
9629   for(i=0;i<slen-1;i++)
9630   {
9631     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9632     {
9633       if(!bt[i+1])
9634       {
9635         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9636         {
9637           if(rs1[i+1]) {
9638             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9639             {
9640               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9641               {
9642                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9643                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9644                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9645                 regs[i].isconst&=~(1<<hr);
9646                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9647                 constmap[i][hr]=constmap[i+1][hr];
9648                 regs[i+1].wasdirty&=~(1<<hr);
9649                 regs[i].dirty&=~(1<<hr);
9650               }
9651             }
9652           }
9653           if(rs2[i+1]) {
9654             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9655             {
9656               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9657               {
9658                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9659                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9660                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9661                 regs[i].isconst&=~(1<<hr);
9662                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9663                 constmap[i][hr]=constmap[i+1][hr];
9664                 regs[i+1].wasdirty&=~(1<<hr);
9665                 regs[i].dirty&=~(1<<hr);
9666               }
9667             }
9668           }
9669           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9670             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9671             {
9672               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9673               {
9674                 regs[i].regmap[hr]=rs1[i+1];
9675                 regmap_pre[i+1][hr]=rs1[i+1];
9676                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9677                 regs[i].isconst&=~(1<<hr);
9678                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9679                 constmap[i][hr]=constmap[i+1][hr];
9680                 regs[i+1].wasdirty&=~(1<<hr);
9681                 regs[i].dirty&=~(1<<hr);
9682               }
9683             }
9684           }
9685           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9686             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9687             {
9688               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9689               {
9690                 regs[i].regmap[hr]=rs1[i+1];
9691                 regmap_pre[i+1][hr]=rs1[i+1];
9692                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9693                 regs[i].isconst&=~(1<<hr);
9694                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9695                 constmap[i][hr]=constmap[i+1][hr];
9696                 regs[i+1].wasdirty&=~(1<<hr);
9697                 regs[i].dirty&=~(1<<hr);
9698               }
9699             }
9700           }
9701           #ifndef HOST_IMM_ADDR32
9702           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9703             hr=get_reg(regs[i+1].regmap,TLREG);
9704             if(hr>=0) {
9705               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9706               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9707                 int nr;
9708                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9709                 {
9710                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9711                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9712                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9713                   regs[i].isconst&=~(1<<hr);
9714                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9715                   constmap[i][hr]=constmap[i+1][hr];
9716                   regs[i+1].wasdirty&=~(1<<hr);
9717                   regs[i].dirty&=~(1<<hr);
9718                 }
9719                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9720                 {
9721                   // move it to another register
9722                   regs[i+1].regmap[hr]=-1;
9723                   regmap_pre[i+2][hr]=-1;
9724                   regs[i+1].regmap[nr]=TLREG;
9725                   regmap_pre[i+2][nr]=TLREG;
9726                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9727                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9728                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9729                   regs[i].isconst&=~(1<<nr);
9730                   regs[i+1].isconst&=~(1<<nr);
9731                   regs[i].dirty&=~(1<<nr);
9732                   regs[i+1].wasdirty&=~(1<<nr);
9733                   regs[i+1].dirty&=~(1<<nr);
9734                   regs[i+2].wasdirty&=~(1<<nr);
9735                 }
9736               }
9737             }
9738           }
9739           #endif
9740           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9741             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9742               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9743               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9744               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9745               assert(hr>=0);
9746               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9747               {
9748                 regs[i].regmap[hr]=rs1[i+1];
9749                 regmap_pre[i+1][hr]=rs1[i+1];
9750                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9751                 regs[i].isconst&=~(1<<hr);
9752                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9753                 constmap[i][hr]=constmap[i+1][hr];
9754                 regs[i+1].wasdirty&=~(1<<hr);
9755                 regs[i].dirty&=~(1<<hr);
9756               }
9757             }
9758           }
9759           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9760             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9761               int nr;
9762               hr=get_reg(regs[i+1].regmap,FTEMP);
9763               assert(hr>=0);
9764               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9765               {
9766                 regs[i].regmap[hr]=rs1[i+1];
9767                 regmap_pre[i+1][hr]=rs1[i+1];
9768                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9769                 regs[i].isconst&=~(1<<hr);
9770                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9771                 constmap[i][hr]=constmap[i+1][hr];
9772                 regs[i+1].wasdirty&=~(1<<hr);
9773                 regs[i].dirty&=~(1<<hr);
9774               }
9775               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9776               {
9777                 // move it to another register
9778                 regs[i+1].regmap[hr]=-1;
9779                 regmap_pre[i+2][hr]=-1;
9780                 regs[i+1].regmap[nr]=FTEMP;
9781                 regmap_pre[i+2][nr]=FTEMP;
9782                 regs[i].regmap[nr]=rs1[i+1];
9783                 regmap_pre[i+1][nr]=rs1[i+1];
9784                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9785                 regs[i].isconst&=~(1<<nr);
9786                 regs[i+1].isconst&=~(1<<nr);
9787                 regs[i].dirty&=~(1<<nr);
9788                 regs[i+1].wasdirty&=~(1<<nr);
9789                 regs[i+1].dirty&=~(1<<nr);
9790                 regs[i+2].wasdirty&=~(1<<nr);
9791               }
9792             }
9793           }
9794           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9795             if(itype[i+1]==LOAD) 
9796               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9797             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9798               hr=get_reg(regs[i+1].regmap,FTEMP);
9799             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9800               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9801               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9802             }
9803             if(hr>=0&&regs[i].regmap[hr]<0) {
9804               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9805               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9806                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9807                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9808                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9809                 regs[i].isconst&=~(1<<hr);
9810                 regs[i+1].wasdirty&=~(1<<hr);
9811                 regs[i].dirty&=~(1<<hr);
9812               }
9813             }
9814           }
9815         }
9816       }
9817     }
9818   }
9819   
9820   /* Pass 6 - Optimize clean/dirty state */
9821   clean_registers(0,slen-1,1);
9822   
9823   /* Pass 7 - Identify 32-bit registers */
9824   
9825   provisional_r32();
9826
9827   u_int r32=0;
9828   
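  // NB r32 is a bitmask over guest registers, built by walking the block
  // backwards: bits for registers written by an instruction (rt1/rt2) are
  // cleared and bits for 32-bit source registers are set (plus the needs of
  // branch targets and delay slots), so requires_32bit[i] ends up marking
  // the registers whose 32-bit form is still needed at or after i.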
9829   for (i=slen-1;i>=0;i--)
9830   {
9831     int hr;
9832     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9833     {
9834       if(ba[i]<start || ba[i]>=(start+slen*4))
9835       {
9836         // Branch out of this block, don't need anything
9837         r32=0;
9838       }
9839       else
9840       {
9841         // Internal branch
9842         // Need whatever matches the target
9843         // (and doesn't get overwritten by the delay slot instruction)
9844         r32=0;
9845         int t=(ba[i]-start)>>2;
9846         if(ba[i]>start+i*4) {
9847           // Forward branch
9848           if(!(requires_32bit[t]&~regs[i].was32))
9849             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9850         }else{
9851           // Backward branch
9852           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9853           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9854           if(!(pr32[t]&~regs[i].was32))
9855             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9856         }
9857       }
9858       // Conditional branch may need registers for following instructions
9859       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9860       {
9861         if(i<slen-2) {
9862           r32|=requires_32bit[i+2];
9863           r32&=regs[i].was32;
9864           // Mark this address as a branch target since it may be called
9865           // upon return from interrupt
9866           bt[i+2]=1;
9867         }
9868       }
9869       // Merge in delay slot
9870       if(!likely[i]) {
9871         // These are overwritten unless the branch is "likely"
9872         // and the delay slot is nullified if not taken
9873         r32&=~(1LL<<rt1[i+1]);
9874         r32&=~(1LL<<rt2[i+1]);
9875       }
9876       // Assume these are needed (delay slot)
9877       if(us1[i+1]>0)
9878       {
9879         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9880       }
9881       if(us2[i+1]>0)
9882       {
9883         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9884       }
9885       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9886       {
9887         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9888       }
9889       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9890       {
9891         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9892       }
9893     }
9894     else if(itype[i]==SYSCALL)
9895     {
9896       // SYSCALL instruction (software interrupt)
9897       r32=0;
9898     }
9899     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9900     {
9901       // ERET instruction (return from interrupt)
9902       r32=0;
9903     }
9904     // Update the 32-bit requirement mask for this instruction
9905     r32&=~(1LL<<rt1[i]);
9906     r32&=~(1LL<<rt2[i]);
9907     if(us1[i]>0)
9908     {
9909       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9910     }
9911     if(us2[i]>0)
9912     {
9913       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9914     }
9915     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9916     {
9917       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
9918     }
9919     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
9920     {
9921       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
9922     }
9923     requires_32bit[i]=r32;
9924     
9925     // Dirty registers which are 32-bit require 32-bit input,
9926     // as they will be written out as 32-bit values
9927     for(hr=0;hr<HOST_REGS;hr++)
9928     {
9929       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
9930         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
9931           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
9932           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
9933         }
9934       }
9935     }
9936     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
9937   }
9938
9939   if(itype[slen-1]==SPAN) {
9940     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9941   }
9942   
9943   /* Debug/disassembly */
9944   if((void*)assem_debug==(void*)printf) 
9945   for(i=0;i<slen;i++)
9946   {
9947     printf("U:");
9948     int r;
9949     for(r=1;r<=CCREG;r++) {
9950       if((unneeded_reg[i]>>r)&1) {
9951         if(r==HIREG) printf(" HI");
9952         else if(r==LOREG) printf(" LO");
9953         else printf(" r%d",r);
9954       }
9955     }
9956     printf(" UU:");
9957     for(r=1;r<=CCREG;r++) {
9958       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
9959         if(r==HIREG) printf(" HI");
9960         else if(r==LOREG) printf(" LO");
9961         else printf(" r%d",r);
9962       }
9963     }
9964     printf(" 32:");
9965     for(r=0;r<=CCREG;r++) {
9966       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9967       if((regs[i].was32>>r)&1) {
9968         if(r==CCREG) printf(" CC");
9969         else if(r==HIREG) printf(" HI");
9970         else if(r==LOREG) printf(" LO");
9971         else printf(" r%d",r);
9972       }
9973     }
9974     printf("\n");
9975     #if defined(__i386__) || defined(__x86_64__)
9976     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9977     #endif
9978     #ifdef __arm__
9979     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9980     #endif
9981     printf("needs: ");
9982     if(needed_reg[i]&1) printf("eax ");
9983     if((needed_reg[i]>>1)&1) printf("ecx ");
9984     if((needed_reg[i]>>2)&1) printf("edx ");
9985     if((needed_reg[i]>>3)&1) printf("ebx ");
9986     if((needed_reg[i]>>5)&1) printf("ebp ");
9987     if((needed_reg[i]>>6)&1) printf("esi ");
9988     if((needed_reg[i]>>7)&1) printf("edi ");
9989     printf("r:");
9990     for(r=0;r<=CCREG;r++) {
9991       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
9992       if((requires_32bit[i]>>r)&1) {
9993         if(r==CCREG) printf(" CC");
9994         else if(r==HIREG) printf(" HI");
9995         else if(r==LOREG) printf(" LO");
9996         else printf(" r%d",r);
9997       }
9998     }
9999     printf("\n");
10000     /*printf("pr:");
10001     for(r=0;r<=CCREG;r++) {
10002       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10003       if((pr32[i]>>r)&1) {
10004         if(r==CCREG) printf(" CC");
10005         else if(r==HIREG) printf(" HI");
10006         else if(r==LOREG) printf(" LO");
10007         else printf(" r%d",r);
10008       }
10009     }
10010     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10011     printf("\n");*/
10012     #if defined(__i386__) || defined(__x86_64__)
10013     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10014     printf("dirty: ");
10015     if(regs[i].wasdirty&1) printf("eax ");
10016     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10017     if((regs[i].wasdirty>>2)&1) printf("edx ");
10018     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10019     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10020     if((regs[i].wasdirty>>6)&1) printf("esi ");
10021     if((regs[i].wasdirty>>7)&1) printf("edi ");
10022     #endif
10023     #ifdef __arm__
10024     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10025     printf("dirty: ");
10026     if(regs[i].wasdirty&1) printf("r0 ");
10027     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10028     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10029     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10030     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10031     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10032     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10033     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10034     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10035     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10036     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10037     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10038     #endif
10039     printf("\n");
10040     disassemble_inst(i);
10041     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10042     #if defined(__i386__) || defined(__x86_64__)
10043     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10044     if(regs[i].dirty&1) printf("eax ");
10045     if((regs[i].dirty>>1)&1) printf("ecx ");
10046     if((regs[i].dirty>>2)&1) printf("edx ");
10047     if((regs[i].dirty>>3)&1) printf("ebx ");
10048     if((regs[i].dirty>>5)&1) printf("ebp ");
10049     if((regs[i].dirty>>6)&1) printf("esi ");
10050     if((regs[i].dirty>>7)&1) printf("edi ");
10051     #endif
10052     #ifdef __arm__
10053     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10054     if(regs[i].dirty&1) printf("r0 ");
10055     if((regs[i].dirty>>1)&1) printf("r1 ");
10056     if((regs[i].dirty>>2)&1) printf("r2 ");
10057     if((regs[i].dirty>>3)&1) printf("r3 ");
10058     if((regs[i].dirty>>4)&1) printf("r4 ");
10059     if((regs[i].dirty>>5)&1) printf("r5 ");
10060     if((regs[i].dirty>>6)&1) printf("r6 ");
10061     if((regs[i].dirty>>7)&1) printf("r7 ");
10062     if((regs[i].dirty>>8)&1) printf("r8 ");
10063     if((regs[i].dirty>>9)&1) printf("r9 ");
10064     if((regs[i].dirty>>10)&1) printf("r10 ");
10065     if((regs[i].dirty>>12)&1) printf("r12 ");
10066     #endif
10067     printf("\n");
10068     if(regs[i].isconst) {
10069       printf("constants: ");
10070       #if defined(__i386__) || defined(__x86_64__)
10071       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10072       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10073       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10074       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10075       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10076       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10077       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10078       #endif
10079       #ifdef __arm__
10080       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10081       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10082       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10083       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10084       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10085       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10086       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10087       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10088       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10089       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10090       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10091       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10092       #endif
10093       printf("\n");
10094     }
10095     printf(" 32:");
10096     for(r=0;r<=CCREG;r++) {
10097       if((regs[i].is32>>r)&1) {
10098         if(r==CCREG) printf(" CC");
10099         else if(r==HIREG) printf(" HI");
10100         else if(r==LOREG) printf(" LO");
10101         else printf(" r%d",r);
10102       }
10103     }
10104     printf("\n");
10105     /*printf(" p32:");
10106     for(r=0;r<=CCREG;r++) {
10107       if((p32[i]>>r)&1) {
10108         if(r==CCREG) printf(" CC");
10109         else if(r==HIREG) printf(" HI");
10110         else if(r==LOREG) printf(" LO");
10111         else printf(" r%d",r);
10112       }
10113     }
10114     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10115     else printf("\n");*/
10116     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10117       #if defined(__i386__) || defined(__x86_64__)
10118       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10119       if(branch_regs[i].dirty&1) printf("eax ");
10120       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10121       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10122       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10123       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10124       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10125       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10126       #endif
10127       #ifdef __arm__
10128       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10129       if(branch_regs[i].dirty&1) printf("r0 ");
10130       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10131       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10132       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10133       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10134       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10135       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10136       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10137       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10138       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10139       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10140       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10141       #endif
10142       printf(" 32:");
10143       for(r=0;r<=CCREG;r++) {
10144         if((branch_regs[i].is32>>r)&1) {
10145           if(r==CCREG) printf(" CC");
10146           else if(r==HIREG) printf(" HI");
10147           else if(r==LOREG) printf(" LO");
10148           else printf(" r%d",r);
10149         }
10150       }
10151       printf("\n");
10152     }
10153   }
10154
10155   /* Pass 8 - Assembly */
10156   linkcount=0;stubcount=0;
10157   ds=0;is_delayslot=0;
10158   cop1_usable=0;
10159   uint64_t is32_pre=0;
10160   u_int dirty_pre=0;
10161   u_int beginning=(u_int)out;
10162   if((u_int)addr&1) {
10163     ds=1;
10164     pagespan_ds();
10165   }
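  // Outline of the loop below: write back or invalidate stale cached
  // registers, record the entry point in instr_addr[i], load the source
  // registers named by regmap_entry, do address generation and constant
  // loading, then dispatch to the per-type *_assemble routine.  Branches set
  // ds=1 so the next iteration skips the delay slot (the slot having been
  // handled as part of the branch).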
10166   for(i=0;i<slen;i++)
10167   {
10168     //if(ds) printf("ds: ");
10169     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10170     if(ds) {
10171       ds=0; // Skip delay slot
10172       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10173       instr_addr[i]=0;
10174     } else {
10175       #ifndef DESTRUCTIVE_WRITEBACK
10176       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10177       {
10178         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10179               unneeded_reg[i],unneeded_reg_upper[i]);
10180         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10181               unneeded_reg[i],unneeded_reg_upper[i]);
10182       }
10183       is32_pre=regs[i].is32;
10184       dirty_pre=regs[i].dirty;
10185       #endif
10186       // write back
10187       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10188       {
10189         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10190                       unneeded_reg[i],unneeded_reg_upper[i]);
10191         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10192       }
10193       // branch target entry point
10194       instr_addr[i]=(u_int)out;
10195       assem_debug("<->\n");
10196       // load regs
10197       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10198         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10199       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10200       address_generation(i,&regs[i],regs[i].regmap_entry);
10201       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10202       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10203       {
10204         // Load the delay slot registers if necessary
10205         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10206           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10207         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10208           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10209         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10210           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10211       }
10212       else if(i+1<slen)
10213       {
10214         // Preload registers for following instruction
10215         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10216           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10217             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10218         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10219           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10220             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10221       }
10222       // TODO: if(is_ooo(i)) address_generation(i+1);
10223       if(itype[i]==CJUMP||itype[i]==FJUMP)
10224         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10225       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10226         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10227       if(bt[i]) cop1_usable=0;
10228       // assemble
10229       switch(itype[i]) {
10230         case ALU:
10231           alu_assemble(i,&regs[i]);break;
10232         case IMM16:
10233           imm16_assemble(i,&regs[i]);break;
10234         case SHIFT:
10235           shift_assemble(i,&regs[i]);break;
10236         case SHIFTIMM:
10237           shiftimm_assemble(i,&regs[i]);break;
10238         case LOAD:
10239           load_assemble(i,&regs[i]);break;
10240         case LOADLR:
10241           loadlr_assemble(i,&regs[i]);break;
10242         case STORE:
10243           store_assemble(i,&regs[i]);break;
10244         case STORELR:
10245           storelr_assemble(i,&regs[i]);break;
10246         case COP0:
10247           cop0_assemble(i,&regs[i]);break;
10248         case COP1:
10249           cop1_assemble(i,&regs[i]);break;
10250         case C1LS:
10251           c1ls_assemble(i,&regs[i]);break;
10252         case FCONV:
10253           fconv_assemble(i,&regs[i]);break;
10254         case FLOAT:
10255           float_assemble(i,&regs[i]);break;
10256         case FCOMP:
10257           fcomp_assemble(i,&regs[i]);break;
10258         case MULTDIV:
10259           multdiv_assemble(i,&regs[i]);break;
10260         case MOV:
10261           mov_assemble(i,&regs[i]);break;
10262         case SYSCALL:
10263           syscall_assemble(i,&regs[i]);break;
10264         case UJUMP:
10265           ujump_assemble(i,&regs[i]);ds=1;break;
10266         case RJUMP:
10267           rjump_assemble(i,&regs[i]);ds=1;break;
10268         case CJUMP:
10269           cjump_assemble(i,&regs[i]);ds=1;break;
10270         case SJUMP:
10271           sjump_assemble(i,&regs[i]);ds=1;break;
10272         case FJUMP:
10273           fjump_assemble(i,&regs[i]);ds=1;break;
10274         case SPAN:
10275           pagespan_assemble(i,&regs[i]);break;
10276       }
10277       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10278         literal_pool(1024);
10279       else
10280         literal_pool_jumpover(256);
10281     }
10282   }
10283   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10284   // If the block did not end with an unconditional branch,
10285   // add a jump to the next instruction.
10286   if(i>1) {
10287     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10288       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10289       assert(i==slen);
10290       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10291         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10292         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10293           emit_loadreg(CCREG,HOST_CCREG);
10294         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10295       }
10296       else if(!likely[i-2])
10297       {
10298         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10299         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10300       }
10301       else
10302       {
10303         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10304         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10305       }
10306       add_to_linker((int)out,start+i*4,0);
10307       emit_jmp(0);
10308     }
10309   }
10310   else
10311   {
10312     assert(i>0);
10313     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10314     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10315     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10316       emit_loadreg(CCREG,HOST_CCREG);
10317     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10318     add_to_linker((int)out,start+i*4,0);
10319     emit_jmp(0);
10320   }
10321
10322   // TODO: delay slot stubs?
10323   // Stubs
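  // Stubs are the out-of-line slow paths emitted after the main code; as the
  // handler names suggest, these cover memory-access fallbacks (read/write
  // stubs), cycle-count/interrupt checks (CC_STUB), code invalidation
  // (INVCODE_STUB), FPU-unusable exceptions (FP_STUB) and unaligned stores
  // (STORELR_STUB).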
10324   for(i=0;i<stubcount;i++)
10325   {
10326     switch(stubs[i][0])
10327     {
10328       case LOADB_STUB:
10329       case LOADH_STUB:
10330       case LOADW_STUB:
10331       case LOADD_STUB:
10332       case LOADBU_STUB:
10333       case LOADHU_STUB:
10334         do_readstub(i);break;
10335       case STOREB_STUB:
10336       case STOREH_STUB:
10337       case STOREW_STUB:
10338       case STORED_STUB:
10339         do_writestub(i);break;
10340       case CC_STUB:
10341         do_ccstub(i);break;
10342       case INVCODE_STUB:
10343         do_invstub(i);break;
10344       case FP_STUB:
10345         do_cop1stub(i);break;
10346       case STORELR_STUB:
10347         do_unalignedwritestub(i);break;
10348     }
10349   }
10350
10351   /* Pass 9 - Linker */
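  // As used below, each link_addr entry holds the location of an emitted
  // jump ([0]), the guest address it targets ([1]), and a flag that is
  // nonzero for branches resolved inside this block ([2]).  For external
  // targets an extjump stub is emitted; the branch is pointed directly at the
  // target block if check_addr() already knows it, and at the stub otherwise.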
10352   for(i=0;i<linkcount;i++)
10353   {
10354     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10355     literal_pool(64);
10356     if(!link_addr[i][2])
10357     {
10358       void *stub=out;
10359       void *addr=check_addr(link_addr[i][1]);
10360       emit_extjump(link_addr[i][0],link_addr[i][1]);
10361       if(addr) {
10362         set_jump_target(link_addr[i][0],(int)addr);
10363         add_link(link_addr[i][1],stub);
10364       }
10365       else set_jump_target(link_addr[i][0],(int)stub);
10366     }
10367     else
10368     {
10369       // Internal branch
10370       int target=(link_addr[i][1]-start)>>2;
10371       assert(target>=0&&target<slen);
10372       assert(instr_addr[target]);
10373       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10374       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10375       //#else
10376       set_jump_target(link_addr[i][0],instr_addr[target]);
10377       //#endif
10378     }
10379   }
10380   // External Branch Targets (jump_in)
10381   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10382   for(i=0;i<slen;i++)
10383   {
10384     if(bt[i]||i==0)
10385     {
10386       if(instr_addr[i]) // TODO - delay slots (=null)
10387       {
10388         u_int vaddr=start+i*4;
10389         u_int page=get_page(vaddr);
10390         u_int vpage=get_vpage(vaddr);
10391         literal_pool(256);
10392         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10393         if(!requires_32bit[i])
10394         {
10395           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10396           assem_debug("jump_in: %x\n",start+i*4);
10397           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10398           int entry_point=do_dirty_stub(i);
10399           ll_add(jump_in+page,vaddr,(void *)entry_point);
10400           // If there was an existing entry in the hash table,
10401           // replace it with the new address.
10402           // Don't add new entries.  We'll insert the
10403           // ones that actually get used in check_addr().
10404           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10405           if(ht_bin[0]==vaddr) {
10406             ht_bin[1]=entry_point;
10407           }
10408           if(ht_bin[2]==vaddr) {
10409             ht_bin[3]=entry_point;
10410           }
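          // Illustrative sketch (hypothetical code, not compiled) of how such
          // a bin is probed on lookup: each bin holds two (vaddr, entry
          // point) pairs, with slots 0/1 checked before slots 2/3.
          #if 0
          int *bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
          void *found=0;
          if(bin[0]==vaddr) found=(void *)bin[1];      // first pair
          else if(bin[2]==vaddr) found=(void *)bin[3]; // second pair
          // found==0 -> fall back to the jump_in linked lists
          #endif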
10411         }
10412         else
10413         {
10414           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10415           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10416           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10417           //int entry_point=(int)out;
10418           ////assem_debug("entry_point: %x\n",entry_point);
10419           //load_regs_entry(i);
10420           //if(entry_point==(int)out)
10421           //  entry_point=instr_addr[i];
10422           //else
10423           //  emit_jmp(instr_addr[i]);
10424           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10425           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10426           int entry_point=do_dirty_stub(i);
10427           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10428         }
10429       }
10430     }
10431   }
10432   // Write out the literal pool if necessary
10433   literal_pool(0);
10434   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10435   // Align code
10436   if(((u_int)out)&7) emit_addnop(13);
10437   #endif
10438   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10439   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10440   memcpy(copy,source,slen*4);
10441   copy+=slen*4;
10442   
10443   #ifdef __arm__
10444   __clear_cache((void *)beginning,out);
10445   #endif
10446   
10447   // If we're within 256K of the end of the buffer,
10448   // start over from the beginning. (Is 256K enough?)
10449   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10450   
10451   // Trap writes to any of the pages we compiled
10452   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10453     invalid_code[i]=0;
10454     memory_map[i]|=0x40000000;
10455     if((signed int)start>=(signed int)0xC0000000) {
10456       assert(using_tlb);
10457       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
10458       invalid_code[j]=0;
10459       memory_map[j]|=0x40000000;
10460       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
10461     }
10462   }
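  // Clearing invalid_code and setting bit 30 (0x40000000) in memory_map
  // marks a page as containing compiled code; the memory write paths are
  // expected to test this bit and invalidate the affected blocks when the
  // game writes to such a page (self-modifying-code detection).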
10463   
10464   /* Pass 10 - Free memory by expiring oldest blocks */
10465   
10466   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
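  // expirep is a 16-bit circular counter: its top 3 bits select one of 8
  // equal regions of the translation cache (shift=TARGET_SIZE_2-3 below),
  // bits 11-12 select which data structure to purge, and the low 11 bits
  // index the page/bin within it.  'end' is the current output offset scaled
  // to the same 16-bit range plus 16384, so expiry stays roughly a quarter of
  // the cache (two regions) ahead of the write pointer.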
10467   while(expirep!=end)
10468   {
10469     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10470     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10471     inv_debug("EXP: Phase %d\n",expirep);
10472     switch((expirep>>11)&3)
10473     {
10474       case 0:
10475         // Clear jump_in and jump_dirty
10476         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10477         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10478         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10479         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10480         break;
10481       case 1:
10482         // Clear pointers
10483         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10484         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10485         break;
10486       case 2:
10487         // Clear hash table
10488         for(i=0;i<32;i++) {
10489           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10490           if((ht_bin[3]>>shift)==(base>>shift) ||
10491              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10492             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10493             ht_bin[2]=ht_bin[3]=-1;
10494           }
10495           if((ht_bin[1]>>shift)==(base>>shift) ||
10496              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10497             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10498             ht_bin[0]=ht_bin[2];
10499             ht_bin[1]=ht_bin[3];
10500             ht_bin[2]=ht_bin[3]=-1;
10501           }
10502         }
10503         break;
10504       case 3:
10505         // Clear jump_out
10506         #ifdef __arm__
10507         if((expirep&2047)==0)
10508           __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
10509         #endif
10510         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10511         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10512         break;
10513     }
10514     expirep=(expirep+1)&65535;
10515   }
10516   return 0;
10517 }