drc: attempt to support little endian
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
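// Field notes (added for clarity; inferred from how the struct is used below):
//   regmap_entry     - host register -> guest register mapping required on entry
//   regmap           - current host register -> guest register mapping
//   was32/is32       - bitmaps of guest regs known to be 32-bit sign-extended
//                      (state before / after the instruction)
//   wasdirty/dirty   - bitmaps of host regs holding values not yet written back
//   u/uu             - bitmaps of guest regs whose lower/upper 32 bits are unneeded
//   wasconst/isconst - bitmaps of host regs currently holding known constants
//   constmap         - the constant value cached in each host register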
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
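// Note (added for clarity): regmap[] entries also use reg|64 to name the host
// register holding the *upper* 32 bits of a 64-bit guest register, which is
// why the helpers below mask register numbers with &63 or probe with |64.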
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178
179   /* stubs */
180 #define CC_STUB 1
181 #define FP_STUB 2
182 #define LOADB_STUB 3
183 #define LOADH_STUB 4
184 #define LOADW_STUB 5
185 #define LOADD_STUB 6
186 #define LOADBU_STUB 7
187 #define LOADHU_STUB 8
188 #define STOREB_STUB 9
189 #define STOREH_STUB 10
190 #define STOREW_STUB 11
191 #define STORED_STUB 12
192 #define STORELR_STUB 13
193 #define INVCODE_STUB 14
194
195   /* branch codes */
196 #define TAKEN 1
197 #define NOTTAKEN 2
198 #define NULLDS 3
199
200 // asm linkage
201 int new_recompile_block(int addr);
202 void *get_addr_ht(u_int vaddr);
203 void invalidate_block(u_int block);
204 void invalidate_addr(u_int addr);
205 void remove_hash(int vaddr);
206 void jump_vaddr();
207 void dyna_linker();
208 void dyna_linker_ds();
209 void verify_code();
210 void verify_code_vm();
211 void verify_code_ds();
212 void cc_interrupt();
213 void fp_exception();
214 void fp_exception_ds();
215 void jump_syscall();
216 void jump_eret();
217
218 // TLB
219 void TLBWI_new();
220 void TLBWR_new();
221 void read_nomem_new();
222 void read_nomemb_new();
223 void read_nomemh_new();
224 void read_nomemd_new();
225 void write_nomem_new();
226 void write_nomemb_new();
227 void write_nomemh_new();
228 void write_nomemd_new();
229 void write_rdram_new();
230 void write_rdramb_new();
231 void write_rdramh_new();
232 void write_rdramd_new();
233 extern u_int memory_map[1048576];
234
235 // Needed by assembler
236 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
237 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
238 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
239 void load_all_regs(signed char i_regmap[]);
240 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
241 void load_regs_entry(int t);
242 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
243
244 int tracedebug=0;
245
246 //#define DEBUG_CYCLE_COUNT 1
247
248 void nullf() {}
249 //#define assem_debug printf
250 //#define inv_debug printf
251 #define assem_debug nullf
252 #define inv_debug nullf
253
254 static void tlb_hacks()
255 {
256 #ifndef DISABLE_TLB
257   // Goldeneye hack
258   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
259   {
260     u_int addr;
261     int n;
262     switch (ROM_HEADER->Country_code&0xFF) 
263     {
264       case 0x45: // U
265         addr=0x34b30;
266         break;                   
267       case 0x4A: // J 
268         addr=0x34b70;    
269         break;    
270       case 0x50: // E 
271         addr=0x329f0;
272         break;                        
273       default: 
274         // Unknown country code
275         addr=0;
276         break;
277     }
278     u_int rom_addr=(u_int)rom;
279     #ifdef ROM_COPY
280     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
281     // in the lower 4G of memory to use this hack.  Copy it if necessary.
282     if((void *)rom>(void *)0xffffffff) {
283       munmap(ROM_COPY, 67108864);
284       if(mmap(ROM_COPY, 12582912,
285               PROT_READ | PROT_WRITE,
286               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
287               -1, 0) <= 0) {printf("mmap() failed\n");}
288       memcpy(ROM_COPY,rom,12582912);
289       rom_addr=(u_int)ROM_COPY;
290     }
291     #endif
292     if(addr) {
293       for(n=0x7F000;n<0x80000;n++) {
294         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
295       }
296     }
297   }
298 #endif
299 }
300
301 static u_int get_page(u_int vaddr)
302 {
303   u_int page=(vaddr^0x80000000)>>12;
304 #ifndef DISABLE_TLB
305   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
306 #endif
307   if(page>2048) page=2048+(page&2047);
308   return page;
309 }
310
311 static u_int get_vpage(u_int vaddr)
312 {
313   u_int vpage=(vaddr^0x80000000)>>12;
314 #ifndef DISABLE_TLB
315   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
316 #endif
317   if(vpage>2048) vpage=2048+(vpage&2047);
318   return vpage;
319 }
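// Illustrative sketch (added, not part of the original source): get_page()
// folds the guest address space into the 4096-entry jump_in/jump_out/
// jump_dirty tables; addresses in 0x80000000..0x807FFFFF map directly to
// pages 0..2047, and everything else is folded into pages 2048..4095. A
// minimal lookup using a hypothetical example address:
static struct ll_entry *example_page_lookup(void)
{
  u_int vaddr=0x80030000;           // example KSEG0-style address
  u_int page=get_page(vaddr);       // (vaddr^0x80000000)>>12 == 0x30
  return jump_in[page];             // head of compiled blocks for that page
}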
320
321 // Get address from virtual address
322 // This is called from the recompiled JR/JALR instructions
323 void *get_addr(u_int vaddr)
324 {
325   u_int page=get_page(vaddr);
326   u_int vpage=get_vpage(vaddr);
327   struct ll_entry *head;
328   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
329   head=jump_in[page];
330   while(head!=NULL) {
331     if(head->vaddr==vaddr&&head->reg32==0) {
332   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
333       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
334       ht_bin[3]=ht_bin[1];
335       ht_bin[2]=ht_bin[0];
336       ht_bin[1]=(int)head->addr;
337       ht_bin[0]=vaddr;
338       return head->addr;
339     }
340     head=head->next;
341   }
342   head=jump_dirty[vpage];
343   while(head!=NULL) {
344     if(head->vaddr==vaddr&&head->reg32==0) {
345       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
346       // Don't restore blocks which are about to expire from the cache
347       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
348       if(verify_dirty(head->addr)) {
349         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
350         invalid_code[vaddr>>12]=0;
351         memory_map[vaddr>>12]|=0x40000000;
352         if(vpage<2048) {
353 #ifndef DISABLE_TLB
354           if(tlb_LUT_r[vaddr>>12]) {
355             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
356             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
357           }
358 #endif
359           restore_candidate[vpage>>3]|=1<<(vpage&7);
360         }
361         else restore_candidate[page>>3]|=1<<(page&7);
362         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
363         if(ht_bin[0]==vaddr) {
364           ht_bin[1]=(int)head->addr; // Replace existing entry
365         }
366         else
367         {
368           ht_bin[3]=ht_bin[1];
369           ht_bin[2]=ht_bin[0];
370           ht_bin[1]=(int)head->addr;
371           ht_bin[0]=vaddr;
372         }
373         return head->addr;
374       }
375     }
376     head=head->next;
377   }
378   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
379   int r=new_recompile_block(vaddr);
380   if(r==0) return get_addr(vaddr);
381   // Execute in unmapped page, generate pagefault exception
382   Status|=2;
383   Cause=(vaddr<<31)|0x8;
384   EPC=(vaddr&1)?vaddr-5:vaddr;
385   BadVAddr=(vaddr&~1);
386   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
387   EntryHi=BadVAddr&0xFFFFE000;
388   return get_addr_ht(0x80000000);
389 }
390 // Look up address in hash table first
391 void *get_addr_ht(u_int vaddr)
392 {
393   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
394   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
395   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
396   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
397   return get_addr(vaddr);
398 }
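// Illustrative sketch (added): each hash_table bin caches two translations,
// laid out as {vaddr0,addr0,vaddr1,addr1}, and the bin index mixes the upper
// and lower halves of the virtual address. A hypothetical probe that only
// consults the hash table and reports a miss as NULL:
static void *example_ht_probe(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) return (void *)ht_bin[1]; // most recently used slot
  if(ht_bin[2]==vaddr) return (void *)ht_bin[3]; // second-chance slot
  return NULL;                                   // miss: fall back to the jump_in lists
}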
399
400 void *get_addr_32(u_int vaddr,u_int flags)
401 {
402   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
403   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
404   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
405   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
406   u_int page=get_page(vaddr);
407   u_int vpage=get_vpage(vaddr);
408   struct ll_entry *head;
409   head=jump_in[page];
410   while(head!=NULL) {
411     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
412       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
413       if(head->reg32==0) {
414         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
415         if(ht_bin[0]==-1) {
416           ht_bin[1]=(int)head->addr;
417           ht_bin[0]=vaddr;
418         }else if(ht_bin[2]==-1) {
419           ht_bin[3]=(int)head->addr;
420           ht_bin[2]=vaddr;
421         }
422         //ht_bin[3]=ht_bin[1];
423         //ht_bin[2]=ht_bin[0];
424         //ht_bin[1]=(int)head->addr;
425         //ht_bin[0]=vaddr;
426       }
427       return head->addr;
428     }
429     head=head->next;
430   }
431   head=jump_dirty[vpage];
432   while(head!=NULL) {
433     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
434       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
435       // Don't restore blocks which are about to expire from the cache
436       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
437       if(verify_dirty(head->addr)) {
438         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
439         invalid_code[vaddr>>12]=0;
440         memory_map[vaddr>>12]|=0x40000000;
441         if(vpage<2048) {
442 #ifndef DISABLE_TLB
443           if(tlb_LUT_r[vaddr>>12]) {
444             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
445             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
446           }
447 #endif
448           restore_candidate[vpage>>3]|=1<<(vpage&7);
449         }
450         else restore_candidate[page>>3]|=1<<(page&7);
451         if(head->reg32==0) {
452           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
453           if(ht_bin[0]==-1) {
454             ht_bin[1]=(int)head->addr;
455             ht_bin[0]=vaddr;
456           }else if(ht_bin[2]==-1) {
457             ht_bin[3]=(int)head->addr;
458             ht_bin[2]=vaddr;
459           }
460           //ht_bin[3]=ht_bin[1];
461           //ht_bin[2]=ht_bin[0];
462           //ht_bin[1]=(int)head->addr;
463           //ht_bin[0]=vaddr;
464         }
465         return head->addr;
466       }
467     }
468     head=head->next;
469   }
470   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
471   int r=new_recompile_block(vaddr);
472   if(r==0) return get_addr(vaddr);
473   // Execute in unmapped page, generate pagefault exception
474   Status|=2;
475   Cause=(vaddr<<31)|0x8;
476   EPC=(vaddr&1)?vaddr-5:vaddr;
477   BadVAddr=(vaddr&~1);
478   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
479   EntryHi=BadVAddr&0xFFFFE000;
480   return get_addr_ht(0x80000000);
481 }
482
483 void clear_all_regs(signed char regmap[])
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
487 }
488
489 signed char get_reg(signed char regmap[],int r)
490 {
491   int hr;
492   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
493   return -1;
494 }
495
496 // Find a register that is available for two consecutive cycles
497 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
498 {
499   int hr;
500   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
501   return -1;
502 }
503
504 int count_free_regs(signed char regmap[])
505 {
506   int count=0;
507   int hr;
508   for(hr=0;hr<HOST_REGS;hr++)
509   {
510     if(hr!=EXCLUDE_REG) {
511       if(regmap[hr]<0) count++;
512     }
513   }
514   return count;
515 }
516
517 void dirty_reg(struct regstat *cur,signed char reg)
518 {
519   int hr;
520   if(!reg) return;
521   for (hr=0;hr<HOST_REGS;hr++) {
522     if((cur->regmap[hr]&63)==reg) {
523       cur->dirty|=1<<hr;
524     }
525   }
526 }
527
528 // If we dirty the lower half of a 64-bit register which is now being
529 // sign-extended, we need to dump the upper half.
530 // Note: Do this only after completion of the instruction, because
531 // some instructions may need to read the full 64-bit value even if
532 // overwriting it (e.g. SLTI, DSRA32).
533 static void flush_dirty_uppers(struct regstat *cur)
534 {
535   int hr,reg;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if((cur->dirty>>hr)&1) {
538       reg=cur->regmap[hr];
539       if(reg>=64) 
540         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
541     }
542   }
543 }
544
545 void set_const(struct regstat *cur,signed char reg,uint64_t value)
546 {
547   int hr;
548   if(!reg) return;
549   for (hr=0;hr<HOST_REGS;hr++) {
550     if(cur->regmap[hr]==reg) {
551       cur->isconst|=1<<hr;
552       cur->constmap[hr]=value;
553     }
554     else if((cur->regmap[hr]^64)==reg) {
555       cur->isconst|=1<<hr;
556       cur->constmap[hr]=value>>32;
557     }
558   }
559 }
560
561 void clear_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(!reg) return;
565   for (hr=0;hr<HOST_REGS;hr++) {
566     if((cur->regmap[hr]&63)==reg) {
567       cur->isconst&=~(1<<hr);
568     }
569   }
570 }
571
572 int is_const(struct regstat *cur,signed char reg)
573 {
574   int hr;
575   if(!reg) return 1;
576   for (hr=0;hr<HOST_REGS;hr++) {
577     if((cur->regmap[hr]&63)==reg) {
578       return (cur->isconst>>hr)&1;
579     }
580   }
581   return 0;
582 }
583 uint64_t get_const(struct regstat *cur,signed char reg)
584 {
585   int hr;
586   if(!reg) return 0;
587   for (hr=0;hr<HOST_REGS;hr++) {
588     if(cur->regmap[hr]==reg) {
589       return cur->constmap[hr];
590     }
591   }
592   printf("Unknown constant in r%d\n",reg);
593   exit(1);
594 }
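// Illustrative sketch (added): how the constant-tracking helpers above are
// meant to combine during immediate folding (imm16_alloc below does this for
// real); the function name and parameters here are hypothetical.
static void example_fold_ori(struct regstat *cur,signed char rs,signed char rt,int imm16)
{
  if(is_const(cur,rs)) set_const(cur,rt,get_const(cur,rs)|imm16); // rt now has a known value
  else clear_const(cur,rt);                                       // otherwise forget any stale constant
}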
595
596 // Least soon needed registers
597 // Look at the next ten instructions and see which registers
598 // will be used.  Try not to reallocate these.
599 void lsn(u_char hsn[], int i, int *preferred_reg)
600 {
601   int j;
602   int b=-1;
603   for(j=0;j<9;j++)
604   {
605     if(i+j>=slen) {
606       j=slen-i-1;
607       break;
608     }
609     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
610     {
611       // Don't go past an unconditional jump
612       j++;
613       break;
614     }
615   }
616   for(;j>=0;j--)
617   {
618     if(rs1[i+j]) hsn[rs1[i+j]]=j;
619     if(rs2[i+j]) hsn[rs2[i+j]]=j;
620     if(rt1[i+j]) hsn[rt1[i+j]]=j;
621     if(rt2[i+j]) hsn[rt2[i+j]]=j;
622     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
623       // Stores can allocate zero
624       hsn[rs1[i+j]]=j;
625       hsn[rs2[i+j]]=j;
626     }
627     // On some architectures stores need invc_ptr
628     #if defined(HOST_IMM8)
629     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) {
630       hsn[INVCP]=j;
631     }
632     #endif
633     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
634     {
635       hsn[CCREG]=j;
636       b=j;
637     }
638   }
639   if(b>=0)
640   {
641     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
642     {
643       // Follow first branch
644       int t=(ba[i+b]-start)>>2;
645       j=7-b;if(t+j>=slen) j=slen-t-1;
646       for(;j>=0;j--)
647       {
648         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
649         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
650         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
651         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
652       }
653     }
654     // TODO: preferred register based on backward branch
655   }
656   // Delay slot should preferably not overwrite branch conditions or cycle count
657   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
658     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
659     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
660     hsn[CCREG]=1;
661     // ...or hash tables
662     hsn[RHASH]=1;
663     hsn[RHTBL]=1;
664   }
665   // Coprocessor load/store needs FTEMP, even if not declared
666   if(itype[i]==C1LS) {
667     hsn[FTEMP]=0;
668   }
669   // Load L/R also uses FTEMP as a temporary register
670   if(itype[i]==LOADLR) {
671     hsn[FTEMP]=0;
672   }
673   // Also 64-bit SDL/SDR
674   if(opcode[i]==0x2c||opcode[i]==0x2d) {
675     hsn[FTEMP]=0;
676   }
677   // Don't remove the TLB registers either
678   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) {
679     hsn[TLREG]=0;
680   }
681   // Don't remove the miniht registers
682   if(itype[i]==UJUMP||itype[i]==RJUMP)
683   {
684     hsn[RHASH]=0;
685     hsn[RHTBL]=0;
686   }
687 }
688
689 // We only want to allocate registers if we're going to use them again soon
690 int needed_again(int r, int i)
691 {
692   int j;
693   int b=-1;
694   int rn=10;
695   int hr;
696   u_char hsn[MAXREG+1];
697   int preferred_reg;
698   
699   memset(hsn,10,sizeof(hsn));
700   lsn(hsn,i,&preferred_reg);
701   
702   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
703   {
704     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
705       return 0; // Don't need any registers if exiting the block
706   }
707   for(j=0;j<9;j++)
708   {
709     if(i+j>=slen) {
710       j=slen-i-1;
711       break;
712     }
713     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
714     {
715       // Don't go past an unconditional jump
716       j++;
717       break;
718     }
719     if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d))
720     {
721       break;
722     }
723   }
724   for(;j>=1;j--)
725   {
726     if(rs1[i+j]==r) rn=j;
727     if(rs2[i+j]==r) rn=j;
728     if((unneeded_reg[i+j]>>r)&1) rn=10;
729     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
730     {
731       b=j;
732     }
733   }
734   /*
735   if(b>=0)
736   {
737     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
738     {
739       // Follow first branch
740       int o=rn;
741       int t=(ba[i+b]-start)>>2;
742       j=7-b;if(t+j>=slen) j=slen-t-1;
743       for(;j>=0;j--)
744       {
745         if(!((unneeded_reg[t+j]>>r)&1)) {
746           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
747           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
748         }
749         else rn=o;
750       }
751     }
752   }*/
753   for(hr=0;hr<HOST_REGS;hr++) {
754     if(hr!=EXCLUDE_REG) {
755       if(rn<hsn[hr]) return 1;
756     }
757   }
758   return 0;
759 }
760
761 // Try to match register allocations at the end of a loop with those
762 // at the beginning
763 int loop_reg(int i, int r, int hr)
764 {
765   int j,k;
766   for(j=0;j<9;j++)
767   {
768     if(i+j>=slen) {
769       j=slen-i-1;
770       break;
771     }
772     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
773     {
774       // Don't go past an unconditional jump
775       j++;
776       break;
777     }
778   }
779   k=0;
780   if(i>0){
781     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
782       k--;
783   }
784   for(;k<j;k++)
785   {
786     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
787     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
788     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
789     {
790       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
791       {
792         int t=(ba[i+k]-start)>>2;
793         int reg=get_reg(regs[t].regmap_entry,r);
794         if(reg>=0) return reg;
795         //reg=get_reg(regs[t+1].regmap_entry,r);
796         //if(reg>=0) return reg;
797       }
798     }
799   }
800   return hr;
801 }
802
803
804 // Allocate every register, preserving source/target regs
805 void alloc_all(struct regstat *cur,int i)
806 {
807   int hr;
808   
809   for(hr=0;hr<HOST_REGS;hr++) {
810     if(hr!=EXCLUDE_REG) {
811       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
812          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
813       {
814         cur->regmap[hr]=-1;
815         cur->dirty&=~(1<<hr);
816       }
817       // Don't need zeros
818       if((cur->regmap[hr]&63)==0)
819       {
820         cur->regmap[hr]=-1;
821         cur->dirty&=~(1<<hr);
822       }
823     }
824   }
825 }
826
827
828 void div64(int64_t dividend,int64_t divisor)
829 {
830   lo=dividend/divisor;
831   hi=dividend%divisor;
832   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
833   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
834 }
835 void divu64(uint64_t dividend,uint64_t divisor)
836 {
837   lo=dividend/divisor;
838   hi=dividend%divisor;
839   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
840   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
841 }
842
843 void mult64(uint64_t m1,uint64_t m2)
844 {
845    unsigned long long int op1, op2, op3, op4;
846    unsigned long long int result1, result2, result3, result4;
847    unsigned long long int temp1, temp2, temp3, temp4;
848    int sign = 0;
849    
850    if ((int64_t)m1 < 0)
851      {
852     op2 = -m1;
853     sign = 1 - sign;
854      }
855    else op2 = m1;
856    if ((int64_t)m2 < 0)
857      {
858     op4 = -m2;
859     sign = 1 - sign;
860      }
861    else op4 = m2;
862    
863    op1 = op2 & 0xFFFFFFFF;
864    op2 = (op2 >> 32) & 0xFFFFFFFF;
865    op3 = op4 & 0xFFFFFFFF;
866    op4 = (op4 >> 32) & 0xFFFFFFFF;
867    
868    temp1 = op1 * op3;
869    temp2 = (temp1 >> 32) + op1 * op4;
870    temp3 = op2 * op3;
871    temp4 = (temp3 >> 32) + op2 * op4;
872    
873    result1 = temp1 & 0xFFFFFFFF;
874    result2 = temp2 + (temp3 & 0xFFFFFFFF);
875    result3 = (result2 >> 32) + temp4;
876    result4 = (result3 >> 32);
877    
878    lo = result1 | (result2 << 32);
879    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
880    if (sign)
881      {
882     hi = ~hi;
883     if (!lo) hi++;
884     else lo = ~lo + 1;
885      }
886 }
887
888 void multu64(uint64_t m1,uint64_t m2)
889 {
890    unsigned long long int op1, op2, op3, op4;
891    unsigned long long int result1, result2, result3, result4;
892    unsigned long long int temp1, temp2, temp3, temp4;
893    
894    op1 = m1 & 0xFFFFFFFF;
895    op2 = (m1 >> 32) & 0xFFFFFFFF;
896    op3 = m2 & 0xFFFFFFFF;
897    op4 = (m2 >> 32) & 0xFFFFFFFF;
898    
899    temp1 = op1 * op3;
900    temp2 = (temp1 >> 32) + op1 * op4;
901    temp3 = op2 * op3;
902    temp4 = (temp3 >> 32) + op2 * op4;
903    
904    result1 = temp1 & 0xFFFFFFFF;
905    result2 = temp2 + (temp3 & 0xFFFFFFFF);
906    result3 = (result2 >> 32) + temp4;
907    result4 = (result3 >> 32);
908    
909    lo = result1 | (result2 << 32);
910    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
911    
912   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
913   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
914 }
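// Illustrative note (added): multu64() forms the 128-bit product from four
// 32-bit partial products: with m1 = a1*2^32 + a0 and m2 = b1*2^32 + b0,
//   m1*m2 = a1*b1*2^64 + (a1*b0 + a0*b1)*2^32 + a0*b0,
// and the temp/result variables above merely propagate carries between the
// 32-bit columns. A self-check sketch, assuming a compiler that provides the
// GCC unsigned __int128 extension (the emulator itself does not rely on it):
#if defined(__GNUC__) && defined(__SIZEOF_INT128__)
static int example_check_multu64(uint64_t m1,uint64_t m2)
{
  unsigned __int128 p=(unsigned __int128)m1*m2;
  multu64(m1,m2); // fills the global hi/lo register pair
  return (uint64_t)hi==(uint64_t)(p>>64) && (uint64_t)lo==(uint64_t)p;
}
#endif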
915
916 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
917 {
918   if(bits) {
919     original<<=64-bits;
920     original>>=64-bits;
921     loaded<<=bits;
922     original|=loaded;
923   }
924   else original=loaded;
925   return original;
926 }
927 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
928 {
929   if(bits^56) {
930     original>>=64-(bits^56);
931     original<<=64-(bits^56);
932     loaded>>=bits^56;
933     original|=loaded;
934   }
935   else original=loaded;
936   return original;
937 }
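// Worked example (added): ldl_merge() keeps the low `bits` bits of `original`
// and ORs in `loaded` shifted left by `bits`; ldr_merge() is the mirror image
// for the high bits. For instance:
//   ldl_merge(0x1122334455667788ULL, 0x00000000AABBCCDDULL, 24)
//     == 0x00AABBCCDD667788  (low 24 bits kept, loaded<<24 merged in)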
938
939 #ifdef __i386__
940 #include "assem_x86.c"
941 #endif
942 #ifdef __x86_64__
943 #include "assem_x64.c"
944 #endif
945 #ifdef __arm__
946 #include "assem_arm.c"
947 #endif
948
949 // Add virtual address mapping to linked list
950 void ll_add(struct ll_entry **head,int vaddr,void *addr)
951 {
952   struct ll_entry *new_entry;
953   new_entry=malloc(sizeof(struct ll_entry));
954   assert(new_entry!=NULL);
955   new_entry->vaddr=vaddr;
956   new_entry->reg32=0;
957   new_entry->addr=addr;
958   new_entry->next=*head;
959   *head=new_entry;
960 }
961
962 // Add virtual address mapping for 32-bit compiled block
963 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
964 {
965   struct ll_entry *new_entry;
966   new_entry=malloc(sizeof(struct ll_entry));
967   assert(new_entry!=NULL);
968   new_entry->vaddr=vaddr;
969   new_entry->reg32=reg32;
970   new_entry->addr=addr;
971   new_entry->next=*head;
972   *head=new_entry;
973 }
974
975 // Check if an address is already compiled
976 // but don't return addresses which are about to expire from the cache
977 void *check_addr(u_int vaddr)
978 {
979   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
980   if(ht_bin[0]==vaddr) {
981     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
982       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
983   }
984   if(ht_bin[2]==vaddr) {
985     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
986       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
987   }
988   u_int page=get_page(vaddr);
989   struct ll_entry *head;
990   head=jump_in[page];
991   while(head!=NULL) {
992     if(head->vaddr==vaddr&&head->reg32==0) {
993       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
994         // Update existing entry with current address
995         if(ht_bin[0]==vaddr) {
996           ht_bin[1]=(int)head->addr;
997           return head->addr;
998         }
999         if(ht_bin[2]==vaddr) {
1000           ht_bin[3]=(int)head->addr;
1001           return head->addr;
1002         }
1003         // Insert into hash table with low priority.
1004         // Don't evict existing entries, as they are probably
1005         // addresses that are being accessed frequently.
1006         if(ht_bin[0]==-1) {
1007           ht_bin[1]=(int)head->addr;
1008           ht_bin[0]=vaddr;
1009         }else if(ht_bin[2]==-1) {
1010           ht_bin[3]=(int)head->addr;
1011           ht_bin[2]=vaddr;
1012         }
1013         return head->addr;
1014       }
1015     }
1016     head=head->next;
1017   }
1018   return 0;
1019 }
1020
1021 void remove_hash(int vaddr)
1022 {
1023   //printf("remove hash: %x\n",vaddr);
1024   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1025   if(ht_bin[2]==vaddr) {
1026     ht_bin[2]=ht_bin[3]=-1;
1027   }
1028   if(ht_bin[0]==vaddr) {
1029     ht_bin[0]=ht_bin[2];
1030     ht_bin[1]=ht_bin[3];
1031     ht_bin[2]=ht_bin[3]=-1;
1032   }
1033 }
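// Illustrative note (added): remove_hash() evicts one translation from its
// two-entry bin and keeps the bin packed. Given a bin {A,codeA,B,codeB},
// remove_hash(B) leaves {A,codeA,-1,-1}, while remove_hash(A) promotes the
// second slot and leaves {B,codeB,-1,-1}.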
1034
1035 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1036 {
1037   struct ll_entry *next;
1038   while(*head) {
1039     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1040        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1041     {
1042       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1043       remove_hash((*head)->vaddr);
1044       next=(*head)->next;
1045       free(*head);
1046       *head=next;
1047     }
1048     else
1049     {
1050       head=&((*head)->next);
1051     }
1052   }
1053 }
1054
1055 // Remove all entries from linked list
1056 void ll_clear(struct ll_entry **head)
1057 {
1058   struct ll_entry *cur;
1059   struct ll_entry *next;
1060   if(cur=*head) {
1061     *head=0;
1062     while(cur) {
1063       next=cur->next;
1064       free(cur);
1065       cur=next;
1066     }
1067   }
1068 }
1069
1070 // Dereference the pointers and kill them if they match the given address range
1071 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1072 {
1073   while(head) {
1074     int ptr=get_pointer(head->addr);
1075     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1076     if(((ptr>>shift)==(addr>>shift)) ||
1077        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1078     {
1079       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1080       kill_pointer(head->addr);
1081     }
1082     head=head->next;
1083   }
1084 }
1085
1086 // This is called when we write to a compiled block (see do_invstub)
1087 int invalidate_page(u_int page)
1088 {
1089   int modified=0;
1090   struct ll_entry *head;
1091   struct ll_entry *next;
1092   head=jump_in[page];
1093   jump_in[page]=0;
1094   while(head!=NULL) {
1095     inv_debug("INVALIDATE: %x\n",head->vaddr);
1096     remove_hash(head->vaddr);
1097     next=head->next;
1098     free(head);
1099     head=next;
1100   }
1101   head=jump_out[page];
1102   jump_out[page]=0;
1103   while(head!=NULL) {
1104     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1105     kill_pointer(head->addr);
1106     modified=1;
1107     next=head->next;
1108     free(head);
1109     head=next;
1110   }
1111   return modified;
1112 }
1113 void invalidate_block(u_int block)
1114 {
1115   int modified;
1116   u_int page=get_page(block<<12);
1117   u_int vpage=get_vpage(block<<12);
1118   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1119   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1120   u_int first,last;
1121   first=last=page;
1122   struct ll_entry *head;
1123   head=jump_dirty[vpage];
1124   //printf("page=%d vpage=%d\n",page,vpage);
1125   while(head!=NULL) {
1126     u_int start,end;
1127     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1128       get_bounds((int)head->addr,&start,&end);
1129       //printf("start: %x end: %x\n",start,end);
1130       if(page<2048&&start>=0x80000000&&end<0x80800000) {
1131         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1132           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1133           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1134         }
1135       }
1136 #ifndef DISABLE_TLB
1137       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1138         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1139           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1140           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1141         }
1142       }
1143 #endif
1144     }
1145     head=head->next;
1146   }
1147   //printf("first=%d last=%d\n",first,last);
1148   modified=invalidate_page(page);
1149   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1150   assert(last<page+5);
1151   // Invalidate the adjacent pages if a block crosses a 4K boundary
1152   while(first<page) {
1153     invalidate_page(first);
1154     first++;
1155   }
1156   for(first=page+1;first<last;first++) {
1157     invalidate_page(first);
1158   }
1159   
1160   // Don't trap writes
1161   invalid_code[block]=1;
1162 #ifndef DISABLE_TLB
1163   // If there is a valid TLB entry for this page, remove write protect
1164   if(tlb_LUT_w[block]) {
1165     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1166     // CHECK: Is this right?
1167     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1168     u_int real_block=tlb_LUT_w[block]>>12;
1169     invalid_code[real_block]=1;
1170     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1171   }
1172   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1173 #endif
1174   #ifdef __arm__
1175   if(modified)
1176     __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1177   #endif
1178   #ifdef USE_MINI_HT
1179   memset(mini_ht,-1,sizeof(mini_ht));
1180   #endif
1181 }
1182 void invalidate_addr(u_int addr)
1183 {
1184   invalidate_block(addr>>12);
1185 }
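// Usage sketch (added, hypothetical): a guest store that hits a page with
// compiled code only knows the written address, so it invalidates at page
// granularity, e.g.:
//   if(!invalid_code[addr>>12]) invalidate_addr(addr); // forces recompilation on next run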
1186 void invalidate_all_pages()
1187 {
1188   u_int page,n;
1189   for(page=0;page<4096;page++)
1190     invalidate_page(page);
1191   for(page=0;page<1048576;page++)
1192     if(!invalid_code[page]) {
1193       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1194       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1195     }
1196   #ifdef __arm__
1197   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1198   #endif
1199   #ifdef USE_MINI_HT
1200   memset(mini_ht,-1,sizeof(mini_ht));
1201   #endif
1202   #ifndef DISABLE_TLB
1203   // TLB
1204   for(page=0;page<0x100000;page++) {
1205     if(tlb_LUT_r[page]) {
1206       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1207       if(!tlb_LUT_w[page]||!invalid_code[page])
1208         memory_map[page]|=0x40000000; // Write protect
1209     }
1210     else memory_map[page]=-1;
1211     if(page==0x80000) page=0xC0000;
1212   }
1213   tlb_hacks();
1214   #endif
1215 }
1216
1217 // Add an entry to jump_out after making a link
1218 void add_link(u_int vaddr,void *src)
1219 {
1220   u_int page=get_page(vaddr);
1221   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1222   ll_add(jump_out+page,vaddr,src);
1223   //int ptr=get_pointer(src);
1224   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1225 }
1226
1227 // If a code block was found to be unmodified (bit was set in
1228 // restore_candidate) and it remains unmodified (bit is clear
1229 // in invalid_code) then move the entries for that 4K page from
1230 // the dirty list to the clean list.
1231 void clean_blocks(u_int page)
1232 {
1233   struct ll_entry *head;
1234   inv_debug("INV: clean_blocks page=%d\n",page);
1235   head=jump_dirty[page];
1236   while(head!=NULL) {
1237     if(!invalid_code[head->vaddr>>12]) {
1238       // Don't restore blocks which are about to expire from the cache
1239       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1240         u_int start,end;
1241         if(verify_dirty((int)head->addr)) {
1242           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1243           u_int i;
1244           u_int inv=0;
1245           get_bounds((int)head->addr,&start,&end);
1246           if(start-(u_int)rdram<0x800000) {
1247             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1248               inv|=invalid_code[i];
1249             }
1250           }
1251           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1252             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1253             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1254             if(addr<start||addr>=end) inv=1;
1255           }
1256           else if((signed int)head->vaddr>=(signed int)0x80800000) {
1257             inv=1;
1258           }
1259           if(!inv) {
1260             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1261             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1262               u_int ppage=page;
1263 #ifndef DISABLE_TLB
1264               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1265 #endif
1266               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1267               //printf("page=%x, addr=%x\n",page,head->vaddr);
1268               //assert(head->vaddr>>12==(page|0x80000));
1269               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1270               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1271               if(!head->reg32) {
1272                 if(ht_bin[0]==head->vaddr) {
1273                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1274                 }
1275                 if(ht_bin[2]==head->vaddr) {
1276                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1277                 }
1278               }
1279             }
1280           }
1281         }
1282       }
1283     }
1284     head=head->next;
1285   }
1286 }
1287
1288
1289 void mov_alloc(struct regstat *current,int i)
1290 {
1291   // Note: Don't need to actually alloc the source registers
1292   if((~current->is32>>rs1[i])&1) {
1293     //alloc_reg64(current,i,rs1[i]);
1294     alloc_reg64(current,i,rt1[i]);
1295     current->is32&=~(1LL<<rt1[i]);
1296   } else {
1297     //alloc_reg(current,i,rs1[i]);
1298     alloc_reg(current,i,rt1[i]);
1299     current->is32|=(1LL<<rt1[i]);
1300   }
1301   clear_const(current,rs1[i]);
1302   clear_const(current,rt1[i]);
1303   dirty_reg(current,rt1[i]);
1304 }
1305
1306 void shiftimm_alloc(struct regstat *current,int i)
1307 {
1308   clear_const(current,rs1[i]);
1309   clear_const(current,rt1[i]);
1310   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1311   {
1312     if(rt1[i]) {
1313       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1314       else lt1[i]=rs1[i];
1315       alloc_reg(current,i,rt1[i]);
1316       current->is32|=1LL<<rt1[i];
1317       dirty_reg(current,rt1[i]);
1318     }
1319   }
1320   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1321   {
1322     if(rt1[i]) {
1323       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1324       alloc_reg64(current,i,rt1[i]);
1325       current->is32&=~(1LL<<rt1[i]);
1326       dirty_reg(current,rt1[i]);
1327     }
1328   }
1329   if(opcode2[i]==0x3c) // DSLL32
1330   {
1331     if(rt1[i]) {
1332       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1333       alloc_reg64(current,i,rt1[i]);
1334       current->is32&=~(1LL<<rt1[i]);
1335       dirty_reg(current,rt1[i]);
1336     }
1337   }
1338   if(opcode2[i]==0x3e) // DSRL32
1339   {
1340     if(rt1[i]) {
1341       alloc_reg64(current,i,rs1[i]);
1342       if(imm[i]==32) {
1343         alloc_reg64(current,i,rt1[i]);
1344         current->is32&=~(1LL<<rt1[i]);
1345       } else {
1346         alloc_reg(current,i,rt1[i]);
1347         current->is32|=1LL<<rt1[i];
1348       }
1349       dirty_reg(current,rt1[i]);
1350     }
1351   }
1352   if(opcode2[i]==0x3f) // DSRA32
1353   {
1354     if(rt1[i]) {
1355       alloc_reg64(current,i,rs1[i]);
1356       alloc_reg(current,i,rt1[i]);
1357       current->is32|=1LL<<rt1[i];
1358       dirty_reg(current,rt1[i]);
1359     }
1360   }
1361 }
1362
1363 void shift_alloc(struct regstat *current,int i)
1364 {
1365   if(rt1[i]) {
1366     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1367     {
1368       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1369       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1370       alloc_reg(current,i,rt1[i]);
1371       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1372       current->is32|=1LL<<rt1[i];
1373     } else { // DSLLV/DSRLV/DSRAV
1374       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1375       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1376       alloc_reg64(current,i,rt1[i]);
1377       current->is32&=~(1LL<<rt1[i]);
1378       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1379         alloc_reg_temp(current,i,-1);
1380     }
1381     clear_const(current,rs1[i]);
1382     clear_const(current,rs2[i]);
1383     clear_const(current,rt1[i]);
1384     dirty_reg(current,rt1[i]);
1385   }
1386 }
1387
1388 void alu_alloc(struct regstat *current,int i)
1389 {
1390   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1391     if(rt1[i]) {
1392       if(rs1[i]&&rs2[i]) {
1393         alloc_reg(current,i,rs1[i]);
1394         alloc_reg(current,i,rs2[i]);
1395       }
1396       else {
1397         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1398         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1399       }
1400       alloc_reg(current,i,rt1[i]);
1401     }
1402     current->is32|=1LL<<rt1[i];
1403   }
1404   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1405     if(rt1[i]) {
1406       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1407       {
1408         alloc_reg64(current,i,rs1[i]);
1409         alloc_reg64(current,i,rs2[i]);
1410         alloc_reg(current,i,rt1[i]);
1411       } else {
1412         alloc_reg(current,i,rs1[i]);
1413         alloc_reg(current,i,rs2[i]);
1414         alloc_reg(current,i,rt1[i]);
1415       }
1416     }
1417     current->is32|=1LL<<rt1[i];
1418   }
1419   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1420     if(rt1[i]) {
1421       if(rs1[i]&&rs2[i]) {
1422         alloc_reg(current,i,rs1[i]);
1423         alloc_reg(current,i,rs2[i]);
1424       }
1425       else
1426       {
1427         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1428         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1429       }
1430       alloc_reg(current,i,rt1[i]);
1431       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1432       {
1433         if(!((current->uu>>rt1[i])&1)) {
1434           alloc_reg64(current,i,rt1[i]);
1435         }
1436         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1437           if(rs1[i]&&rs2[i]) {
1438             alloc_reg64(current,i,rs1[i]);
1439             alloc_reg64(current,i,rs2[i]);
1440           }
1441           else
1442           {
1443             // Is it really worth it to keep 64-bit values in registers?
1444             #ifdef NATIVE_64BIT
1445             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1446             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1447             #endif
1448           }
1449         }
1450         current->is32&=~(1LL<<rt1[i]);
1451       } else {
1452         current->is32|=1LL<<rt1[i];
1453       }
1454     }
1455   }
1456   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1457     if(rt1[i]) {
1458       if(rs1[i]&&rs2[i]) {
1459         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1460           alloc_reg64(current,i,rs1[i]);
1461           alloc_reg64(current,i,rs2[i]);
1462           alloc_reg64(current,i,rt1[i]);
1463         } else {
1464           alloc_reg(current,i,rs1[i]);
1465           alloc_reg(current,i,rs2[i]);
1466           alloc_reg(current,i,rt1[i]);
1467         }
1468       }
1469       else {
1470         alloc_reg(current,i,rt1[i]);
1471         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1472           // DADD used as move, or zeroing
1473           // If we have a 64-bit source, then make the target 64 bits too
1474           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1475             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1476             alloc_reg64(current,i,rt1[i]);
1477           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1478             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1479             alloc_reg64(current,i,rt1[i]);
1480           }
1481           if(opcode2[i]>=0x2e&&rs2[i]) {
1482             // DSUB used as negation - 64-bit result
1483             // If we have a 32-bit register, extend it to 64 bits
1484             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1485             alloc_reg64(current,i,rt1[i]);
1486           }
1487         }
1488       }
1489       if(rs1[i]&&rs2[i]) {
1490         current->is32&=~(1LL<<rt1[i]);
1491       } else if(rs1[i]) {
1492         current->is32&=~(1LL<<rt1[i]);
1493         if((current->is32>>rs1[i])&1)
1494           current->is32|=1LL<<rt1[i];
1495       } else if(rs2[i]) {
1496         current->is32&=~(1LL<<rt1[i]);
1497         if((current->is32>>rs2[i])&1)
1498           current->is32|=1LL<<rt1[i];
1499       } else {
1500         current->is32|=1LL<<rt1[i];
1501       }
1502     }
1503   }
1504   clear_const(current,rs1[i]);
1505   clear_const(current,rs2[i]);
1506   clear_const(current,rt1[i]);
1507   dirty_reg(current,rt1[i]);
1508 }
1509
1510 void imm16_alloc(struct regstat *current,int i)
1511 {
1512   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1513   else lt1[i]=rs1[i];
1514   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1515   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1516     current->is32&=~(1LL<<rt1[i]);
1517     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1518       // TODO: Could preserve the 32-bit flag if the immediate is zero
1519       alloc_reg64(current,i,rt1[i]);
1520       alloc_reg64(current,i,rs1[i]);
1521     }
1522     clear_const(current,rs1[i]);
1523     clear_const(current,rt1[i]);
1524   }
1525   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1526     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1527     current->is32|=1LL<<rt1[i];
1528     clear_const(current,rs1[i]);
1529     clear_const(current,rt1[i]);
1530   }
1531   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1532     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1533       if(rs1[i]!=rt1[i]) {
1534         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1535         alloc_reg64(current,i,rt1[i]);
1536         current->is32&=~(1LL<<rt1[i]);
1537       }
1538     }
1539     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1540     if(is_const(current,rs1[i])) {
1541       int v=get_const(current,rs1[i]);
1542       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1543       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1544       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1545     }
1546     else clear_const(current,rt1[i]);
1547   }
1548   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1549     if(is_const(current,rs1[i])) {
1550       int v=get_const(current,rs1[i]);
1551       set_const(current,rt1[i],v+imm[i]);
1552     }
1553     else clear_const(current,rt1[i]);
1554     current->is32|=1LL<<rt1[i];
1555   }
1556   else {
1557     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1558     current->is32|=1LL<<rt1[i];
1559   }
1560   dirty_reg(current,rt1[i]);
1561 }
1562
1563 void load_alloc(struct regstat *current,int i)
1564 {
1565   clear_const(current,rt1[i]);
1566   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1567   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1568   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1569   if(rt1[i]) {
1570     alloc_reg(current,i,rt1[i]);
1571     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1572     {
1573       current->is32&=~(1LL<<rt1[i]);
1574       alloc_reg64(current,i,rt1[i]);
1575     }
1576     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1577     {
1578       current->is32&=~(1LL<<rt1[i]);
1579       alloc_reg64(current,i,rt1[i]);
1580       alloc_all(current,i);
1581       alloc_reg64(current,i,FTEMP);
1582     }
1583     else current->is32|=1LL<<rt1[i];
1584     dirty_reg(current,rt1[i]);
1585     // If using TLB, need a register for pointer to the mapping table
1586     if(using_tlb) alloc_reg(current,i,TLREG);
1587     // LWL/LWR need a temporary register for the old value
1588     if(opcode[i]==0x22||opcode[i]==0x26)
1589     {
1590       alloc_reg(current,i,FTEMP);
1591       alloc_reg_temp(current,i,-1);
1592     }
1593   }
1594   else
1595   {
1596     // Load to r0 (dummy load)
1597     // but we still need a register to calculate the address
1598     alloc_reg_temp(current,i,-1);
1599   }
1600 }
1601
1602 void store_alloc(struct regstat *current,int i)
1603 {
1604   clear_const(current,rs2[i]);
1605   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1606   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1607   alloc_reg(current,i,rs2[i]);
1608   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1609     alloc_reg64(current,i,rs2[i]);
1610     if(rs2[i]) alloc_reg(current,i,FTEMP);
1611   }
1612   // If using TLB, need a register for pointer to the mapping table
1613   if(using_tlb) alloc_reg(current,i,TLREG);
1614   #if defined(HOST_IMM8)
1615   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1616   else alloc_reg(current,i,INVCP);
1617   #endif
1618   if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
1619     alloc_reg(current,i,FTEMP);
1620   }
1621   // We need a temporary register for address generation
1622   alloc_reg_temp(current,i,-1);
1623 }
1624
1625 void c1ls_alloc(struct regstat *current,int i)
1626 {
1627   //clear_const(current,rs1[i]); // FIXME
1628   clear_const(current,rt1[i]);
1629   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1630   alloc_reg(current,i,CSREG); // Status
1631   alloc_reg(current,i,FTEMP);
1632   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1633     alloc_reg64(current,i,FTEMP);
1634   }
1635   // If using TLB, need a register for pointer to the mapping table
1636   if(using_tlb) alloc_reg(current,i,TLREG);
1637   #if defined(HOST_IMM8)
1638   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1639   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1640     alloc_reg(current,i,INVCP);
1641   #endif
1642   // We need a temporary register for address generation
1643   alloc_reg_temp(current,i,-1);
1644 }
1645
1646 #ifndef multdiv_alloc
1647 void multdiv_alloc(struct regstat *current,int i)
1648 {
1649   //  case 0x18: MULT
1650   //  case 0x19: MULTU
1651   //  case 0x1A: DIV
1652   //  case 0x1B: DIVU
1653   //  case 0x1C: DMULT
1654   //  case 0x1D: DMULTU
1655   //  case 0x1E: DDIV
1656   //  case 0x1F: DDIVU
1657   clear_const(current,rs1[i]);
1658   clear_const(current,rs2[i]);
1659   if(rs1[i]&&rs2[i])
1660   {
1661     if((opcode2[i]&4)==0) // 32-bit
1662     {
1663       current->u&=~(1LL<<HIREG);
1664       current->u&=~(1LL<<LOREG);
1665       alloc_reg(current,i,HIREG);
1666       alloc_reg(current,i,LOREG);
1667       alloc_reg(current,i,rs1[i]);
1668       alloc_reg(current,i,rs2[i]);
1669       current->is32|=1LL<<HIREG;
1670       current->is32|=1LL<<LOREG;
1671       dirty_reg(current,HIREG);
1672       dirty_reg(current,LOREG);
1673     }
1674     else // 64-bit
1675     {
1676       current->u&=~(1LL<<HIREG);
1677       current->u&=~(1LL<<LOREG);
1678       current->uu&=~(1LL<<HIREG);
1679       current->uu&=~(1LL<<LOREG);
1680       alloc_reg64(current,i,HIREG);
1681       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1682       alloc_reg64(current,i,rs1[i]);
1683       alloc_reg64(current,i,rs2[i]);
1684       alloc_all(current,i);
1685       current->is32&=~(1LL<<HIREG);
1686       current->is32&=~(1LL<<LOREG);
1687       dirty_reg(current,HIREG);
1688       dirty_reg(current,LOREG);
1689     }
1690   }
1691   else
1692   {
1693     // Multiply by zero is zero.
1694     // MIPS does not have a divide by zero exception.
1695     // The result is undefined, we return zero.
1696     alloc_reg(current,i,HIREG);
1697     alloc_reg(current,i,LOREG);
1698     current->is32|=1LL<<HIREG;
1699     current->is32|=1LL<<LOREG;
1700     dirty_reg(current,HIREG);
1701     dirty_reg(current,LOREG);
1702   }
1703 }
1704 #endif
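// Reference semantics for the 32-bit multiply/divide whose registers are
// allocated above (a documentation-only sketch, not used by the emitter):
// MULT/MULTU leave the 64-bit product split across HI/LO, DIV/DIVU leave
// the quotient in LO and the remainder in HI.
#if 0
static void ref_mult(int32_t a, int32_t b, int32_t *hi, int32_t *lo)
{
  int64_t p = (int64_t)a * b;  // full 64-bit product
  *lo = (int32_t)p;            // low word  -> LO
  *hi = (int32_t)(p >> 32);    // high word -> HI
}
#endif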
1705
1706 void cop0_alloc(struct regstat *current,int i)
1707 {
1708   if(opcode2[i]==0) // MFC0
1709   {
1710     if(rt1[i]) {
1711       clear_const(current,rt1[i]);
1712       alloc_all(current,i);
1713       alloc_reg(current,i,rt1[i]);
1714       current->is32|=1LL<<rt1[i];
1715       dirty_reg(current,rt1[i]);
1716     }
1717   }
1718   else if(opcode2[i]==4) // MTC0
1719   {
1720     if(rs1[i]){
1721       clear_const(current,rs1[i]);
1722       alloc_reg(current,i,rs1[i]);
1723       alloc_all(current,i);
1724     }
1725     else {
1726       alloc_all(current,i); // FIXME: Keep r0
1727       current->u&=~1LL;
1728       alloc_reg(current,i,0);
1729     }
1730   }
1731   else
1732   {
1733     // TLBR/TLBWI/TLBWR/TLBP/ERET
1734     assert(opcode2[i]==0x10);
1735     alloc_all(current,i);
1736   }
1737 }
1738
1739 void cop1_alloc(struct regstat *current,int i)
1740 {
1741   alloc_reg(current,i,CSREG); // Load status
1742   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1743   {
1744     assert(rt1[i]);
1745     clear_const(current,rt1[i]);
1746     if(opcode2[i]==1) {
1747       alloc_reg64(current,i,rt1[i]); // DMFC1
1748       current->is32&=~(1LL<<rt1[i]);
1749     }else{
1750       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1751       current->is32|=1LL<<rt1[i];
1752     }
1753     dirty_reg(current,rt1[i]);
1754     alloc_reg_temp(current,i,-1);
1755   }
1756   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1757   {
1758     if(rs1[i]){
1759       clear_const(current,rs1[i]);
1760       if(opcode2[i]==5)
1761         alloc_reg64(current,i,rs1[i]); // DMTC1
1762       else
1763         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1764       alloc_reg_temp(current,i,-1);
1765     }
1766     else {
1767       current->u&=~1LL;
1768       alloc_reg(current,i,0);
1769       alloc_reg_temp(current,i,-1);
1770     }
1771   }
1772 }
1773 void fconv_alloc(struct regstat *current,int i)
1774 {
1775   alloc_reg(current,i,CSREG); // Load status
1776   alloc_reg_temp(current,i,-1);
1777 }
1778 void float_alloc(struct regstat *current,int i)
1779 {
1780   alloc_reg(current,i,CSREG); // Load status
1781   alloc_reg_temp(current,i,-1);
1782 }
1783 void fcomp_alloc(struct regstat *current,int i)
1784 {
1785   alloc_reg(current,i,CSREG); // Load status
1786   alloc_reg(current,i,FSREG); // Load flags
1787   dirty_reg(current,FSREG); // Flag will be modified
1788   alloc_reg_temp(current,i,-1);
1789 }
1790
1791 void syscall_alloc(struct regstat *current,int i)
1792 {
1793   alloc_cc(current,i);
1794   dirty_reg(current,CCREG);
1795   alloc_all(current,i);
1796   current->isconst=0;
1797 }
1798
1799 void delayslot_alloc(struct regstat *current,int i)
1800 {
1801   switch(itype[i]) {
1802     case UJUMP:
1803     case CJUMP:
1804     case SJUMP:
1805     case RJUMP:
1806     case FJUMP:
1807     case SYSCALL:
1808     case SPAN:
1809       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1810       printf("Disabled speculative precompilation\n");
1811       stop_after_jal=1;
1812       break;
1813     case IMM16:
1814       imm16_alloc(current,i);
1815       break;
1816     case LOAD:
1817     case LOADLR:
1818       load_alloc(current,i);
1819       break;
1820     case STORE:
1821     case STORELR:
1822       store_alloc(current,i);
1823       break;
1824     case ALU:
1825       alu_alloc(current,i);
1826       break;
1827     case SHIFT:
1828       shift_alloc(current,i);
1829       break;
1830     case MULTDIV:
1831       multdiv_alloc(current,i);
1832       break;
1833     case SHIFTIMM:
1834       shiftimm_alloc(current,i);
1835       break;
1836     case MOV:
1837       mov_alloc(current,i);
1838       break;
1839     case COP0:
1840       cop0_alloc(current,i);
1841       break;
1842     case COP1:
1843       cop1_alloc(current,i);
1844       break;
1845     case C1LS:
1846       c1ls_alloc(current,i);
1847       break;
1848     case FCONV:
1849       fconv_alloc(current,i);
1850       break;
1851     case FLOAT:
1852       float_alloc(current,i);
1853       break;
1854     case FCOMP:
1855       fcomp_alloc(current,i);
1856       break;
1857   }
1858 }
1859
1860 // Special case where a branch and delay slot span two pages in virtual memory
1861 static void pagespan_alloc(struct regstat *current,int i)
1862 {
1863   current->isconst=0;
1864   current->wasconst=0;
1865   regs[i].wasconst=0;
1866   alloc_all(current,i);
1867   alloc_cc(current,i);
1868   dirty_reg(current,CCREG);
1869   if(opcode[i]==3) // JAL
1870   {
1871     alloc_reg(current,i,31);
1872     dirty_reg(current,31);
1873   }
1874   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1875   {
1876     alloc_reg(current,i,rs1[i]);
1877     if (rt1[i]==31) {
1878       alloc_reg(current,i,31);
1879       dirty_reg(current,31);
1880     }
1881   }
1882   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1883   {
1884     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1885     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1886     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1887     {
1888       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1889       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1890     }
1891   }
1892   else
1893   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1894   {
1895     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1896     if(!((current->is32>>rs1[i])&1))
1897     {
1898       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1899     }
1900   }
1901   else
1902   if(opcode[i]==0x11) // BC1
1903   {
1904     alloc_reg(current,i,FSREG);
1905     alloc_reg(current,i,CSREG);
1906   }
1907   //else ...
1908 }
1909
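// Queue a stub to be emitted after the main body of the block: each entry
// records the stub type, the address of the branch that will later be
// patched to reach the stub (addr), the address to return to (retaddr),
// and up to five type-specific arguments (a..e).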
1910 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1911 {
1912   stubs[stubcount][0]=type;
1913   stubs[stubcount][1]=addr;
1914   stubs[stubcount][2]=retaddr;
1915   stubs[stubcount][3]=a;
1916   stubs[stubcount][4]=b;
1917   stubs[stubcount][5]=c;
1918   stubs[stubcount][6]=d;
1919   stubs[stubcount][7]=e;
1920   stubcount++;
1921 }
1922
1923 // Write out a single register
1924 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1925 {
1926   int hr;
1927   for(hr=0;hr<HOST_REGS;hr++) {
1928     if(hr!=EXCLUDE_REG) {
1929       if((regmap[hr]&63)==r) {
1930         if((dirty>>hr)&1) {
1931           if(regmap[hr]<64) {
1932             emit_storereg(r,hr);
1933 #ifndef FORCE32
1934             if((is32>>regmap[hr])&1) {
1935               emit_sarimm(hr,31,hr);
1936               emit_storereg(r|64,hr);
1937             }
1938 #endif
1939           }else{
1940             emit_storereg(r|64,hr);
1941           }
1942         }
1943       }
1944     }
1945   }
1946 }
1947
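// Debug-trace helpers: mchecksum() folds the first 8MB of RDRAM into one
// word by shifting left, rotating in the complement of the old top bit and
// XORing in each word; rchecksum() XORs together the 64 32-bit halves of
// the register file.  Both are only used by the tracing code below.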
1948 int mchecksum()
1949 {
1950   //if(!tracedebug) return 0;
1951   int i;
1952   int sum=0;
1953   for(i=0;i<2097152;i++) {
1954     unsigned int temp=sum;
1955     sum<<=1;
1956     sum|=(~temp)>>31;
1957     sum^=((u_int *)rdram)[i];
1958   }
1959   return sum;
1960 }
1961 int rchecksum()
1962 {
1963   int i;
1964   int sum=0;
1965   for(i=0;i<64;i++)
1966     sum^=((u_int *)reg)[i];
1967   return sum;
1968 }
1969 void rlist()
1970 {
1971   int i;
1972   printf("TRACE: ");
1973   for(i=0;i<32;i++)
1974     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1975   printf("\n");
1976 #ifndef DISABLE_COP1
1977   printf("TRACE: ");
1978   for(i=0;i<32;i++)
1979     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1980   printf("\n");
1981 #endif
1982 }
1983
1984 void enabletrace()
1985 {
1986   tracedebug=1;
1987 }
1988
1989 void memdebug(int i)
1990 {
1991   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1992   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1993   //rlist();
1994   //if(tracedebug) {
1995   //if(Count>=-2084597794) {
1996   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1997   //if(0) {
1998     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1999     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2000     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2001     rlist();
2002     #ifdef __i386__
2003     printf("TRACE: %x\n",(&i)[-1]);
2004     #endif
2005     #ifdef __arm__
2006     int j;
2007     printf("TRACE: %x \n",(&j)[10]);
2008     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2009     #endif
2010     //fflush(stdout);
2011   }
2012   //printf("TRACE: %x\n",(&i)[-1]);
2013 }
2014
2015 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2016 {
2017   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2018 }
2019
2020 void alu_assemble(int i,struct regstat *i_regs)
2021 {
2022   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2023     if(rt1[i]) {
2024       signed char s1,s2,t;
2025       t=get_reg(i_regs->regmap,rt1[i]);
2026       if(t>=0) {
2027         s1=get_reg(i_regs->regmap,rs1[i]);
2028         s2=get_reg(i_regs->regmap,rs2[i]);
2029         if(rs1[i]&&rs2[i]) {
2030           assert(s1>=0);
2031           assert(s2>=0);
2032           if(opcode2[i]&2) emit_sub(s1,s2,t);
2033           else emit_add(s1,s2,t);
2034         }
2035         else if(rs1[i]) {
2036           if(s1>=0) emit_mov(s1,t);
2037           else emit_loadreg(rs1[i],t);
2038         }
2039         else if(rs2[i]) {
2040           if(s2>=0) {
2041             if(opcode2[i]&2) emit_neg(s2,t);
2042             else emit_mov(s2,t);
2043           }
2044           else {
2045             emit_loadreg(rs2[i],t);
2046             if(opcode2[i]&2) emit_neg(t,t);
2047           }
2048         }
2049         else emit_zeroreg(t);
2050       }
2051     }
2052   }
2053   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2054     if(rt1[i]) {
2055       signed char s1l,s2l,s1h,s2h,tl,th;
2056       tl=get_reg(i_regs->regmap,rt1[i]);
2057       th=get_reg(i_regs->regmap,rt1[i]|64);
2058       if(tl>=0) {
2059         s1l=get_reg(i_regs->regmap,rs1[i]);
2060         s2l=get_reg(i_regs->regmap,rs2[i]);
2061         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2062         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2063         if(rs1[i]&&rs2[i]) {
2064           assert(s1l>=0);
2065           assert(s2l>=0);
2066           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2067           else emit_adds(s1l,s2l,tl);
2068           if(th>=0) {
2069             #ifdef INVERTED_CARRY
2070             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2071             #else
2072             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2073             #endif
2074             else emit_add(s1h,s2h,th); // note: plain add, the carry from emit_adds above is not propagated (moot for R3000 code, which has no 64-bit ALU ops)
2075           }
2076         }
2077         else if(rs1[i]) {
2078           if(s1l>=0) emit_mov(s1l,tl);
2079           else emit_loadreg(rs1[i],tl);
2080           if(th>=0) {
2081             if(s1h>=0) emit_mov(s1h,th);
2082             else emit_loadreg(rs1[i]|64,th);
2083           }
2084         }
2085         else if(rs2[i]) {
2086           if(s2l>=0) {
2087             if(opcode2[i]&2) emit_negs(s2l,tl);
2088             else emit_mov(s2l,tl);
2089           }
2090           else {
2091             emit_loadreg(rs2[i],tl);
2092             if(opcode2[i]&2) emit_negs(tl,tl);
2093           }
2094           if(th>=0) {
2095             #ifdef INVERTED_CARRY
2096             if(s2h>=0) emit_mov(s2h,th);
2097             else emit_loadreg(rs2[i]|64,th);
2098             if(opcode2[i]&2) {
2099               emit_adcimm(-1,th); // x86 has inverted carry flag
2100               emit_not(th,th);
2101             }
2102             #else
2103             if(opcode2[i]&2) {
2104               if(s2h>=0) emit_rscimm(s2h,0,th);
2105               else {
2106                 emit_loadreg(rs2[i]|64,th);
2107                 emit_rscimm(th,0,th);
2108               }
2109             }else{
2110               if(s2h>=0) emit_mov(s2h,th);
2111               else emit_loadreg(rs2[i]|64,th);
2112             }
2113             #endif
2114           }
2115         }
2116         else {
2117           emit_zeroreg(tl);
2118           if(th>=0) emit_zeroreg(th);
2119         }
2120       }
2121     }
2122   }
2123   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2124     if(rt1[i]) {
2125       signed char s1l,s1h,s2l,s2h,t;
2126       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2127       {
2128         t=get_reg(i_regs->regmap,rt1[i]);
2129         //assert(t>=0);
2130         if(t>=0) {
2131           s1l=get_reg(i_regs->regmap,rs1[i]);
2132           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2133           s2l=get_reg(i_regs->regmap,rs2[i]);
2134           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2135           if(rs2[i]==0) // rx<r0
2136           {
2137             assert(s1h>=0);
2138             if(opcode2[i]==0x2a) // SLT
2139               emit_shrimm(s1h,31,t);
2140             else // SLTU (unsigned can not be less than zero)
2141               emit_zeroreg(t);
2142           }
2143           else if(rs1[i]==0) // r0<rx
2144           {
2145             assert(s2h>=0);
2146             if(opcode2[i]==0x2a) // SLT
2147               emit_set_gz64_32(s2h,s2l,t);
2148             else // SLTU (set if not zero)
2149               emit_set_nz64_32(s2h,s2l,t);
2150           }
2151           else {
2152             assert(s1l>=0);assert(s1h>=0);
2153             assert(s2l>=0);assert(s2h>=0);
2154             if(opcode2[i]==0x2a) // SLT
2155               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2156             else // SLTU
2157               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2158           }
2159         }
2160       } else {
2161         t=get_reg(i_regs->regmap,rt1[i]);
2162         //assert(t>=0);
2163         if(t>=0) {
2164           s1l=get_reg(i_regs->regmap,rs1[i]);
2165           s2l=get_reg(i_regs->regmap,rs2[i]);
2166           if(rs2[i]==0) // rx<r0
2167           {
2168             assert(s1l>=0);
2169             if(opcode2[i]==0x2a) // SLT
2170               emit_shrimm(s1l,31,t);
2171             else // SLTU (unsigned can not be less than zero)
2172               emit_zeroreg(t);
2173           }
2174           else if(rs1[i]==0) // r0<rx
2175           {
2176             assert(s2l>=0);
2177             if(opcode2[i]==0x2a) // SLT
2178               emit_set_gz32(s2l,t);
2179             else // SLTU (set if not zero)
2180               emit_set_nz32(s2l,t);
2181           }
2182           else{
2183             assert(s1l>=0);assert(s2l>=0);
2184             if(opcode2[i]==0x2a) // SLT
2185               emit_set_if_less32(s1l,s2l,t);
2186             else // SLTU
2187               emit_set_if_carry32(s1l,s2l,t);
2188           }
2189         }
2190       }
2191     }
2192   }
2193   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2194     if(rt1[i]) {
2195       signed char s1l,s1h,s2l,s2h,th,tl;
2196       tl=get_reg(i_regs->regmap,rt1[i]);
2197       th=get_reg(i_regs->regmap,rt1[i]|64);
2198       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2199       {
2200         assert(tl>=0);
2201         if(tl>=0) {
2202           s1l=get_reg(i_regs->regmap,rs1[i]);
2203           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2204           s2l=get_reg(i_regs->regmap,rs2[i]);
2205           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2206           if(rs1[i]&&rs2[i]) {
2207             assert(s1l>=0);assert(s1h>=0);
2208             assert(s2l>=0);assert(s2h>=0);
2209             if(opcode2[i]==0x24) { // AND
2210               emit_and(s1l,s2l,tl);
2211               emit_and(s1h,s2h,th);
2212             } else
2213             if(opcode2[i]==0x25) { // OR
2214               emit_or(s1l,s2l,tl);
2215               emit_or(s1h,s2h,th);
2216             } else
2217             if(opcode2[i]==0x26) { // XOR
2218               emit_xor(s1l,s2l,tl);
2219               emit_xor(s1h,s2h,th);
2220             } else
2221             if(opcode2[i]==0x27) { // NOR
2222               emit_or(s1l,s2l,tl);
2223               emit_or(s1h,s2h,th);
2224               emit_not(tl,tl);
2225               emit_not(th,th);
2226             }
2227           }
2228           else
2229           {
2230             if(opcode2[i]==0x24) { // AND
2231               emit_zeroreg(tl);
2232               emit_zeroreg(th);
2233             } else
2234             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2235               if(rs1[i]){
2236                 if(s1l>=0) emit_mov(s1l,tl);
2237                 else emit_loadreg(rs1[i],tl);
2238                 if(s1h>=0) emit_mov(s1h,th);
2239                 else emit_loadreg(rs1[i]|64,th);
2240               }
2241               else
2242               if(rs2[i]){
2243                 if(s2l>=0) emit_mov(s2l,tl);
2244                 else emit_loadreg(rs2[i],tl);
2245                 if(s2h>=0) emit_mov(s2h,th);
2246                 else emit_loadreg(rs2[i]|64,th);
2247               }
2248               else{
2249                 emit_zeroreg(tl);
2250                 emit_zeroreg(th);
2251               }
2252             } else
2253             if(opcode2[i]==0x27) { // NOR
2254               if(rs1[i]){
2255                 if(s1l>=0) emit_not(s1l,tl);
2256                 else{
2257                   emit_loadreg(rs1[i],tl);
2258                   emit_not(tl,tl);
2259                 }
2260                 if(s1h>=0) emit_not(s1h,th);
2261                 else{
2262                   emit_loadreg(rs1[i]|64,th);
2263                   emit_not(th,th);
2264                 }
2265               }
2266               else
2267               if(rs2[i]){
2268                 if(s2l>=0) emit_not(s2l,tl);
2269                 else{
2270                   emit_loadreg(rs2[i],tl);
2271                   emit_not(tl,tl);
2272                 }
2273                 if(s2h>=0) emit_not(s2h,th);
2274                 else{
2275                   emit_loadreg(rs2[i]|64,th);
2276                   emit_not(th,th);
2277                 }
2278               }
2279               else {
2280                 emit_movimm(-1,tl);
2281                 emit_movimm(-1,th);
2282               }
2283             }
2284           }
2285         }
2286       }
2287       else
2288       {
2289         // 32 bit
2290         if(tl>=0) {
2291           s1l=get_reg(i_regs->regmap,rs1[i]);
2292           s2l=get_reg(i_regs->regmap,rs2[i]);
2293           if(rs1[i]&&rs2[i]) {
2294             assert(s1l>=0);
2295             assert(s2l>=0);
2296             if(opcode2[i]==0x24) { // AND
2297               emit_and(s1l,s2l,tl);
2298             } else
2299             if(opcode2[i]==0x25) { // OR
2300               emit_or(s1l,s2l,tl);
2301             } else
2302             if(opcode2[i]==0x26) { // XOR
2303               emit_xor(s1l,s2l,tl);
2304             } else
2305             if(opcode2[i]==0x27) { // NOR
2306               emit_or(s1l,s2l,tl);
2307               emit_not(tl,tl);
2308             }
2309           }
2310           else
2311           {
2312             if(opcode2[i]==0x24) { // AND
2313               emit_zeroreg(tl);
2314             } else
2315             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2316               if(rs1[i]){
2317                 if(s1l>=0) emit_mov(s1l,tl);
2318                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2319               }
2320               else
2321               if(rs2[i]){
2322                 if(s2l>=0) emit_mov(s2l,tl);
2323                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2324               }
2325               else emit_zeroreg(tl);
2326             } else
2327             if(opcode2[i]==0x27) { // NOR
2328               if(rs1[i]){
2329                 if(s1l>=0) emit_not(s1l,tl);
2330                 else {
2331                   emit_loadreg(rs1[i],tl);
2332                   emit_not(tl,tl);
2333                 }
2334               }
2335               else
2336               if(rs2[i]){
2337                 if(s2l>=0) emit_not(s2l,tl);
2338                 else {
2339                   emit_loadreg(rs2[i],tl);
2340                   emit_not(tl,tl);
2341                 }
2342               }
2343               else emit_movimm(-1,tl);
2344             }
2345           }
2346         }
2347       }
2348     }
2349   }
2350 }
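// Reference semantics for the SLT/SLTU cases assembled above (sketch only,
// kept out of the build).  The "was32" fast path compares just the low
// words when both sources are known to be sign-extended 32-bit values.
#if 0
static int ref_slt(int64_t a, int64_t b)    { return a < b; }  // SLT: signed compare
static int ref_sltu(uint64_t a, uint64_t b) { return a < b; }  // SLTU: unsigned compare
#endif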
2351
2352 void imm16_assemble(int i,struct regstat *i_regs)
2353 {
2354   if (opcode[i]==0x0f) { // LUI
2355     if(rt1[i]) {
2356       signed char t;
2357       t=get_reg(i_regs->regmap,rt1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(!((i_regs->isconst>>t)&1))
2361           emit_movimm(imm[i]<<16,t);
2362       }
2363     }
2364   }
2365   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2366     if(rt1[i]) {
2367       signed char s,t;
2368       t=get_reg(i_regs->regmap,rt1[i]);
2369       s=get_reg(i_regs->regmap,rs1[i]);
2370       if(rs1[i]) {
2371         //assert(t>=0);
2372         //assert(s>=0);
2373         if(t>=0) {
2374           if(!((i_regs->isconst>>t)&1)) {
2375             if(s<0) {
2376               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2377               emit_addimm(t,imm[i],t);
2378             }else{
2379               if(!((i_regs->wasconst>>s)&1))
2380                 emit_addimm(s,imm[i],t);
2381               else
2382                 emit_movimm(constmap[i][s]+imm[i],t);
2383             }
2384           }
2385         }
2386       } else {
2387         if(t>=0) {
2388           if(!((i_regs->isconst>>t)&1))
2389             emit_movimm(imm[i],t);
2390         }
2391       }
2392     }
2393   }
2394   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2395     if(rt1[i]) {
2396       signed char sh,sl,th,tl;
2397       th=get_reg(i_regs->regmap,rt1[i]|64);
2398       tl=get_reg(i_regs->regmap,rt1[i]);
2399       sh=get_reg(i_regs->regmap,rs1[i]|64);
2400       sl=get_reg(i_regs->regmap,rs1[i]);
2401       if(tl>=0) {
2402         if(rs1[i]) {
2403           assert(sh>=0);
2404           assert(sl>=0);
2405           if(th>=0) {
2406             emit_addimm64_32(sh,sl,imm[i],th,tl);
2407           }
2408           else {
2409             emit_addimm(sl,imm[i],tl);
2410           }
2411         } else {
2412           emit_movimm(imm[i],tl);
2413           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2414         }
2415       }
2416     }
2417   }
2418   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2419     if(rt1[i]) {
2420       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2421       signed char sh,sl,t;
2422       t=get_reg(i_regs->regmap,rt1[i]);
2423       sh=get_reg(i_regs->regmap,rs1[i]|64);
2424       sl=get_reg(i_regs->regmap,rs1[i]);
2425       //assert(t>=0);
2426       if(t>=0) {
2427         if(rs1[i]>0) {
2428           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2429           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2430             if(opcode[i]==0x0a) { // SLTI
2431               if(sl<0) {
2432                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2433                 emit_slti32(t,imm[i],t);
2434               }else{
2435                 emit_slti32(sl,imm[i],t);
2436               }
2437             }
2438             else { // SLTIU
2439               if(sl<0) {
2440                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2441                 emit_sltiu32(t,imm[i],t);
2442               }else{
2443                 emit_sltiu32(sl,imm[i],t);
2444               }
2445             }
2446           }else{ // 64-bit
2447             assert(sl>=0);
2448             if(opcode[i]==0x0a) // SLTI
2449               emit_slti64_32(sh,sl,imm[i],t);
2450             else // SLTIU
2451               emit_sltiu64_32(sh,sl,imm[i],t);
2452           }
2453         }else{
2454           // SLTI(U) with r0 is just stupid,
2455           // nonetheless examples can be found
2456           if(opcode[i]==0x0a) { // SLTI
2457             if(imm[i]>0) emit_movimm(1,t);
2458             else emit_zeroreg(t);
2459           }
2460           else { // SLTIU
2461             if(imm[i]) emit_movimm(1,t);
2462             else emit_zeroreg(t);
2463           }
2464         }
2465       }
2466     }
2467   }
2468   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2469     if(rt1[i]) {
2470       signed char sh,sl,th,tl;
2471       th=get_reg(i_regs->regmap,rt1[i]|64);
2472       tl=get_reg(i_regs->regmap,rt1[i]);
2473       sh=get_reg(i_regs->regmap,rs1[i]|64);
2474       sl=get_reg(i_regs->regmap,rs1[i]);
2475       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2476         if(opcode[i]==0x0c) //ANDI
2477         {
2478           if(rs1[i]) {
2479             if(sl<0) {
2480               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2481               emit_andimm(tl,imm[i],tl);
2482             }else{
2483               if(!((i_regs->wasconst>>sl)&1))
2484                 emit_andimm(sl,imm[i],tl);
2485               else
2486                 emit_movimm(constmap[i][sl]&imm[i],tl);
2487             }
2488           }
2489           else
2490             emit_zeroreg(tl);
2491           if(th>=0) emit_zeroreg(th);
2492         }
2493         else
2494         {
2495           if(rs1[i]) {
2496             if(sl<0) {
2497               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2498             }
2499             if(th>=0) {
2500               if(sh<0) {
2501                 emit_loadreg(rs1[i]|64,th);
2502               }else{
2503                 emit_mov(sh,th);
2504               }
2505             }
2506             if(opcode[i]==0x0d) { //ORI
2507               if(sl<0) {
2508                 emit_orimm(tl,imm[i],tl);
2509               }else if(!((i_regs->wasconst>>sl)&1)) {
2510                 emit_orimm(sl,imm[i],tl);
2511               }else{
2512                 emit_movimm(constmap[i][sl]|imm[i],tl);
2513               }
2514             }
2515             if(opcode[i]==0x0e) { //XORI
2516               if(sl<0) {
2517                 emit_xorimm(tl,imm[i],tl);
2518               }else if(!((i_regs->wasconst>>sl)&1)) {
2519                 emit_xorimm(sl,imm[i],tl);
2520               }else{
2521                 emit_movimm(constmap[i][sl]^imm[i],tl);
2522               }
2523             }
2524           }
2525           else {
2526             emit_movimm(imm[i],tl);
2527             if(th>=0) emit_zeroreg(th);
2528           }
2529         }
2530       }
2531     }
2532   }
2533 }
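// Note on the immediate ops above: when the source register holds a known
// constant (wasconst), the operation is folded at assembly time and a single
// movimm of the precomputed result is emitted, e.g. constmap[i][sl]|imm[i]
// for ORI.  Shape of the fold (sketch; the helper name is made up for
// illustration):
#if 0
static u_int ref_fold_ori(u_int known_value, u_int imm16)
{
  return known_value | imm16;  // what emit_movimm() then materializes
}
#endif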
2534
2535 void shiftimm_assemble(int i,struct regstat *i_regs)
2536 {
2537   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2538   {
2539     if(rt1[i]) {
2540       signed char s,t;
2541       t=get_reg(i_regs->regmap,rt1[i]);
2542       s=get_reg(i_regs->regmap,rs1[i]);
2543       //assert(t>=0);
2544       if(t>=0){
2545         if(rs1[i]==0)
2546         {
2547           emit_zeroreg(t);
2548         }
2549         else
2550         {
2551           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2552           if(imm[i]) {
2553             if(opcode2[i]==0) // SLL
2554             {
2555               emit_shlimm(s<0?t:s,imm[i],t);
2556             }
2557             if(opcode2[i]==2) // SRL
2558             {
2559               emit_shrimm(s<0?t:s,imm[i],t);
2560             }
2561             if(opcode2[i]==3) // SRA
2562             {
2563               emit_sarimm(s<0?t:s,imm[i],t);
2564             }
2565           }else{
2566             // Shift by zero
2567             if(s>=0 && s!=t) emit_mov(s,t);
2568           }
2569         }
2570       }
2571       //emit_storereg(rt1[i],t); //DEBUG
2572     }
2573   }
2574   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2575   {
2576     if(rt1[i]) {
2577       signed char sh,sl,th,tl;
2578       th=get_reg(i_regs->regmap,rt1[i]|64);
2579       tl=get_reg(i_regs->regmap,rt1[i]);
2580       sh=get_reg(i_regs->regmap,rs1[i]|64);
2581       sl=get_reg(i_regs->regmap,rs1[i]);
2582       if(tl>=0) {
2583         if(rs1[i]==0)
2584         {
2585           emit_zeroreg(tl);
2586           if(th>=0) emit_zeroreg(th);
2587         }
2588         else
2589         {
2590           assert(sl>=0);
2591           assert(sh>=0);
2592           if(imm[i]) {
2593             if(opcode2[i]==0x38) // DSLL
2594             {
2595               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2596               emit_shlimm(sl,imm[i],tl);
2597             }
2598             if(opcode2[i]==0x3a) // DSRL
2599             {
2600               emit_shrdimm(sl,sh,imm[i],tl);
2601               if(th>=0) emit_shrimm(sh,imm[i],th);
2602             }
2603             if(opcode2[i]==0x3b) // DSRA
2604             {
2605               emit_shrdimm(sl,sh,imm[i],tl);
2606               if(th>=0) emit_sarimm(sh,imm[i],th);
2607             }
2608           }else{
2609             // Shift by zero
2610             if(sl!=tl) emit_mov(sl,tl);
2611             if(th>=0&&sh!=th) emit_mov(sh,th);
2612           }
2613         }
2614       }
2615     }
2616   }
2617   if(opcode2[i]==0x3c) // DSLL32
2618   {
2619     if(rt1[i]) {
2620       signed char sl,tl,th;
2621       tl=get_reg(i_regs->regmap,rt1[i]);
2622       th=get_reg(i_regs->regmap,rt1[i]|64);
2623       sl=get_reg(i_regs->regmap,rs1[i]);
2624       if(th>=0||tl>=0){
2625         assert(tl>=0);
2626         assert(th>=0);
2627         assert(sl>=0);
2628         emit_mov(sl,th);
2629         emit_zeroreg(tl);
2630         if(imm[i]>32)
2631         {
2632           emit_shlimm(th,imm[i]&31,th);
2633         }
2634       }
2635     }
2636   }
2637   if(opcode2[i]==0x3e) // DSRL32
2638   {
2639     if(rt1[i]) {
2640       signed char sh,tl,th;
2641       tl=get_reg(i_regs->regmap,rt1[i]);
2642       th=get_reg(i_regs->regmap,rt1[i]|64);
2643       sh=get_reg(i_regs->regmap,rs1[i]|64);
2644       if(tl>=0){
2645         assert(sh>=0);
2646         emit_mov(sh,tl);
2647         if(th>=0) emit_zeroreg(th);
2648         if(imm[i]>32)
2649         {
2650           emit_shrimm(tl,imm[i]&31,tl);
2651         }
2652       }
2653     }
2654   }
2655   if(opcode2[i]==0x3f) // DSRA32
2656   {
2657     if(rt1[i]) {
2658       signed char sh,tl;
2659       tl=get_reg(i_regs->regmap,rt1[i]);
2660       sh=get_reg(i_regs->regmap,rs1[i]|64);
2661       if(tl>=0){
2662         assert(sh>=0);
2663         emit_mov(sh,tl);
2664         if(imm[i]>32)
2665         {
2666           emit_sarimm(tl,imm[i]&31,tl);
2667         }
2668       }
2669     }
2670   }
2671 }
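// Reference semantics for the "+32" doubleword shifts assembled above
// (sketch only).  imm[i] already includes the +32 here (the code tests
// imm[i]>32 and shifts by imm[i]&31), so the emitter just moves data
// between the low and high host registers and applies the remaining bits.
#if 0
static uint64_t ref_dsll32(uint64_t rt, unsigned sa) { return rt << (sa + 32); }
static uint64_t ref_dsrl32(uint64_t rt, unsigned sa) { return rt >> (sa + 32); }
// assumes arithmetic right shift of signed values, as on the supported compilers
static int64_t  ref_dsra32(int64_t rt, unsigned sa)  { return rt >> (sa + 32); }
#endif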
2672
2673 #ifndef shift_assemble
2674 void shift_assemble(int i,struct regstat *i_regs)
2675 {
2676   printf("Need shift_assemble for this architecture.\n");
2677   exit(1);
2678 }
2679 #endif
2680
2681 void load_assemble(int i,struct regstat *i_regs)
2682 {
2683   int s,th,tl,addr,map=-1;
2684   int offset;
2685   int jaddr=0;
2686   int memtarget=0,c=0;
2687   u_int hr,reglist=0;
2688   th=get_reg(i_regs->regmap,rt1[i]|64);
2689   tl=get_reg(i_regs->regmap,rt1[i]);
2690   s=get_reg(i_regs->regmap,rs1[i]);
2691   offset=imm[i];
2692   for(hr=0;hr<HOST_REGS;hr++) {
2693     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2694   }
2695   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2696   if(s>=0) {
2697     c=(i_regs->wasconst>>s)&1;
2698     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2699     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2700   }
2701   if(offset||s<0||c) addr=tl;
2702   else addr=s;
2703   //printf("load_assemble: c=%d\n",c);
2704   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2705   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2706   if(tl>=0) {
2707     //assert(tl>=0);
2708     //assert(rt1[i]);
2709     reglist&=~(1<<tl);
2710     if(th>=0) reglist&=~(1<<th);
2711     if(!using_tlb) {
2712       if(!c) {
2713 //#define R29_HACK 1
2714         #ifdef R29_HACK
2715         // Strmnnrmn's speed hack
2716         if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2717         #endif
2718         {
2719           emit_cmpimm(addr,0x800000);
2720           jaddr=(int)out;
2721           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2722           // Hint to branch predictor that the branch is unlikely to be taken
2723           if(rs1[i]>=28)
2724             emit_jno_unlikely(0);
2725           else
2726           #endif
2727           emit_jno(0);
2728         }
2729       }
2730     }else{ // using tlb
2731       int x=0;
2732       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2733       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2734       map=get_reg(i_regs->regmap,TLREG);
2735       assert(map>=0);
2736       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2737       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2738     }
2739     if (opcode[i]==0x20) { // LB
2740       if(!c||memtarget) {
2741         #ifdef HOST_IMM_ADDR32
2742         if(c)
2743           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2744         else
2745         #endif
2746         {
2747           //emit_xorimm(addr,3,tl);
2748           //gen_tlb_addr_r(tl,map);
2749           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2750           int x=0;
2751 #ifdef BIG_ENDIAN_MIPS
2752           if(!c) emit_xorimm(addr,3,tl);
2753           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2754 #else
2755           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2756           else if (tl!=addr) emit_mov(addr,tl);
2757 #endif
2758           emit_movsbl_indexed_tlb(x,tl,map,tl);
2759         }
2760         if(jaddr)
2761           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2762       }
2763       else
2764         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2765     }
2766     if (opcode[i]==0x21) { // LH
2767       if(!c||memtarget) {
2768         #ifdef HOST_IMM_ADDR32
2769         if(c)
2770           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2771         else
2772         #endif
2773         {
2774           int x=0;
2775 #ifdef BIG_ENDIAN_MIPS
2776           if(!c) emit_xorimm(addr,2,tl);
2777           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2778 #else
2779           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2780           else if (tl!=addr) emit_mov(addr,tl);
2781 #endif
2782           //#ifdef
2783           //emit_movswl_indexed_tlb(x,tl,map,tl);
2784           //else
2785           if(map>=0) {
2786             gen_tlb_addr_r(tl,map);
2787             emit_movswl_indexed(x,tl,tl);
2788           }else
2789             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2790         }
2791         if(jaddr)
2792           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2793       }
2794       else
2795         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2796     }
2797     if (opcode[i]==0x23) { // LW
2798       if(!c||memtarget) {
2799         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2800         #ifdef HOST_IMM_ADDR32
2801         if(c)
2802           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2803         else
2804         #endif
2805         emit_readword_indexed_tlb(0,addr,map,tl);
2806         if(jaddr)
2807           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2808       }
2809       else
2810         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2811     }
2812     if (opcode[i]==0x24) { // LBU
2813       if(!c||memtarget) {
2814         #ifdef HOST_IMM_ADDR32
2815         if(c)
2816           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2817         else
2818         #endif
2819         {
2820           //emit_xorimm(addr,3,tl);
2821           //gen_tlb_addr_r(tl,map);
2822           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2823           int x=0;
2824 #ifdef BIG_ENDIAN_MIPS
2825           if(!c) emit_xorimm(addr,3,tl);
2826           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2827 #else
2828           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2829           else if (tl!=addr) emit_mov(addr,tl);
2830 #endif
2831           emit_movzbl_indexed_tlb(x,tl,map,tl);
2832         }
2833         if(jaddr)
2834           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2835       }
2836       else
2837         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2838     }
2839     if (opcode[i]==0x25) { // LHU
2840       if(!c||memtarget) {
2841         #ifdef HOST_IMM_ADDR32
2842         if(c)
2843           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2844         else
2845         #endif
2846         {
2847           int x=0;
2848 #ifdef BIG_ENDIAN_MIPS
2849           if(!c) emit_xorimm(addr,2,tl);
2850           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2851 #else
2852           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2853           else if (tl!=addr) emit_mov(addr,tl);
2854 #endif
2855           //#ifdef
2856           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2857           //#else
2858           if(map>=0) {
2859             gen_tlb_addr_r(tl,map);
2860             emit_movzwl_indexed(x,tl,tl);
2861           }else
2862             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2863           if(jaddr)
2864             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2865         }
2866       }
2867       else
2868         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2869     }
2870     if (opcode[i]==0x27) { // LWU
2871       assert(th>=0);
2872       if(!c||memtarget) {
2873         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2874         #ifdef HOST_IMM_ADDR32
2875         if(c)
2876           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2877         else
2878         #endif
2879         emit_readword_indexed_tlb(0,addr,map,tl);
2880         if(jaddr)
2881           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2882       }
2883       else {
2884         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2885       }
2886       emit_zeroreg(th);
2887     }
2888     if (opcode[i]==0x37) { // LD
2889       if(!c||memtarget) {
2890         //gen_tlb_addr_r(tl,map);
2891         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2892         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2893         #ifdef HOST_IMM_ADDR32
2894         if(c)
2895           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2896         else
2897         #endif
2898         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2899         if(jaddr)
2900           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2901       }
2902       else
2903         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2904     }
2905     //emit_storereg(rt1[i],tl); // DEBUG
2906   }
2907   //if(opcode[i]==0x23)
2908   //if(opcode[i]==0x24)
2909   //if(opcode[i]==0x23||opcode[i]==0x24)
2910   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2911   {
2912     //emit_pusha();
2913     save_regs(0x100f);
2914         emit_readword((int)&last_count,ECX);
2915         #ifdef __i386__
2916         if(get_reg(i_regs->regmap,CCREG)<0)
2917           emit_loadreg(CCREG,HOST_CCREG);
2918         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2919         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2920         emit_writeword(HOST_CCREG,(int)&Count);
2921         #endif
2922         #ifdef __arm__
2923         if(get_reg(i_regs->regmap,CCREG)<0)
2924           emit_loadreg(CCREG,0);
2925         else
2926           emit_mov(HOST_CCREG,0);
2927         emit_add(0,ECX,0);
2928         emit_addimm(0,2*ccadj[i],0);
2929         emit_writeword(0,(int)&Count);
2930         #endif
2931     emit_call((int)memdebug);
2932     //emit_popa();
2933     restore_regs(0x100f);
2934   }/**/
2935 }
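// Endianness note for the loads above: with a big-endian guest
// (BIG_ENDIAN_MIPS) byte addresses are XORed with 3 and halfword addresses
// with 2 so that guest byte order is recovered from the word-granular RAM
// image; the little-endian case needs no swizzle, which is why x stays 0 in
// the #else paths.  Minimal sketch (ref_ram is a hypothetical byte view of
// guest RAM, not a symbol used by this file):
#if 0
static u_char ref_load_byte(const u_char *ref_ram, u_int offset_in_ram)
{
#ifdef BIG_ENDIAN_MIPS
  return ref_ram[offset_in_ram ^ 3];  // swap byte lane within the 32-bit word
#else
  return ref_ram[offset_in_ram];      // host order already matches the guest
#endif
}
#endif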
2936
2937 #ifndef loadlr_assemble
2938 void loadlr_assemble(int i,struct regstat *i_regs)
2939 {
2940   printf("Need loadlr_assemble for this architecture.\n");
2941   exit(1);
2942 }
2943 #endif
2944
2945 void store_assemble(int i,struct regstat *i_regs)
2946 {
2947   int s,th,tl,map=-1;
2948   int addr,temp;
2949   int offset;
2950   int jaddr=0,jaddr2,type;
2951   int memtarget=0,c=0;
2952   int agr=AGEN1+(i&1);
2953   u_int hr,reglist=0;
2954   th=get_reg(i_regs->regmap,rs2[i]|64);
2955   tl=get_reg(i_regs->regmap,rs2[i]);
2956   s=get_reg(i_regs->regmap,rs1[i]);
2957   temp=get_reg(i_regs->regmap,agr);
2958   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2959   offset=imm[i];
2960   if(s>=0) {
2961     c=(i_regs->wasconst>>s)&1;
2962     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
2963     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2964   }
2965   assert(tl>=0);
2966   assert(temp>=0);
2967   for(hr=0;hr<HOST_REGS;hr++) {
2968     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2969   }
2970   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2971   if(offset||s<0||c) addr=temp;
2972   else addr=s;
2973   if(!using_tlb) {
2974     if(!c) {
2975       #ifdef R29_HACK
2976       // Strmnnrmn's speed hack
2977       memtarget=1;
2978       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2979       #endif
2980       emit_cmpimm(addr,0x800000);
2981       #ifdef DESTRUCTIVE_SHIFT
2982       if(s==addr) emit_mov(s,temp);
2983       #endif
2984       #ifdef R29_HACK
2985       if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
2986       #endif
2987       {
2988         jaddr=(int)out;
2989         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2990         // Hint to branch predictor that the branch is unlikely to be taken
2991         if(rs1[i]>=28)
2992           emit_jno_unlikely(0);
2993         else
2994         #endif
2995         emit_jno(0);
2996       }
2997     }
2998   }else{ // using tlb
2999     int x=0;
3000     if (opcode[i]==0x28) x=3; // SB
3001     if (opcode[i]==0x29) x=2; // SH
3002     map=get_reg(i_regs->regmap,TLREG);
3003     assert(map>=0);
3004     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3005     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3006   }
3007
3008   if (opcode[i]==0x28) { // SB
3009     if(!c||memtarget) {
3010       int x=0;
3011 #ifdef BIG_ENDIAN_MIPS
3012       if(!c) emit_xorimm(addr,3,temp);
3013       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3014 #else
3015       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3016       else if (addr!=temp) emit_mov(addr,temp);
3017 #endif
3018       //gen_tlb_addr_w(temp,map);
3019       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3020       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
3021     }
3022     type=STOREB_STUB;
3023   }
3024   if (opcode[i]==0x29) { // SH
3025     if(!c||memtarget) {
3026       int x=0;
3027 #ifdef BIG_ENDIAN_MIPS
3028       if(!c) emit_xorimm(addr,2,temp);
3029       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3030 #else
3031       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3032       else if (addr!=temp) emit_mov(addr,temp);
3033 #endif
3034       //#ifdef
3035       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3036       //#else
3037       if(map>=0) {
3038         gen_tlb_addr_w(temp,map);
3039         emit_writehword_indexed(tl,x,temp);
3040       }else
3041         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3042     }
3043     type=STOREH_STUB;
3044   }
3045   if (opcode[i]==0x2B) { // SW
3046     if(!c||memtarget)
3047       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3048       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3049     type=STOREW_STUB;
3050   }
3051   if (opcode[i]==0x3F) { // SD
3052     if(!c||memtarget) {
3053       if(rs2[i]) {
3054         assert(th>=0);
3055         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3056         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3057         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3058       }else{
3059         // Store zero
3060         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3061         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3062         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3063       }
3064     }
3065     type=STORED_STUB;
3066   }
3067   if(jaddr) {
3068     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3069   } else if(!memtarget) {
3070     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3071   }
3072   if(!using_tlb) {
3073     if(!c||memtarget) {
3074       #ifdef DESTRUCTIVE_SHIFT
3075       // The x86 shift operation is 'destructive'; it overwrites the
3076       // source register, so we need to make a copy first and use that.
3077       addr=temp;
3078       #endif
3079       #if defined(HOST_IMM8)
3080       int ir=get_reg(i_regs->regmap,INVCP);
3081       assert(ir>=0);
3082       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3083       #else
3084       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3085       #endif
3086       jaddr2=(int)out;
3087       emit_jne(0);
3088       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3089     }
3090   }
3091   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3092   //if(opcode[i]==0x2B || opcode[i]==0x28)
3093   //if(opcode[i]==0x2B || opcode[i]==0x29)
3094   //if(opcode[i]==0x2B)
3095   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3096   {
3097     //emit_pusha();
3098     save_regs(0x100f);
3099         emit_readword((int)&last_count,ECX);
3100         #ifdef __i386__
3101         if(get_reg(i_regs->regmap,CCREG)<0)
3102           emit_loadreg(CCREG,HOST_CCREG);
3103         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3104         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3105         emit_writeword(HOST_CCREG,(int)&Count);
3106         #endif
3107         #ifdef __arm__
3108         if(get_reg(i_regs->regmap,CCREG)<0)
3109           emit_loadreg(CCREG,0);
3110         else
3111           emit_mov(HOST_CCREG,0);
3112         emit_add(0,ECX,0);
3113         emit_addimm(0,2*ccadj[i],0);
3114         emit_writeword(0,(int)&Count);
3115         #endif
3116     emit_call((int)memdebug);
3117     //emit_popa();
3118     restore_regs(0x100f);
3119   }/**/
3120 }
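// The INVCP/invalid_code compare emitted above detects self-modifying code:
// after a RAM store, the page index (address>>12) is checked and, if
// compiled code still exists for that page, the INVCODE_STUB path
// invalidates it.  Rough shape of the check (sketch; invalidate_page is a
// stand-in name, not the real invalidation entry point):
#if 0
static void ref_check_smc(u_int store_address)
{
  if (!invalid_code[store_address >> 12])   // page still holds compiled code?
    invalidate_page(store_address >> 12);   // hand it to the invalidation path
}
#endif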
3121
3122 void storelr_assemble(int i,struct regstat *i_regs)
3123 {
3124   int s,th,tl;
3125   int temp;
3126   int temp2;
3127   int offset;
3128   int jaddr=0,jaddr2;
3129   int case1,case2,case3;
3130   int done0,done1,done2;
3131   int memtarget=0,c=0;
3132   u_int hr,reglist=0;
3133   th=get_reg(i_regs->regmap,rs2[i]|64);
3134   tl=get_reg(i_regs->regmap,rs2[i]);
3135   s=get_reg(i_regs->regmap,rs1[i]);
3136   temp=get_reg(i_regs->regmap,-1);
3137   offset=imm[i];
3138   if(s>=0) {
3139     c=(i_regs->isconst>>s)&1;
3140     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
3141     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3142   }
3143   assert(tl>=0);
3144   for(hr=0;hr<HOST_REGS;hr++) {
3145     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3146   }
3147   if(tl>=0) {
3148     assert(temp>=0);
3149     if(!using_tlb) {
3150       if(!c) {
3151         emit_cmpimm(s<0||offset?temp:s,0x800000);
3152         if(!offset&&s!=temp) emit_mov(s,temp);
3153         jaddr=(int)out;
3154         emit_jno(0);
3155       }
3156       else
3157       {
3158         if(!memtarget||!rs1[i]) {
3159           jaddr=(int)out;
3160           emit_jmp(0);
3161         }
3162       }
3163       if((u_int)rdram!=0x80000000) 
3164         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3165     }else{ // using tlb
3166       int map=get_reg(i_regs->regmap,TLREG);
3167       assert(map>=0);
3168       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3169       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3170       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3171       if(!jaddr&&!memtarget) {
3172         jaddr=(int)out;
3173         emit_jmp(0);
3174       }
3175       gen_tlb_addr_w(temp,map);
3176     }
3177
3178     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3179       temp2=get_reg(i_regs->regmap,FTEMP);
3180       if(!rs2[i]) temp2=th=tl;
3181     }
3182
3183 #ifndef BIG_ENDIAN_MIPS
3184     emit_xorimm(temp,3,temp);
3185 #endif
3186     emit_testimm(temp,2);
3187     case2=(int)out;
3188     emit_jne(0);
3189     emit_testimm(temp,1);
3190     case1=(int)out;
3191     emit_jne(0);
3192     // 0
3193     if (opcode[i]==0x2A) { // SWL
3194       emit_writeword_indexed(tl,0,temp);
3195     }
3196     if (opcode[i]==0x2E) { // SWR
3197       emit_writebyte_indexed(tl,3,temp);
3198     }
3199     if (opcode[i]==0x2C) { // SDL
3200       emit_writeword_indexed(th,0,temp);
3201       if(rs2[i]) emit_mov(tl,temp2);
3202     }
3203     if (opcode[i]==0x2D) { // SDR
3204       emit_writebyte_indexed(tl,3,temp);
3205       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3206     }
3207     done0=(int)out;
3208     emit_jmp(0);
3209     // 1
3210     set_jump_target(case1,(int)out);
3211     if (opcode[i]==0x2A) { // SWL
3212       // Write 3 msb into three least significant bytes
3213       if(rs2[i]) emit_rorimm(tl,8,tl);
3214       emit_writehword_indexed(tl,-1,temp);
3215       if(rs2[i]) emit_rorimm(tl,16,tl);
3216       emit_writebyte_indexed(tl,1,temp);
3217       if(rs2[i]) emit_rorimm(tl,8,tl);
3218     }
3219     if (opcode[i]==0x2E) { // SWR
3220       // Write two lsb into two most significant bytes
3221       emit_writehword_indexed(tl,1,temp);
3222     }
3223     if (opcode[i]==0x2C) { // SDL
3224       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3225       // Write 3 msb into three least significant bytes
3226       if(rs2[i]) emit_rorimm(th,8,th);
3227       emit_writehword_indexed(th,-1,temp);
3228       if(rs2[i]) emit_rorimm(th,16,th);
3229       emit_writebyte_indexed(th,1,temp);
3230       if(rs2[i]) emit_rorimm(th,8,th);
3231     }
3232     if (opcode[i]==0x2D) { // SDR
3233       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3234       // Write two lsb into two most significant bytes
3235       emit_writehword_indexed(tl,1,temp);
3236     }
3237     done1=(int)out;
3238     emit_jmp(0);
3239     // 2
3240     set_jump_target(case2,(int)out);
3241     emit_testimm(temp,1);
3242     case3=(int)out;
3243     emit_jne(0);
3244     if (opcode[i]==0x2A) { // SWL
3245       // Write two msb into two least significant bytes
3246       if(rs2[i]) emit_rorimm(tl,16,tl);
3247       emit_writehword_indexed(tl,-2,temp);
3248       if(rs2[i]) emit_rorimm(tl,16,tl);
3249     }
3250     if (opcode[i]==0x2E) { // SWR
3251       // Write 3 lsb into three most significant bytes
3252       emit_writebyte_indexed(tl,-1,temp);
3253       if(rs2[i]) emit_rorimm(tl,8,tl);
3254       emit_writehword_indexed(tl,0,temp);
3255       if(rs2[i]) emit_rorimm(tl,24,tl);
3256     }
3257     if (opcode[i]==0x2C) { // SDL
3258       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3259       // Write two msb into two least significant bytes
3260       if(rs2[i]) emit_rorimm(th,16,th);
3261       emit_writehword_indexed(th,-2,temp);
3262       if(rs2[i]) emit_rorimm(th,16,th);
3263     }
3264     if (opcode[i]==0x2D) { // SDR
3265       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3266       // Write 3 lsb into three most significant bytes
3267       emit_writebyte_indexed(tl,-1,temp);
3268       if(rs2[i]) emit_rorimm(tl,8,tl);
3269       emit_writehword_indexed(tl,0,temp);
3270       if(rs2[i]) emit_rorimm(tl,24,tl);
3271     }
3272     done2=(int)out;
3273     emit_jmp(0);
3274     // 3
3275     set_jump_target(case3,(int)out);
3276     if (opcode[i]==0x2A) { // SWL
3277       // Write msb into least significant byte
3278       if(rs2[i]) emit_rorimm(tl,24,tl);
3279       emit_writebyte_indexed(tl,-3,temp);
3280       if(rs2[i]) emit_rorimm(tl,8,tl);
3281     }
3282     if (opcode[i]==0x2E) { // SWR
3283       // Write entire word
3284       emit_writeword_indexed(tl,-3,temp);
3285     }
3286     if (opcode[i]==0x2C) { // SDL
3287       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3288       // Write msb into least significant byte
3289       if(rs2[i]) emit_rorimm(th,24,th);
3290       emit_writebyte_indexed(th,-3,temp);
3291       if(rs2[i]) emit_rorimm(th,8,th);
3292     }
3293     if (opcode[i]==0x2D) { // SDR
3294       if(rs2[i]) emit_mov(th,temp2);
3295       // Write entire word
3296       emit_writeword_indexed(tl,-3,temp);
3297     }
3298     set_jump_target(done0,(int)out);
3299     set_jump_target(done1,(int)out);
3300     set_jump_target(done2,(int)out);
3301     if (opcode[i]==0x2C) { // SDL
3302       emit_testimm(temp,4);
3303       done0=(int)out;
3304       emit_jne(0);
3305       emit_andimm(temp,~3,temp);
3306       emit_writeword_indexed(temp2,4,temp);
3307       set_jump_target(done0,(int)out);
3308     }
3309     if (opcode[i]==0x2D) { // SDR
3310       emit_testimm(temp,4);
3311       done0=(int)out;
3312       emit_jeq(0);
3313       emit_andimm(temp,~3,temp);
3314       emit_writeword_indexed(temp2,-4,temp);
3315       set_jump_target(done0,(int)out);
3316     }
3317     if(!c||!memtarget)
3318       add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
3319   }
3320   if(!using_tlb) {
3321     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3322     #if defined(HOST_IMM8)
3323     int ir=get_reg(i_regs->regmap,INVCP);
3324     assert(ir>=0);
3325     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3326     #else
3327     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3328     #endif
3329     jaddr2=(int)out;
3330     emit_jne(0);
3331     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3332   }
3333   /*
3334     emit_pusha();
3335     //save_regs(0x100f);
3336         emit_readword((int)&last_count,ECX);
3337         if(get_reg(i_regs->regmap,CCREG)<0)
3338           emit_loadreg(CCREG,HOST_CCREG);
3339         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3340         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3341         emit_writeword(HOST_CCREG,(int)&Count);
3342     emit_call((int)memdebug);
3343     emit_popa();
3344     //restore_regs(0x100f);
3345   /**/
3346 }
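// Reference semantics for the unaligned word stores assembled above, in the
// big-endian convention the case comments use (sketch only; ref_swl/ref_swr
// return the merged word that ends up in memory):
#if 0
static u_int ref_swl(u_int reg, u_int mem, u_int addr)
{
  u_int shift = 8 * (addr & 3);  // 0, 8, 16 or 24
  return (mem & ~(0xffffffffu >> shift)) | (reg >> shift);
}
static u_int ref_swr(u_int reg, u_int mem, u_int addr)
{
  u_int shift = 8 * (3 - (addr & 3));
  return (mem & ~(0xffffffffu << shift)) | (reg << shift);
}
#endif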
3347
3348 void c1ls_assemble(int i,struct regstat *i_regs)
3349 {
3350 #ifndef DISABLE_COP1
3351   int s,th,tl;
3352   int temp,ar;
3353   int map=-1;
3354   int offset;
3355   int c=0;
3356   int jaddr,jaddr2=0,jaddr3,type;
3357   int agr=AGEN1+(i&1);
3358   u_int hr,reglist=0;
3359   th=get_reg(i_regs->regmap,FTEMP|64);
3360   tl=get_reg(i_regs->regmap,FTEMP);
3361   s=get_reg(i_regs->regmap,rs1[i]);
3362   temp=get_reg(i_regs->regmap,agr);
3363   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3364   offset=imm[i];
3365   assert(tl>=0);
3366   assert(rs1[i]>0);
3367   assert(temp>=0);
3368   for(hr=0;hr<HOST_REGS;hr++) {
3369     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3370   }
3371   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3372   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3373   {
3374     // Loads use a temporary register which we need to save
3375     reglist|=1<<temp;
3376   }
3377   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3378     ar=temp;
3379   else // LWC1/LDC1
3380     ar=tl;
3381   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3382   //else c=(i_regs->wasconst>>s)&1;
3383   if(s>=0) c=(i_regs->wasconst>>s)&1;
3384   // Check that COP1 is usable; if not, branch to the FP exception stub
3385   if(!cop1_usable) {
3386     signed char rs=get_reg(i_regs->regmap,CSREG);
3387     assert(rs>=0);
3388     emit_testimm(rs,0x20000000);
3389     jaddr=(int)out;
3390     emit_jeq(0);
3391     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3392     cop1_usable=1;
3393   }
3394   if (opcode[i]==0x39) { // SWC1 (get float address)
3395     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3396   }
3397   if (opcode[i]==0x3D) { // SDC1 (get double address)
3398     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3399   }
3400   // Generate address + offset
3401   if(!using_tlb) {
3402     if(!c)
3403       emit_cmpimm(offset||c||s<0?ar:s,0x800000);
3404   }
3405   else
3406   {
3407     map=get_reg(i_regs->regmap,TLREG);
3408     assert(map>=0);
3409     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3410       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3411     }
3412     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3413       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3414     }
3415   }
3416   if (opcode[i]==0x39) { // SWC1 (read float)
3417     emit_readword_indexed(0,tl,tl);
3418   }
3419   if (opcode[i]==0x3D) { // SDC1 (read double)
3420     emit_readword_indexed(4,tl,th);
3421     emit_readword_indexed(0,tl,tl);
3422   }
3423   if (opcode[i]==0x31) { // LWC1 (get target address)
3424     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3425   }
3426   if (opcode[i]==0x35) { // LDC1 (get target address)
3427     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3428   }
3429   if(!using_tlb) {
3430     if(!c) {
3431       jaddr2=(int)out;
3432       emit_jno(0);
3433     }
3434     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
3435       jaddr2=(int)out;
3436       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3437     }
3438     #ifdef DESTRUCTIVE_SHIFT
3439     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3440       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3441     }
3442     #endif
3443   }else{
3444     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3445       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3446     }
3447     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3448       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3449     }
3450   }
3451   if (opcode[i]==0x31) { // LWC1
3452     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3453     //gen_tlb_addr_r(ar,map);
3454     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3455     #ifdef HOST_IMM_ADDR32
3456     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3457     else
3458     #endif
3459     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3460     type=LOADW_STUB;
3461   }
3462   if (opcode[i]==0x35) { // LDC1
3463     assert(th>=0);
3464     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3465     //gen_tlb_addr_r(ar,map);
3466     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3467     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3468     #ifdef HOST_IMM_ADDR32
3469     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3470     else
3471     #endif
3472     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3473     type=LOADD_STUB;
3474   }
3475   if (opcode[i]==0x39) { // SWC1
3476     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3477     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3478     type=STOREW_STUB;
3479   }
3480   if (opcode[i]==0x3D) { // SDC1
3481     assert(th>=0);
3482     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3483     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3484     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3485     type=STORED_STUB;
3486   }
3487   if(!using_tlb) {
3488     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3489       #ifndef DESTRUCTIVE_SHIFT
3490       temp=offset||c||s<0?ar:s;
3491       #endif
3492       #if defined(HOST_IMM8)
3493       int ir=get_reg(i_regs->regmap,INVCP);
3494       assert(ir>=0);
3495       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3496       #else
3497       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3498       #endif
3499       jaddr3=(int)out;
3500       emit_jne(0);
3501       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3502     }
3503   }
3504   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3505   if (opcode[i]==0x31) { // LWC1 (write float)
3506     emit_writeword_indexed(tl,0,temp);
3507   }
3508   if (opcode[i]==0x35) { // LDC1 (write double)
3509     emit_writeword_indexed(th,4,temp);
3510     emit_writeword_indexed(tl,0,temp);
3511   }
3512   //if(opcode[i]==0x39)
3513   /*if(opcode[i]==0x39||opcode[i]==0x31)
3514   {
3515     emit_pusha();
3516         emit_readword((int)&last_count,ECX);
3517         if(get_reg(i_regs->regmap,CCREG)<0)
3518           emit_loadreg(CCREG,HOST_CCREG);
3519         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3520         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3521         emit_writeword(HOST_CCREG,(int)&Count);
3522     emit_call((int)memdebug);
3523     emit_popa();
3524   }/**/
3525 #else
3526   cop1_unusable(i, i_regs);
3527 #endif
3528 }
3529
3530 #ifndef multdiv_assemble
3531 void multdiv_assemble(int i,struct regstat *i_regs)
3532 {
3533   printf("Need multdiv_assemble for this architecture.\n");
3534   exit(1);
3535 }
3536 #endif
3537
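// Assemble MFHI/MFLO/MTHI/MTLO as plain register moves, copying the low word
// and, when the target has an upper half mapped, the high word as well.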
3538 void mov_assemble(int i,struct regstat *i_regs)
3539 {
3540   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3541   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3542   assert(rt1[i]>0);
3543   if(rt1[i]) {
3544     signed char sh,sl,th,tl;
3545     th=get_reg(i_regs->regmap,rt1[i]|64);
3546     tl=get_reg(i_regs->regmap,rt1[i]);
3547     //assert(tl>=0);
3548     if(tl>=0) {
3549       sh=get_reg(i_regs->regmap,rs1[i]|64);
3550       sl=get_reg(i_regs->regmap,rs1[i]);
3551       if(sl>=0) emit_mov(sl,tl);
3552       else emit_loadreg(rs1[i],tl);
3553       if(th>=0) {
3554         if(sh>=0) emit_mov(sh,th);
3555         else emit_loadreg(rs1[i]|64,th);
3556       }
3557     }
3558   }
3559 }
3560
3561 #ifndef fconv_assemble
3562 void fconv_assemble(int i,struct regstat *i_regs)
3563 {
3564   printf("Need fconv_assemble for this architecture.\n");
3565   exit(1);
3566 }
3567 #endif
3568
3569 #if 0
3570 void float_assemble(int i,struct regstat *i_regs)
3571 {
3572   printf("Need float_assemble for this architecture.\n");
3573   exit(1);
3574 }
3575 #endif
3576
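// Assemble SYSCALL: put the PC in EAX, add the accumulated cycle count and
// jump to the syscall exception handler.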
3577 void syscall_assemble(int i,struct regstat *i_regs)
3578 {
3579   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3580   assert(ccreg==HOST_CCREG);
3581   assert(!is_delayslot);
3582   emit_movimm(start+i*4,EAX); // Get PC
3583   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3584   emit_jmp((int)jump_syscall);
3585 }
3586
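// Assemble the instruction in a branch delay slot.  Branches themselves are
// not valid delay slot instructions and are only reported here.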
3587 void ds_assemble(int i,struct regstat *i_regs)
3588 {
3589   is_delayslot=1;
3590   switch(itype[i]) {
3591     case ALU:
3592       alu_assemble(i,i_regs);break;
3593     case IMM16:
3594       imm16_assemble(i,i_regs);break;
3595     case SHIFT:
3596       shift_assemble(i,i_regs);break;
3597     case SHIFTIMM:
3598       shiftimm_assemble(i,i_regs);break;
3599     case LOAD:
3600       load_assemble(i,i_regs);break;
3601     case LOADLR:
3602       loadlr_assemble(i,i_regs);break;
3603     case STORE:
3604       store_assemble(i,i_regs);break;
3605     case STORELR:
3606       storelr_assemble(i,i_regs);break;
3607     case COP0:
3608       cop0_assemble(i,i_regs);break;
3609     case COP1:
3610       cop1_assemble(i,i_regs);break;
3611     case C1LS:
3612       c1ls_assemble(i,i_regs);break;
3613     case FCONV:
3614       fconv_assemble(i,i_regs);break;
3615     case FLOAT:
3616       float_assemble(i,i_regs);break;
3617     case FCOMP:
3618       fcomp_assemble(i,i_regs);break;
3619     case MULTDIV:
3620       multdiv_assemble(i,i_regs);break;
3621     case MOV:
3622       mov_assemble(i,i_regs);break;
3623     case SYSCALL:
3624     case SPAN:
3625     case UJUMP:
3626     case RJUMP:
3627     case CJUMP:
3628     case SJUMP:
3629     case FJUMP:
3630       printf("Jump in the delay slot.  This is probably a bug.\n");
3631   }
3632   is_delayslot=0;
3633 }
3634
3635 // Is the branch target a valid internal jump?
3636 int internal_branch(uint64_t i_is32,int addr)
3637 {
3638   if(addr&1) return 0; // Indirect (register) jump
3639   if(addr>=start && addr<start+slen*4-4)
3640   {
3641     int t=(addr-start)>>2;
3642     // Delay slots are not valid branch targets
3643     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3644     // 64 -> 32 bit transition requires a recompile
3645     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3646     {
3647       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3648       else printf("optimizable: yes\n");
3649     }*/
3650     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3651     if(requires_32bit[t]&~i_is32) return 0;
3652     else return 1;
3653   }
3654   return 0;
3655 }
3656
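// Write back dirty values that are dropped when switching from the old
// register map to the new one (unless they are unneeded), then move values
// that merely change host registers without touching memory.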
3657 #ifndef wb_invalidate
3658 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3659   uint64_t u,uint64_t uu)
3660 {
3661   int hr;
3662   for(hr=0;hr<HOST_REGS;hr++) {
3663     if(hr!=EXCLUDE_REG) {
3664       if(pre[hr]!=entry[hr]) {
3665         if(pre[hr]>=0) {
3666           if((dirty>>hr)&1) {
3667             if(get_reg(entry,pre[hr])<0) {
3668               if(pre[hr]<64) {
3669                 if(!((u>>pre[hr])&1)) {
3670                   emit_storereg(pre[hr],hr);
3671                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3672                     emit_sarimm(hr,31,hr);
3673                     emit_storereg(pre[hr]|64,hr);
3674                   }
3675                 }
3676               }else{
3677                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3678                   emit_storereg(pre[hr],hr);
3679                 }
3680               }
3681             }
3682           }
3683         }
3684       }
3685     }
3686   }
3687   // Move from one register to another (no writeback)
3688   for(hr=0;hr<HOST_REGS;hr++) {
3689     if(hr!=EXCLUDE_REG) {
3690       if(pre[hr]!=entry[hr]) {
3691         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3692           int nr;
3693           if((nr=get_reg(entry,pre[hr]))>=0) {
3694             emit_mov(hr,nr);
3695           }
3696         }
3697       }
3698     }
3699   }
3700 }
3701 #endif
3702
3703 // Load the specified registers
3704 // This only loads the registers given as arguments because
3705 // we don't want to load things that will be overwritten
3706 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3707 {
3708   int hr;
3709   // Load 32-bit regs
3710   for(hr=0;hr<HOST_REGS;hr++) {
3711     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3712       if(entry[hr]!=regmap[hr]) {
3713         if(regmap[hr]==rs1||regmap[hr]==rs2)
3714         {
3715           if(regmap[hr]==0) {
3716             emit_zeroreg(hr);
3717           }
3718           else
3719           {
3720             emit_loadreg(regmap[hr],hr);
3721           }
3722         }
3723       }
3724     }
3725   }
3726   // Load 64-bit regs
3727   for(hr=0;hr<HOST_REGS;hr++) {
3728     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3729       if(entry[hr]!=regmap[hr]) {
3730         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3731         {
3732           assert(regmap[hr]!=64);
3733           if((is32>>(regmap[hr]&63))&1) {
3734             int lr=get_reg(regmap,regmap[hr]-64);
3735             if(lr>=0)
3736               emit_sarimm(lr,31,hr);
3737             else
3738               emit_loadreg(regmap[hr],hr);
3739           }
3740           else
3741           {
3742             emit_loadreg(regmap[hr],hr);
3743           }
3744         }
3745       }
3746     }
3747   }
3748 }
3749
3750 // Load registers prior to the start of a loop
3751 // so that they are not loaded within the loop
3752 static void loop_preload(signed char pre[],signed char entry[])
3753 {
3754   int hr;
3755   for(hr=0;hr<HOST_REGS;hr++) {
3756     if(hr!=EXCLUDE_REG) {
3757       if(pre[hr]!=entry[hr]) {
3758         if(entry[hr]>=0) {
3759           if(get_reg(pre,entry[hr])<0) {
3760             assem_debug("loop preload:\n");
3761             //printf("loop preload: %d\n",hr);
3762             if(entry[hr]==0) {
3763               emit_zeroreg(hr);
3764             }
3765             else if(entry[hr]<TEMPREG)
3766             {
3767               emit_loadreg(entry[hr],hr);
3768             }
3769             else if(entry[hr]-64<TEMPREG)
3770             {
3771               emit_loadreg(entry[hr],hr);
3772             }
3773           }
3774         }
3775       }
3776     }
3777   }
3778 }
3779
3780 // Generate address for load/store instruction
3781 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3782 {
3783   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
3784     int ra;
3785     int agr=AGEN1+(i&1);
3786     int mgr=MGEN1+(i&1);
3787     if(itype[i]==LOAD) {
3788       ra=get_reg(i_regs->regmap,rt1[i]);
3789       //if(rt1[i]) assert(ra>=0);
3790     }
3791     if(itype[i]==LOADLR) {
3792       ra=get_reg(i_regs->regmap,FTEMP);
3793     }
3794     if(itype[i]==STORE||itype[i]==STORELR) {
3795       ra=get_reg(i_regs->regmap,agr);
3796       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3797     }
3798     if(itype[i]==C1LS) {
3799       if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3800         ra=get_reg(i_regs->regmap,FTEMP);
3801       else { // SWC1/SDC1
3802         ra=get_reg(i_regs->regmap,agr);
3803         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3804       }
3805     }
3806     int rs=get_reg(i_regs->regmap,rs1[i]);
3807     int rm=get_reg(i_regs->regmap,TLREG);
3808     if(ra>=0) {
3809       int offset=imm[i];
3810       int c=(i_regs->wasconst>>rs)&1;
3811       if(rs1[i]==0) {
3812         // Using r0 as a base address
3813         /*if(rm>=0) {
3814           if(!entry||entry[rm]!=mgr) {
3815             generate_map_const(offset,rm);
3816           } // else did it in the previous cycle
3817         }*/
3818         if(!entry||entry[ra]!=agr) {
3819           if (opcode[i]==0x22||opcode[i]==0x26) {
3820             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3821           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3822             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3823           }else{
3824             emit_movimm(offset,ra);
3825           }
3826         } // else did it in the previous cycle
3827       }
3828       else if(rs<0) {
3829         if(!entry||entry[ra]!=rs1[i])
3830           emit_loadreg(rs1[i],ra);
3831         //if(!entry||entry[ra]!=rs1[i])
3832         //  printf("poor load scheduling!\n");
3833       }
3834       else if(c) {
3835         if(rm>=0) {
3836           if(!entry||entry[rm]!=mgr) {
3837             if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) {
3838               // Stores to memory go through the mapper to detect self-modifying
3839               // code; loads don't.
3840               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
3841                  (unsigned int)(constmap[i][rs]+offset)<0x80800000 )
3842                 generate_map_const(constmap[i][rs]+offset,rm);
3843             }else{
3844               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
3845                 generate_map_const(constmap[i][rs]+offset,rm);
3846             }
3847           }
3848         }
3849         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3850           if(!entry||entry[ra]!=agr) {
3851             if (opcode[i]==0x22||opcode[i]==0x26) {
3852               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3853             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3854               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3855             }else{
3856               #ifdef HOST_IMM_ADDR32
3857               if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) ||
3858                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
3859               #endif
3860               emit_movimm(constmap[i][rs]+offset,ra);
3861             }
3862           } // else did it in the previous cycle
3863         } // else load_consts already did it
3864       }
3865       if(offset&&!c&&rs1[i]) {
3866         if(rs>=0) {
3867           emit_addimm(rs,offset,ra);
3868         }else{
3869           emit_addimm(ra,offset,ra);
3870         }
3871       }
3872     }
3873   }
3874   // Preload constants for next instruction
3875   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
3876     int agr,ra;
3877     #ifndef HOST_IMM_ADDR32
3878     // Mapper entry
3879     agr=MGEN1+((i+1)&1);
3880     ra=get_reg(i_regs->regmap,agr);
3881     if(ra>=0) {
3882       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3883       int offset=imm[i+1];
3884       int c=(regs[i+1].wasconst>>rs)&1;
3885       if(c) {
3886         if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) {
3887           // Stores to memory go through the mapper to detect self-modifying
3888           // code; loads don't.
3889           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
3890              (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 )
3891             generate_map_const(constmap[i+1][rs]+offset,ra);
3892         }else{
3893           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
3894             generate_map_const(constmap[i+1][rs]+offset,ra);
3895         }
3896       }
3897       /*else if(rs1[i]==0) {
3898         generate_map_const(offset,ra);
3899       }*/
3900     }
3901     #endif
3902     // Actual address
3903     agr=AGEN1+((i+1)&1);
3904     ra=get_reg(i_regs->regmap,agr);
3905     if(ra>=0) {
3906       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3907       int offset=imm[i+1];
3908       int c=(regs[i+1].wasconst>>rs)&1;
3909       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3910         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3911           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3912         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3913           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3914         }else{
3915           #ifdef HOST_IMM_ADDR32
3916           if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) ||
3917              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
3918           #endif
3919           emit_movimm(constmap[i+1][rs]+offset,ra);
3920         }
3921       }
3922       else if(rs1[i+1]==0) {
3923         // Using r0 as a base address
3924         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3925           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3926         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3927           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3928         }else{
3929           emit_movimm(offset,ra);
3930         }
3931       }
3932     }
3933   }
3934 }
3935
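// Find the value that a constant-holding host register must end up with over
// its remaining live range, so only the final constant has to be loaded.
// Returns 0 if the constant does not need to be loaded at all (e.g. it is
// unneeded or will be embedded in the following load).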
3936 int get_final_value(int hr, int i, int *value)
3937 {
3938   int reg=regs[i].regmap[hr];
3939   while(i<slen-1) {
3940     if(regs[i+1].regmap[hr]!=reg) break;
3941     if(!((regs[i+1].isconst>>hr)&1)) break;
3942     if(bt[i+1]) break;
3943     i++;
3944   }
3945   if(i<slen-1) {
3946     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3947       *value=constmap[i][hr];
3948       return 1;
3949     }
3950     if(!bt[i+1]) {
3951       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3952         // Load in delay slot, out-of-order execution
3953         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3954         {
3955           #ifdef HOST_IMM_ADDR32
3956           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
3957           #endif
3958           // Precompute load address
3959           *value=constmap[i][hr]+imm[i+2];
3960           return 1;
3961         }
3962       }
3963       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3964       {
3965         #ifdef HOST_IMM_ADDR32
3966         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
3967         #endif
3968         // Precompute load address
3969         *value=constmap[i][hr]+imm[i+1];
3970         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3971         return 1;
3972       }
3973     }
3974   }
3975   *value=constmap[i][hr];
3976   //printf("c=%x\n",(int)constmap[i][hr]);
3977   if(i==slen-1) return 1;
3978   if(reg<64) {
3979     return !((unneeded_reg[i+1]>>reg)&1);
3980   }else{
3981     return !((unneeded_reg_upper[i+1]>>reg)&1);
3982   }
3983 }
3984
3985 // Load registers with known constants
3986 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3987 {
3988   int hr;
3989   // Load 32-bit regs
3990   for(hr=0;hr<HOST_REGS;hr++) {
3991     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3992       //if(entry[hr]!=regmap[hr]) {
3993       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3994         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3995           int value;
3996           if(get_final_value(hr,i,&value)) {
3997             if(value==0) {
3998               emit_zeroreg(hr);
3999             }
4000             else {
4001               emit_movimm(value,hr);
4002             }
4003           }
4004         }
4005       }
4006     }
4007   }
4008   // Load 64-bit regs
4009   for(hr=0;hr<HOST_REGS;hr++) {
4010     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4011       //if(entry[hr]!=regmap[hr]) {
4012       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4013         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4014           if((is32>>(regmap[hr]&63))&1) {
4015             int lr=get_reg(regmap,regmap[hr]-64);
4016             assert(lr>=0);
4017             emit_sarimm(lr,31,hr);
4018           }
4019           else
4020           {
4021             int value;
4022             if(get_final_value(hr,i,&value)) {
4023               if(value==0) {
4024                 emit_zeroreg(hr);
4025               }
4026               else {
4027                 emit_movimm(value,hr);
4028               }
4029             }
4030           }
4031         }
4032       }
4033     }
4034   }
4035 }
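// Like load_consts, but loads the current constant into every dirty mapped
// register without looking ahead for a final value.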
4036 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4037 {
4038   int hr;
4039   // Load 32-bit regs
4040   for(hr=0;hr<HOST_REGS;hr++) {
4041     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4042       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4043         int value=constmap[i][hr];
4044         if(value==0) {
4045           emit_zeroreg(hr);
4046         }
4047         else {
4048           emit_movimm(value,hr);
4049         }
4050       }
4051     }
4052   }
4053   // Load 64-bit regs
4054   for(hr=0;hr<HOST_REGS;hr++) {
4055     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4056       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4057         if((is32>>(regmap[hr]&63))&1) {
4058           int lr=get_reg(regmap,regmap[hr]-64);
4059           assert(lr>=0);
4060           emit_sarimm(lr,31,hr);
4061         }
4062         else
4063         {
4064           int value=constmap[i][hr];
4065           if(value==0) {
4066             emit_zeroreg(hr);
4067           }
4068           else {
4069             emit_movimm(value,hr);
4070           }
4071         }
4072       }
4073     }
4074   }
4075 }
4076
4077 // Write out all dirty registers (except cycle count)
4078 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4079 {
4080   int hr;
4081   for(hr=0;hr<HOST_REGS;hr++) {
4082     if(hr!=EXCLUDE_REG) {
4083       if(i_regmap[hr]>0) {
4084         if(i_regmap[hr]!=CCREG) {
4085           if((i_dirty>>hr)&1) {
4086             if(i_regmap[hr]<64) {
4087               emit_storereg(i_regmap[hr],hr);
4088 #ifndef FORCE32
4089               if( ((i_is32>>i_regmap[hr])&1) ) {
4090                 #ifdef DESTRUCTIVE_WRITEBACK
4091                 emit_sarimm(hr,31,hr);
4092                 emit_storereg(i_regmap[hr]|64,hr);
4093                 #else
4094                 emit_sarimm(hr,31,HOST_TEMPREG);
4095                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4096                 #endif
4097               }
4098 #endif
4099             }else{
4100               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4101                 emit_storereg(i_regmap[hr],hr);
4102               }
4103             }
4104           }
4105         }
4106       }
4107     }
4108   }
4109 }
4110 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4111 // This writes the registers not written by store_regs_bt
4112 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4113 {
4114   int hr;
4115   int t=(addr-start)>>2;
4116   for(hr=0;hr<HOST_REGS;hr++) {
4117     if(hr!=EXCLUDE_REG) {
4118       if(i_regmap[hr]>0) {
4119         if(i_regmap[hr]!=CCREG) {
4120           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4121             if((i_dirty>>hr)&1) {
4122               if(i_regmap[hr]<64) {
4123                 emit_storereg(i_regmap[hr],hr);
4124 #ifndef FORCE32
4125                 if( ((i_is32>>i_regmap[hr])&1) ) {
4126                   #ifdef DESTRUCTIVE_WRITEBACK
4127                   emit_sarimm(hr,31,hr);
4128                   emit_storereg(i_regmap[hr]|64,hr);
4129                   #else
4130                   emit_sarimm(hr,31,HOST_TEMPREG);
4131                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4132                   #endif
4133                 }
4134 #endif
4135               }else{
4136                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4137                   emit_storereg(i_regmap[hr],hr);
4138                 }
4139               }
4140             }
4141           }
4142         }
4143       }
4144     }
4145   }
4146 }
4147
4148 // Load all registers (except cycle count)
4149 void load_all_regs(signed char i_regmap[])
4150 {
4151   int hr;
4152   for(hr=0;hr<HOST_REGS;hr++) {
4153     if(hr!=EXCLUDE_REG) {
4154       if(i_regmap[hr]==0) {
4155         emit_zeroreg(hr);
4156       }
4157       else
4158       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4159       {
4160         emit_loadreg(i_regmap[hr],hr);
4161       }
4162     }
4163   }
4164 }
4165
4166 // Load all current registers also needed by next instruction
4167 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4168 {
4169   int hr;
4170   for(hr=0;hr<HOST_REGS;hr++) {
4171     if(hr!=EXCLUDE_REG) {
4172       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4173         if(i_regmap[hr]==0) {
4174           emit_zeroreg(hr);
4175         }
4176         else
4177         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4178         {
4179           emit_loadreg(i_regmap[hr],hr);
4180         }
4181       }
4182     }
4183   }
4184 }
4185
4186 // Load all regs, storing cycle count if necessary
4187 void load_regs_entry(int t)
4188 {
4189   int hr;
4190   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4191   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4192   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4193     emit_storereg(CCREG,HOST_CCREG);
4194   }
4195   // Load 32-bit regs
4196   for(hr=0;hr<HOST_REGS;hr++) {
4197     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4198       if(regs[t].regmap_entry[hr]==0) {
4199         emit_zeroreg(hr);
4200       }
4201       else if(regs[t].regmap_entry[hr]!=CCREG)
4202       {
4203         emit_loadreg(regs[t].regmap_entry[hr],hr);
4204       }
4205     }
4206   }
4207   // Load 64-bit regs
4208   for(hr=0;hr<HOST_REGS;hr++) {
4209     if(regs[t].regmap_entry[hr]>=64) {
4210       assert(regs[t].regmap_entry[hr]!=64);
4211       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4212         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4213         if(lr<0) {
4214           emit_loadreg(regs[t].regmap_entry[hr],hr);
4215         }
4216         else
4217         {
4218           emit_sarimm(lr,31,hr);
4219         }
4220       }
4221       else
4222       {
4223         emit_loadreg(regs[t].regmap_entry[hr],hr);
4224       }
4225     }
4226   }
4227 }
4228
4229 // Store dirty registers prior to branch
4230 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4231 {
4232   if(internal_branch(i_is32,addr))
4233   {
4234     int t=(addr-start)>>2;
4235     int hr;
4236     for(hr=0;hr<HOST_REGS;hr++) {
4237       if(hr!=EXCLUDE_REG) {
4238         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4239           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4240             if((i_dirty>>hr)&1) {
4241               if(i_regmap[hr]<64) {
4242                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4243                   emit_storereg(i_regmap[hr],hr);
4244                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4245                     #ifdef DESTRUCTIVE_WRITEBACK
4246                     emit_sarimm(hr,31,hr);
4247                     emit_storereg(i_regmap[hr]|64,hr);
4248                     #else
4249                     emit_sarimm(hr,31,HOST_TEMPREG);
4250                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4251                     #endif
4252                   }
4253                 }
4254               }else{
4255                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4256                   emit_storereg(i_regmap[hr],hr);
4257                 }
4258               }
4259             }
4260           }
4261         }
4262       }
4263     }
4264   }
4265   else
4266   {
4267     // Branch out of this block, write out all dirty regs
4268     wb_dirtys(i_regmap,i_is32,i_dirty);
4269   }
4270 }
4271
4272 // Load all needed registers for branch target
4273 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4274 {
4275   //if(addr>=start && addr<(start+slen*4))
4276   if(internal_branch(i_is32,addr))
4277   {
4278     int t=(addr-start)>>2;
4279     int hr;
4280     // Store the cycle count before loading something else
4281     if(i_regmap[HOST_CCREG]!=CCREG) {
4282       assert(i_regmap[HOST_CCREG]==-1);
4283     }
4284     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4285       emit_storereg(CCREG,HOST_CCREG);
4286     }
4287     // Load 32-bit regs
4288     for(hr=0;hr<HOST_REGS;hr++) {
4289       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4290         #ifdef DESTRUCTIVE_WRITEBACK
4291         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4292         #else
4293         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4294         #endif
4295           if(regs[t].regmap_entry[hr]==0) {
4296             emit_zeroreg(hr);
4297           }
4298           else if(regs[t].regmap_entry[hr]!=CCREG)
4299           {
4300             emit_loadreg(regs[t].regmap_entry[hr],hr);
4301           }
4302         }
4303       }
4304     }
4305     // Load 64-bit regs
4306     for(hr=0;hr<HOST_REGS;hr++) {
4307       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4308         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4309           assert(regs[t].regmap_entry[hr]!=64);
4310           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4311             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4312             if(lr<0) {
4313               emit_loadreg(regs[t].regmap_entry[hr],hr);
4314             }
4315             else
4316             {
4317               emit_sarimm(lr,31,hr);
4318             }
4319           }
4320           else
4321           {
4322             emit_loadreg(regs[t].regmap_entry[hr],hr);
4323           }
4324         }
4325         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4326           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4327           assert(lr>=0);
4328           emit_sarimm(lr,31,hr);
4329         }
4330       }
4331     }
4332   }
4333 }
4334
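// Check whether the current register state is compatible with the state
// expected at the branch target, so the branch can be linked directly without
// an intermediate writeback/reload.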
4335 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4336 {
4337   if(addr>=start && addr<start+slen*4-4)
4338   {
4339     int t=(addr-start)>>2;
4340     int hr;
4341     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4342     for(hr=0;hr<HOST_REGS;hr++)
4343     {
4344       if(hr!=EXCLUDE_REG)
4345       {
4346         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4347         {
4348           if(regs[t].regmap_entry[hr]!=-1)
4349           {
4350             return 0;
4351           }
4352           else 
4353           if((i_dirty>>hr)&1)
4354           {
4355             if(i_regmap[hr]<64)
4356             {
4357               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4358                 return 0;
4359             }
4360             else
4361             {
4362               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4363                 return 0;
4364             }
4365           }
4366         }
4367         else // Same register but is it 32-bit or dirty?
4368         if(i_regmap[hr]>=0)
4369         {
4370           if(!((regs[t].dirty>>hr)&1))
4371           {
4372             if((i_dirty>>hr)&1)
4373             {
4374               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4375               {
4376                 //printf("%x: dirty no match\n",addr);
4377                 return 0;
4378               }
4379             }
4380           }
4381           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4382           {
4383             //printf("%x: is32 no match\n",addr);
4384             return 0;
4385           }
4386         }
4387       }
4388     }
4389     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4390     if(requires_32bit[t]&~i_is32) return 0;
4391     // Delay slots are not valid branch targets
4392     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4393     // Delay slots require additional processing, so do not match
4394     if(is_ds[t]) return 0;
4395   }
4396   else
4397   {
4398     int hr;
4399     for(hr=0;hr<HOST_REGS;hr++)
4400     {
4401       if(hr!=EXCLUDE_REG)
4402       {
4403         if(i_regmap[hr]>=0)
4404         {
4405           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4406           {
4407             if((i_dirty>>hr)&1)
4408             {
4409               return 0;
4410             }
4411           }
4412         }
4413       }
4414     }
4415   }
4416   return 1;
4417 }
4418
4419 // Used when a branch jumps into the delay slot of another branch
4420 void ds_assemble_entry(int i)
4421 {
4422   int t=(ba[i]-start)>>2;
4423   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4424   assem_debug("Assemble delay slot at %x\n",ba[i]);
4425   assem_debug("<->\n");
4426   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4427     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4428   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4429   address_generation(t,&regs[t],regs[t].regmap_entry);
4430   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39)
4431     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4432   cop1_usable=0;
4433   is_delayslot=0;
4434   switch(itype[t]) {
4435     case ALU:
4436       alu_assemble(t,&regs[t]);break;
4437     case IMM16:
4438       imm16_assemble(t,&regs[t]);break;
4439     case SHIFT:
4440       shift_assemble(t,&regs[t]);break;
4441     case SHIFTIMM:
4442       shiftimm_assemble(t,&regs[t]);break;
4443     case LOAD:
4444       load_assemble(t,&regs[t]);break;
4445     case LOADLR:
4446       loadlr_assemble(t,&regs[t]);break;
4447     case STORE:
4448       store_assemble(t,&regs[t]);break;
4449     case STORELR:
4450       storelr_assemble(t,&regs[t]);break;
4451     case COP0:
4452       cop0_assemble(t,&regs[t]);break;
4453     case COP1:
4454       cop1_assemble(t,&regs[t]);break;
4455     case C1LS:
4456       c1ls_assemble(t,&regs[t]);break;
4457     case FCONV:
4458       fconv_assemble(t,&regs[t]);break;
4459     case FLOAT:
4460       float_assemble(t,&regs[t]);break;
4461     case FCOMP:
4462       fcomp_assemble(t,&regs[t]);break;
4463     case MULTDIV:
4464       multdiv_assemble(t,&regs[t]);break;
4465     case MOV:
4466       mov_assemble(t,&regs[t]);break;
4467     case SYSCALL:
4468     case SPAN:
4469     case UJUMP:
4470     case RJUMP:
4471     case CJUMP:
4472     case SJUMP:
4473     case FJUMP:
4474       printf("Jump in the delay slot.  This is probably a bug.\n");
4475   }
4476   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4477   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4478   if(internal_branch(regs[t].is32,ba[i]+4))
4479     assem_debug("branch: internal\n");
4480   else
4481     assem_debug("branch: external\n");
4482   assert(internal_branch(regs[t].is32,ba[i]+4));
4483   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4484   emit_jmp(0);
4485 }
4486
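// Emit the cycle-count check for a branch: add the cycles consumed so far to
// the (negative) cycle counter and branch to a CC_STUB once it reaches zero,
// roughly:
//     add  cc, cc, CLOCK_DIVIDER*(count+2)   ; sets flags
//     jns  <CC_STUB>                          ; event due -> cc_interrupt
// Idle loops (a branch back to itself) are detected and the counter is pushed
// up to the next event rather than being counted down one pass at a time.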
4487 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4488 {
4489   int count;
4490   int jaddr;
4491   int idle=0;
4492   if(itype[i]==RJUMP)
4493   {
4494     *adj=0;
4495   }
4496   //if(ba[i]>=start && ba[i]<(start+slen*4))
4497   if(internal_branch(branch_regs[i].is32,ba[i]))
4498   {
4499     int t=(ba[i]-start)>>2;
4500     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4501     else *adj=ccadj[t];
4502   }
4503   else
4504   {
4505     *adj=0;
4506   }
4507   count=ccadj[i];
4508   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4509     // Idle loop
4510     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4511     idle=(int)out;
4512     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4513     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4514     jaddr=(int)out;
4515     emit_jmp(0);
4516   }
4517   else if(*adj==0||invert) {
4518     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4519     jaddr=(int)out;
4520     emit_jns(0);
4521   }
4522   else
4523   {
4524     emit_cmpimm(HOST_CCREG,-2*(count+2));
4525     jaddr=(int)out;
4526     emit_jns(0);
4527   }
4528   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4529 }
4530
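// Assemble an out-of-line CC_STUB: write back the relevant register state,
// store the address execution should resume at (for conditional branches this
// depends on whether the branch is taken), call cc_interrupt, then reload
// registers and return to the compiled code.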
4531 void do_ccstub(int n)
4532 {
4533   literal_pool(256);
4534   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4535   set_jump_target(stubs[n][1],(int)out);
4536   int i=stubs[n][4];
4537   if(stubs[n][6]==NULLDS) {
4538     // Delay slot instruction is nullified ("likely" branch)
4539     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4540   }
4541   else if(stubs[n][6]!=TAKEN) {
4542     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4543   }
4544   else {
4545     if(internal_branch(branch_regs[i].is32,ba[i]))
4546       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4547   }
4548   if(stubs[n][5]!=-1)
4549   {
4550     // Save PC as return address
4551     emit_movimm(stubs[n][5],EAX);
4552     emit_writeword(EAX,(int)&pcaddr);
4553   }
4554   else
4555   {
4556     // Return address depends on which way the branch goes
4557     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4558     {
4559       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4560       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4561       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4562       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4563       if(rs1[i]==0)
4564       {
4565         s1l=s2l;s1h=s2h;
4566         s2l=s2h=-1;
4567       }
4568       else if(rs2[i]==0)
4569       {
4570         s2l=s2h=-1;
4571       }
4572       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4573         s1h=s2h=-1;
4574       }
4575       assert(s1l>=0);
4576       #ifdef DESTRUCTIVE_WRITEBACK
4577       if(rs1[i]) {
4578         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4579           emit_loadreg(rs1[i],s1l);
4580       } 
4581       else {
4582         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4583           emit_loadreg(rs2[i],s1l);
4584       }
4585       if(s2l>=0)
4586         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4587           emit_loadreg(rs2[i],s2l);
4588       #endif
4589       int hr=0;
4590       int addr,alt,ntaddr;
4591       while(hr<HOST_REGS)
4592       {
4593         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4594            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4595            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4596         {
4597           addr=hr++;break;
4598         }
4599         hr++;
4600       }
4601       while(hr<HOST_REGS)
4602       {
4603         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4604            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4605            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4606         {
4607           alt=hr++;break;
4608         }
4609         hr++;
4610       }
4611       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4612       {
4613         while(hr<HOST_REGS)
4614         {
4615           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4616              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4617              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4618           {
4619             ntaddr=hr;break;
4620           }
4621           hr++;
4622         }
4623         assert(hr<HOST_REGS);
4624       }
4625       if((opcode[i]&0x2f)==4) // BEQ
4626       {
4627         #ifdef HAVE_CMOV_IMM
4628         if(s1h<0) {
4629           if(s2l>=0) emit_cmp(s1l,s2l);
4630           else emit_test(s1l,s1l);
4631           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4632         }
4633         else
4634         #endif
4635         {
4636           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4637           if(s1h>=0) {
4638             if(s2h>=0) emit_cmp(s1h,s2h);
4639             else emit_test(s1h,s1h);
4640             emit_cmovne_reg(alt,addr);
4641           }
4642           if(s2l>=0) emit_cmp(s1l,s2l);
4643           else emit_test(s1l,s1l);
4644           emit_cmovne_reg(alt,addr);
4645         }
4646       }
4647       if((opcode[i]&0x2f)==5) // BNE
4648       {
4649         #ifdef HAVE_CMOV_IMM
4650         if(s1h<0) {
4651           if(s2l>=0) emit_cmp(s1l,s2l);
4652           else emit_test(s1l,s1l);
4653           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4654         }
4655         else
4656         #endif
4657         {
4658           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4659           if(s1h>=0) {
4660             if(s2h>=0) emit_cmp(s1h,s2h);
4661             else emit_test(s1h,s1h);
4662             emit_cmovne_reg(alt,addr);
4663           }
4664           if(s2l>=0) emit_cmp(s1l,s2l);
4665           else emit_test(s1l,s1l);
4666           emit_cmovne_reg(alt,addr);
4667         }
4668       }
4669       if((opcode[i]&0x2f)==6) // BLEZ
4670       {
4671         //emit_movimm(ba[i],alt);
4672         //emit_movimm(start+i*4+8,addr);
4673         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4674         emit_cmpimm(s1l,1);
4675         if(s1h>=0) emit_mov(addr,ntaddr);
4676         emit_cmovl_reg(alt,addr);
4677         if(s1h>=0) {
4678           emit_test(s1h,s1h);
4679           emit_cmovne_reg(ntaddr,addr);
4680           emit_cmovs_reg(alt,addr);
4681         }
4682       }
4683       if((opcode[i]&0x2f)==7) // BGTZ
4684       {
4685         //emit_movimm(ba[i],addr);
4686         //emit_movimm(start+i*4+8,ntaddr);
4687         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4688         emit_cmpimm(s1l,1);
4689         if(s1h>=0) emit_mov(addr,alt);
4690         emit_cmovl_reg(ntaddr,addr);
4691         if(s1h>=0) {
4692           emit_test(s1h,s1h);
4693           emit_cmovne_reg(alt,addr);
4694           emit_cmovs_reg(ntaddr,addr);
4695         }
4696       }
4697       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4698       {
4699         //emit_movimm(ba[i],alt);
4700         //emit_movimm(start+i*4+8,addr);
4701         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4702         if(s1h>=0) emit_test(s1h,s1h);
4703         else emit_test(s1l,s1l);
4704         emit_cmovs_reg(alt,addr);
4705       }
4706       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4707       {
4708         //emit_movimm(ba[i],addr);
4709         //emit_movimm(start+i*4+8,alt);
4710         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4711         if(s1h>=0) emit_test(s1h,s1h);
4712         else emit_test(s1l,s1l);
4713         emit_cmovs_reg(alt,addr);
4714       }
4715       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4716         if(source[i]&0x10000) // BC1T
4717         {
4718           //emit_movimm(ba[i],alt);
4719           //emit_movimm(start+i*4+8,addr);
4720           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4721           emit_testimm(s1l,0x800000);
4722           emit_cmovne_reg(alt,addr);
4723         }
4724         else // BC1F
4725         {
4726           //emit_movimm(ba[i],addr);
4727           //emit_movimm(start+i*4+8,alt);
4728           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4729           emit_testimm(s1l,0x800000);
4730           emit_cmovne_reg(alt,addr);
4731         }
4732       }
4733       emit_writeword(addr,(int)&pcaddr);
4734     }
4735     else
4736     if(itype[i]==RJUMP)
4737     {
4738       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4739       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4740         r=get_reg(branch_regs[i].regmap,RTEMP);
4741       }
4742       emit_writeword(r,(int)&pcaddr);
4743     }
4744     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4745   }
4746   // Update cycle count
4747   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4748   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4749   emit_call((int)cc_interrupt);
4750   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4751   if(stubs[n][6]==TAKEN) {
4752     if(internal_branch(branch_regs[i].is32,ba[i]))
4753       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4754     else if(itype[i]==RJUMP) {
4755       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4756         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4757       else
4758         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4759     }
4760   }else if(stubs[n][6]==NOTTAKEN) {
4761     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4762     else load_all_regs(branch_regs[i].regmap);
4763   }else if(stubs[n][6]==NULLDS) {
4764     // Delay slot instruction is nullified ("likely" branch)
4765     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4766     else load_all_regs(regs[i].regmap);
4767   }else{
4768     load_all_regs(branch_regs[i].regmap);
4769   }
4770   emit_jmp(stubs[n][2]); // return address
4771   
4772   /* This works but uses a lot of memory...
4773   emit_readword((int)&last_count,ECX);
4774   emit_add(HOST_CCREG,ECX,EAX);
4775   emit_writeword(EAX,(int)&Count);
4776   emit_call((int)gen_interupt);
4777   emit_readword((int)&Count,HOST_CCREG);
4778   emit_readword((int)&next_interupt,EAX);
4779   emit_readword((int)&pending_exception,EBX);
4780   emit_writeword(EAX,(int)&last_count);
4781   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4782   emit_test(EBX,EBX);
4783   int jne_instr=(int)out;
4784   emit_jne(0);
4785   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4786   load_all_regs(branch_regs[i].regmap);
4787   emit_jmp(stubs[n][2]); // return address
4788   set_jump_target(jne_instr,(int)out);
4789   emit_readword((int)&pcaddr,EAX);
4790   // Call get_addr_ht instead of doing the hash table here.
4791   // This code is executed infrequently and takes up a lot of space
4792   // so smaller is better.
4793   emit_storereg(CCREG,HOST_CCREG);
4794   emit_pushreg(EAX);
4795   emit_call((int)get_addr_ht);
4796   emit_loadreg(CCREG,HOST_CCREG);
4797   emit_addimm(ESP,4,ESP);
4798   emit_jmpreg(EAX);*/
4799 }
4800
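// Record a branch location and its target so it can be patched once the
// target's compiled address is known.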
4801 add_to_linker(int addr,int target,int ext)
4802 {
4803   link_addr[linkcount][0]=addr;
4804   link_addr[linkcount][1]=target;
4805   link_addr[linkcount][2]=ext;  
4806   linkcount++;
4807 }
4808
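// Assemble an unconditional jump (J/JAL): the delay slot is assembled first,
// the return address is written for JAL and registers are flushed/loaded for
// the target; if the target is another branch's delay slot that entry is
// assembled inline, otherwise a linkable jump is emitted.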
4809 void ujump_assemble(int i,struct regstat *i_regs)
4810 {
4811   signed char *i_regmap=i_regs->regmap;
4812   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4813   address_generation(i+1,i_regs,regs[i].regmap_entry);
4814   #ifdef REG_PREFETCH
4815   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4816   if(rt1[i]==31&&temp>=0) 
4817   {
4818     int return_address=start+i*4+8;
4819     if(get_reg(branch_regs[i].regmap,31)>0) 
4820     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4821   }
4822   #endif
4823   ds_assemble(i+1,i_regs);
4824   uint64_t bc_unneeded=branch_regs[i].u;
4825   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4826   bc_unneeded|=1|(1LL<<rt1[i]);
4827   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4828   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4829                 bc_unneeded,bc_unneeded_upper);
4830   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4831   if(rt1[i]==31) {
4832     int rt;
4833     unsigned int return_address;
4834     assert(rt1[i+1]!=31);
4835     assert(rt2[i+1]!=31);
4836     rt=get_reg(branch_regs[i].regmap,31);
4837     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4838     //assert(rt>=0);
4839     return_address=start+i*4+8;
4840     if(rt>=0) {
4841       #ifdef USE_MINI_HT
4842       if(internal_branch(branch_regs[i].is32,return_address)) {
4843         int temp=rt+1;
4844         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
4845            branch_regs[i].regmap[temp]>=0)
4846         {
4847           temp=get_reg(branch_regs[i].regmap,-1);
4848         }
4849         #ifdef HOST_TEMPREG
4850         if(temp<0) temp=HOST_TEMPREG;
4851         #endif
4852         if(temp>=0) do_miniht_insert(return_address,rt,temp);
4853         else emit_movimm(return_address,rt);
4854       }
4855       else
4856       #endif
4857       {
4858         #ifdef REG_PREFETCH
4859         if(temp>=0) 
4860         {
4861           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4862         }
4863         #endif
4864         emit_movimm(return_address,rt); // PC into link register
4865         #ifdef IMM_PREFETCH
4866         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4867         #endif
4868       }
4869     }
4870   }
4871   int cc,adj;
4872   cc=get_reg(branch_regs[i].regmap,CCREG);
4873   assert(cc==HOST_CCREG);
4874   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4875   #ifdef REG_PREFETCH
4876   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4877   #endif
4878   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4879   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4880   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4881   if(internal_branch(branch_regs[i].is32,ba[i]))
4882     assem_debug("branch: internal\n");
4883   else
4884     assem_debug("branch: external\n");
4885   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4886     ds_assemble_entry(i);
4887   }
4888   else {
4889     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4890     emit_jmp(0);
4891   }
4892 }
4893
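// Assemble a register jump (JR/JALR): the target is only known at run time,
// so after the delay slot the address register is handed to jump_vaddr
// (or looked up in the mini hash table for JR $31 when USE_MINI_HT is set).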
4894 void rjump_assemble(int i,struct regstat *i_regs)
4895 {
4896   signed char *i_regmap=i_regs->regmap;
4897   int temp;
4898   int rs,cc,adj;
4899   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4900   assert(rs>=0);
4901   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4902     // The delay slot overwrites the branch address register, so make a copy
4903     temp=get_reg(branch_regs[i].regmap,RTEMP);
4904     assert(temp>=0);
4905     assert(regs[i].regmap[temp]==RTEMP);
4906     emit_mov(rs,temp);
4907     rs=temp;
4908   }
4909   address_generation(i+1,i_regs,regs[i].regmap_entry);
4910   #ifdef REG_PREFETCH
4911   if(rt1[i]==31) 
4912   {
4913     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4914       int return_address=start+i*4+8;
4915       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4916     }
4917   }
4918   #endif
4919   #ifdef USE_MINI_HT
4920   if(rs1[i]==31) {
4921     int rh=get_reg(regs[i].regmap,RHASH);
4922     if(rh>=0) do_preload_rhash(rh);
4923   }
4924   #endif
4925   ds_assemble(i+1,i_regs);
4926   uint64_t bc_unneeded=branch_regs[i].u;
4927   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4928   bc_unneeded|=1|(1LL<<rt1[i]);
4929   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4930   bc_unneeded&=~(1LL<<rs1[i]);
4931   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4932                 bc_unneeded,bc_unneeded_upper);
4933   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4934   if(rt1[i]==31) {
4935     int rt,return_address;
4936     assert(rt1[i+1]!=31);
4937     assert(rt2[i+1]!=31);
4938     rt=get_reg(branch_regs[i].regmap,31);
4939     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4940     assert(rt>=0);
4941     return_address=start+i*4+8;
4942     #ifdef REG_PREFETCH
4943     if(temp>=0) 
4944     {
4945       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4946     }
4947     #endif
4948     emit_movimm(return_address,rt); // PC into link register
4949     #ifdef IMM_PREFETCH
4950     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4951     #endif
4952   }
4953   cc=get_reg(branch_regs[i].regmap,CCREG);
4954   assert(cc==HOST_CCREG);
4955   #ifdef USE_MINI_HT
4956   int rh=get_reg(branch_regs[i].regmap,RHASH);
4957   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4958   if(rs1[i]==31) {
4959     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4960     do_preload_rhtbl(ht);
4961     do_rhash(rs,rh);
4962   }
4963   #endif
4964   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4965   #ifdef DESTRUCTIVE_WRITEBACK
4966   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4967     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4968       emit_loadreg(rs1[i],rs);
4969     }
4970   }
4971   #endif
4972   #ifdef REG_PREFETCH
4973   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4974   #endif
4975   #ifdef USE_MINI_HT
4976   if(rs1[i]==31) {
4977     do_miniht_load(ht,rh);
4978   }
4979   #endif
4980   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4981   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4982   //assert(adj==0);
4983   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
4984   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4985   emit_jns(0);
4986   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4987   #ifdef USE_MINI_HT
4988   if(rs1[i]==31) {
4989     do_miniht_jump(rs,rh,ht);
4990   }
4991   else
4992   #endif
4993   {
4994     //if(rs!=EAX) emit_mov(rs,EAX);
4995     //emit_jmp((int)jump_vaddr_eax);
4996     emit_jmp(jump_vaddr_reg[rs]);
4997   }
4998   /* Check hash table
4999   temp=!rs;
5000   emit_mov(rs,temp);
5001   emit_shrimm(rs,16,rs);
5002   emit_xor(temp,rs,rs);
5003   emit_movzwl_reg(rs,rs);
5004   emit_shlimm(rs,4,rs);
5005   emit_cmpmem_indexed((int)hash_table,rs,temp);
5006   emit_jne((int)out+14);
5007   emit_readword_indexed((int)hash_table+4,rs,rs);
5008   emit_jmpreg(rs);
5009   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5010   emit_addimm_no_flags(8,rs);
5011   emit_jeq((int)out-17);
5012   // No hit on hash table, call compiler
5013   emit_pushreg(temp);
5014 //DEBUG >
5015 #ifdef DEBUG_CYCLE_COUNT
5016   emit_readword((int)&last_count,ECX);
5017   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5018   emit_readword((int)&next_interupt,ECX);
5019   emit_writeword(HOST_CCREG,(int)&Count);
5020   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5021   emit_writeword(ECX,(int)&last_count);
5022 #endif
5023 //DEBUG <
5024   emit_storereg(CCREG,HOST_CCREG);
5025   emit_call((int)get_addr);
5026   emit_loadreg(CCREG,HOST_CCREG);
5027   emit_addimm(ESP,4,ESP);
5028   emit_jmpreg(EAX);*/
5029   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5030   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5031   #endif
5032 }
5033
5034 void cjump_assemble(int i,struct regstat *i_regs)
5035 {
5036   signed char *i_regmap=i_regs->regmap;
5037   int cc;
5038   int match;
5039   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5040   assem_debug("match=%d\n",match);
5041   int s1h,s1l,s2h,s2l;
5042   int prev_cop1_usable=cop1_usable;
5043   int unconditional=0,nop=0;
5044   int only32=0;
5045   int ooo=1;
5046   int invert=0;
5047   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5048   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5049   if(likely[i]) ooo=0;
5050   if(!match) invert=1;
5051   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5052   if(i>(ba[i]-start)>>2) invert=1;
5053   #endif
5054     
5055   if(ooo)
5056     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5057        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5058   {
5059     // Write-after-read dependency prevents out of order execution
5060     // First test branch condition, then execute delay slot, then branch
5061     ooo=0;
5062   }
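  /* Illustrative case for the check above (not compiled):
   *   beq   $v0,$zero,target
   *   addiu $v0,$zero,1      ; delay slot writes a branch source
   * The comparison must see the pre-delay-slot value of $v0, so the branch
   * condition is evaluated first and the delay slot is assembled afterwards
   * (the in-order path below). */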
5063
5064   if(ooo) {
5065     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5066     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5067     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5068     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5069   }
5070   else {
5071     s1l=get_reg(i_regmap,rs1[i]);
5072     s1h=get_reg(i_regmap,rs1[i]|64);
5073     s2l=get_reg(i_regmap,rs2[i]);
5074     s2h=get_reg(i_regmap,rs2[i]|64);
5075   }
5076   if(rs1[i]==0&&rs2[i]==0)
5077   {
5078     if(opcode[i]&1) nop=1;
5079     else unconditional=1;
5080     //assert(opcode[i]!=5);
5081     //assert(opcode[i]!=7);
5082     //assert(opcode[i]!=0x15);
5083     //assert(opcode[i]!=0x17);
5084   }
5085   else if(rs1[i]==0)
5086   {
5087     s1l=s2l;s1h=s2h;
5088     s2l=s2h=-1;
5089     only32=(regs[i].was32>>rs2[i])&1;
5090   }
5091   else if(rs2[i]==0)
5092   {
5093     s2l=s2h=-1;
5094     only32=(regs[i].was32>>rs1[i])&1;
5095   }
5096   else {
5097     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5098   }
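  /* only32 is set when the source registers are known to hold sign-extended
   * 32-bit values; in that case the high-word compare (s1h/s2h) below can be
   * skipped and only the low words need to be tested. */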
5099
5100   if(ooo) {
5101     // Out of order execution (delay slot first)
5102     //printf("OOOE\n");
5103     address_generation(i+1,i_regs,regs[i].regmap_entry);
5104     ds_assemble(i+1,i_regs);
5105     int adj;
5106     uint64_t bc_unneeded=branch_regs[i].u;
5107     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5108     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5109     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5110     bc_unneeded|=1;
5111     bc_unneeded_upper|=1;
5112     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5113                   bc_unneeded,bc_unneeded_upper);
5114     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5115     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5116     cc=get_reg(branch_regs[i].regmap,CCREG);
5117     assert(cc==HOST_CCREG);
5118     if(unconditional) 
5119       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5120     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5121     //assem_debug("cycle count (adj)\n");
5122     if(unconditional) {
5123       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5124       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5125         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5126         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5127         if(internal)
5128           assem_debug("branch: internal\n");
5129         else
5130           assem_debug("branch: external\n");
5131         if(internal&&is_ds[(ba[i]-start)>>2]) {
5132           ds_assemble_entry(i);
5133         }
5134         else {
5135           add_to_linker((int)out,ba[i],internal);
5136           emit_jmp(0);
5137         }
5138         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5139         if(((u_int)out)&7) emit_addnop(0);
5140         #endif
5141       }
5142     }
5143     else if(nop) {
5144       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5145       int jaddr=(int)out;
5146       emit_jns(0);
5147       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5148     }
5149     else {
5150       int taken=0,nottaken=0,nottaken1=0;
5151       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5152       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5153       if(!only32)
5154       {
5155         assert(s1h>=0);
5156         if(opcode[i]==4) // BEQ
5157         {
5158           if(s2h>=0) emit_cmp(s1h,s2h);
5159           else emit_test(s1h,s1h);
5160           nottaken1=(int)out;
5161           emit_jne(1);
5162         }
5163         if(opcode[i]==5) // BNE
5164         {
5165           if(s2h>=0) emit_cmp(s1h,s2h);
5166           else emit_test(s1h,s1h);
5167           if(invert) taken=(int)out;
5168           else add_to_linker((int)out,ba[i],internal);
5169           emit_jne(0);
5170         }
5171         if(opcode[i]==6) // BLEZ
5172         {
5173           emit_test(s1h,s1h);
5174           if(invert) taken=(int)out;
5175           else add_to_linker((int)out,ba[i],internal);
5176           emit_js(0);
5177           nottaken1=(int)out;
5178           emit_jne(1);
5179         }
5180         if(opcode[i]==7) // BGTZ
5181         {
5182           emit_test(s1h,s1h);
5183           nottaken1=(int)out;
5184           emit_js(1);
5185           if(invert) taken=(int)out;
5186           else add_to_linker((int)out,ba[i],internal);
5187           emit_jne(0);
5188         }
5189       } // if(!only32)
5190           
5191       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5192       assert(s1l>=0);
5193       if(opcode[i]==4) // BEQ
5194       {
5195         if(s2l>=0) emit_cmp(s1l,s2l);
5196         else emit_test(s1l,s1l);
5197         if(invert){
5198           nottaken=(int)out;
5199           emit_jne(1);
5200         }else{
5201           add_to_linker((int)out,ba[i],internal);
5202           emit_jeq(0);
5203         }
5204       }
5205       if(opcode[i]==5) // BNE
5206       {
5207         if(s2l>=0) emit_cmp(s1l,s2l);
5208         else emit_test(s1l,s1l);
5209         if(invert){
5210           nottaken=(int)out;
5211           emit_jeq(1);
5212         }else{
5213           add_to_linker((int)out,ba[i],internal);
5214           emit_jne(0);
5215         }
5216       }
5217       if(opcode[i]==6) // BLEZ
5218       {
5219         emit_cmpimm(s1l,1);
5220         if(invert){
5221           nottaken=(int)out;
5222           emit_jge(1);
5223         }else{
5224           add_to_linker((int)out,ba[i],internal);
5225           emit_jl(0);
5226         }
5227       }
5228       if(opcode[i]==7) // BGTZ
5229       {
5230         emit_cmpimm(s1l,1);
5231         if(invert){
5232           nottaken=(int)out;
5233           emit_jl(1);
5234         }else{
5235           add_to_linker((int)out,ba[i],internal);
5236           emit_jge(0);
5237         }
5238       }
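      /* Note: BLEZ/BGTZ are implemented as a signed compare against the
       * constant 1, since "x <= 0" is equivalent to "x < 1"; this lets the
       * taken/not-taken cases use emit_jl/emit_jge directly. */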
5239       if(invert) {
5240         if(taken) set_jump_target(taken,(int)out);
5241         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5242         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5243           if(adj) {
5244             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5245             add_to_linker((int)out,ba[i],internal);
5246           }else{
5247             emit_addnop(13);
5248             add_to_linker((int)out,ba[i],internal*2);
5249           }
5250           emit_jmp(0);
5251         }else
5252         #endif
5253         {
5254           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5255           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5256           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5257           if(internal)
5258             assem_debug("branch: internal\n");
5259           else
5260             assem_debug("branch: external\n");
5261           if(internal&&is_ds[(ba[i]-start)>>2]) {
5262             ds_assemble_entry(i);
5263           }
5264           else {
5265             add_to_linker((int)out,ba[i],internal);
5266             emit_jmp(0);
5267           }
5268         }
5269         set_jump_target(nottaken,(int)out);
5270       }
5271
5272       if(nottaken1) set_jump_target(nottaken1,(int)out);
5273       if(adj) {
5274         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5275       }
5276     } // (!unconditional)
5277   } // if(ooo)
5278   else
5279   {
5280     // In-order execution (branch first)
5281     //if(likely[i]) printf("IOL\n");
5282     //else
5283     //printf("IOE\n");
5284     int taken=0,nottaken=0,nottaken1=0;
5285     if(!unconditional&&!nop) {
5286       if(!only32)
5287       {
5288         assert(s1h>=0);
5289         if((opcode[i]&0x2f)==4) // BEQ
5290         {
5291           if(s2h>=0) emit_cmp(s1h,s2h);
5292           else emit_test(s1h,s1h);
5293           nottaken1=(int)out;
5294           emit_jne(2);
5295         }
5296         if((opcode[i]&0x2f)==5) // BNE
5297         {
5298           if(s2h>=0) emit_cmp(s1h,s2h);
5299           else emit_test(s1h,s1h);
5300           taken=(int)out;
5301           emit_jne(1);
5302         }
5303         if((opcode[i]&0x2f)==6) // BLEZ
5304         {
5305           emit_test(s1h,s1h);
5306           taken=(int)out;
5307           emit_js(1);
5308           nottaken1=(int)out;
5309           emit_jne(2);
5310         }
5311         if((opcode[i]&0x2f)==7) // BGTZ
5312         {
5313           emit_test(s1h,s1h);
5314           nottaken1=(int)out;
5315           emit_js(2);
5316           taken=(int)out;
5317           emit_jne(1);
5318         }
5319       } // if(!only32)
5320           
5321       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5322       assert(s1l>=0);
5323       if((opcode[i]&0x2f)==4) // BEQ
5324       {
5325         if(s2l>=0) emit_cmp(s1l,s2l);
5326         else emit_test(s1l,s1l);
5327         nottaken=(int)out;
5328         emit_jne(2);
5329       }
5330       if((opcode[i]&0x2f)==5) // BNE
5331       {
5332         if(s2l>=0) emit_cmp(s1l,s2l);
5333         else emit_test(s1l,s1l);
5334         nottaken=(int)out;
5335         emit_jeq(2);
5336       }
5337       if((opcode[i]&0x2f)==6) // BLEZ
5338       {
5339         emit_cmpimm(s1l,1);
5340         nottaken=(int)out;
5341         emit_jge(2);
5342       }
5343       if((opcode[i]&0x2f)==7) // BGTZ
5344       {
5345         emit_cmpimm(s1l,1);
5346         nottaken=(int)out;
5347         emit_jl(2);
5348       }
5349     } // if(!unconditional)
5350     int adj;
5351     uint64_t ds_unneeded=branch_regs[i].u;
5352     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5353     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5354     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5355     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5356     ds_unneeded|=1;
5357     ds_unneeded_upper|=1;
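    /* Example of the masks above (illustrative): if the delay slot is
     * "addu $t0,$a0,$a1", bits 4 and 5 ($a0/$a1) are cleared from ds_unneeded
     * because the delay slot still reads them, while bit 0 ($zero) is always
     * set since r0 never needs to be preserved. */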
5358     // branch taken
5359     if(!nop) {
5360       if(taken) set_jump_target(taken,(int)out);
5361       assem_debug("1:\n");
5362       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5363                     ds_unneeded,ds_unneeded_upper);
5364       // load regs
5365       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5366       address_generation(i+1,&branch_regs[i],0);
5367       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5368       ds_assemble(i+1,&branch_regs[i]);
5369       cc=get_reg(branch_regs[i].regmap,CCREG);
5370       if(cc==-1) {
5371         emit_loadreg(CCREG,cc=HOST_CCREG);
5372         // CHECK: Is the following instruction (fall thru) allocated ok?
5373       }
5374       assert(cc==HOST_CCREG);
5375       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5376       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5377       assem_debug("cycle count (adj)\n");
5378       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5379       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5380       if(internal)
5381         assem_debug("branch: internal\n");
5382       else
5383         assem_debug("branch: external\n");
5384       if(internal&&is_ds[(ba[i]-start)>>2]) {
5385         ds_assemble_entry(i);
5386       }
5387       else {
5388         add_to_linker((int)out,ba[i],internal);
5389         emit_jmp(0);
5390       }
5391     }
5392     // branch not taken
5393     cop1_usable=prev_cop1_usable;
5394     if(!unconditional) {
5395       if(nottaken1) set_jump_target(nottaken1,(int)out);
5396       set_jump_target(nottaken,(int)out);
5397       assem_debug("2:\n");
5398       if(!likely[i]) {
5399         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5400                       ds_unneeded,ds_unneeded_upper);
5401         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5402         address_generation(i+1,&branch_regs[i],0);
5403         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5404         ds_assemble(i+1,&branch_regs[i]);
5405       }
5406       cc=get_reg(branch_regs[i].regmap,CCREG);
5407       if(cc==-1&&!likely[i]) {
5408         // Cycle count isn't in a register, temporarily load it then write it out
5409         emit_loadreg(CCREG,HOST_CCREG);
5410         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5411         int jaddr=(int)out;
5412         emit_jns(0);
5413         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5414         emit_storereg(CCREG,HOST_CCREG);
5415       }
5416       else{
5417         cc=get_reg(i_regmap,CCREG);
5418         assert(cc==HOST_CCREG);
5419         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5420         int jaddr=(int)out;
5421         emit_jns(0);
5422         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5423       }
5424     }
5425   }
5426 }
5427
5428 void sjump_assemble(int i,struct regstat *i_regs)
5429 {
5430   signed char *i_regmap=i_regs->regmap;
5431   int cc;
5432   int match;
5433   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5434   assem_debug("smatch=%d\n",match);
5435   int s1h,s1l;
5436   int prev_cop1_usable=cop1_usable;
5437   int unconditional=0,nevertaken=0;
5438   int only32=0;
5439   int ooo=1;
5440   int invert=0;
5441   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5442   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5443   if(likely[i]) ooo=0;
5444   if(!match) invert=1;
5445   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5446   if(i>(ba[i]-start)>>2) invert=1;
5447   #endif
5448
5449   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5450   assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5451
5452   if(ooo)
5453     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5454   {
5455     // Write-after-read dependency prevents out of order execution
5456     // First test branch condition, then execute delay slot, then branch
5457     ooo=0;
5458   }
5459   // TODO: Conditional branches w/link must execute in-order so that
5460   // condition test and write to r31 occur before cycle count test
5461
5462   if(ooo) {
5463     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5464     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5465   }
5466   else {
5467     s1l=get_reg(i_regmap,rs1[i]);
5468     s1h=get_reg(i_regmap,rs1[i]|64);
5469   }
5470   if(rs1[i]==0)
5471   {
5472     if(opcode2[i]&1) unconditional=1;
5473     else nevertaken=1;
5474     // These are never taken (r0 is never less than zero)
5475     //assert(opcode2[i]!=0);
5476     //assert(opcode2[i]!=2);
5477     //assert(opcode2[i]!=0x10);
5478     //assert(opcode2[i]!=0x12);
5479   }
5480   else {
5481     only32=(regs[i].was32>>rs1[i])&1;
5482   }
5483
5484   if(ooo) {
5485     // Out of order execution (delay slot first)
5486     //printf("OOOE\n");
5487     address_generation(i+1,i_regs,regs[i].regmap_entry);
5488     ds_assemble(i+1,i_regs);
5489     int adj;
5490     uint64_t bc_unneeded=branch_regs[i].u;
5491     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5492     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5493     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5494     bc_unneeded|=1;
5495     bc_unneeded_upper|=1;
5496     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5497                   bc_unneeded,bc_unneeded_upper);
5498     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5499     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5500     if(rt1[i]==31) {
5501       int rt,return_address;
5502       assert(rt1[i+1]!=31);
5503       assert(rt2[i+1]!=31);
5504       rt=get_reg(branch_regs[i].regmap,31);
5505       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5506       if(rt>=0) {
5507         // Save the PC even if the branch is not taken
5508         return_address=start+i*4+8;
5509         emit_movimm(return_address,rt); // PC into link register
5510         #ifdef IMM_PREFETCH
5511         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5512         #endif
5513       }
5514     }
5515     cc=get_reg(branch_regs[i].regmap,CCREG);
5516     assert(cc==HOST_CCREG);
5517     if(unconditional) 
5518       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5519     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5520     assem_debug("cycle count (adj)\n");
5521     if(unconditional) {
5522       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5523       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5524         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5525         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5526         if(internal)
5527           assem_debug("branch: internal\n");
5528         else
5529           assem_debug("branch: external\n");
5530         if(internal&&is_ds[(ba[i]-start)>>2]) {
5531           ds_assemble_entry(i);
5532         }
5533         else {
5534           add_to_linker((int)out,ba[i],internal);
5535           emit_jmp(0);
5536         }
5537         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5538         if(((u_int)out)&7) emit_addnop(0);
5539         #endif
5540       }
5541     }
5542     else if(nevertaken) {
5543       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5544       int jaddr=(int)out;
5545       emit_jns(0);
5546       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5547     }
5548     else {
5549       int nottaken=0;
5550       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5551       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5552       if(!only32)
5553       {
5554         assert(s1h>=0);
5555         if(opcode2[i]==0) // BLTZ
5556         {
5557           emit_test(s1h,s1h);
5558           if(invert){
5559             nottaken=(int)out;
5560             emit_jns(1);
5561           }else{
5562             add_to_linker((int)out,ba[i],internal);
5563             emit_js(0);
5564           }
5565         }
5566         if(opcode2[i]==1) // BGEZ
5567         {
5568           emit_test(s1h,s1h);
5569           if(invert){
5570             nottaken=(int)out;
5571             emit_js(1);
5572           }else{
5573             add_to_linker((int)out,ba[i],internal);
5574             emit_jns(0);
5575           }
5576         }
5577       } // if(!only32)
5578       else
5579       {
5580         assert(s1l>=0);
5581         if(opcode2[i]==0) // BLTZ
5582         {
5583           emit_test(s1l,s1l);
5584           if(invert){
5585             nottaken=(int)out;
5586             emit_jns(1);
5587           }else{
5588             add_to_linker((int)out,ba[i],internal);
5589             emit_js(0);
5590           }
5591         }
5592         if(opcode2[i]==1) // BGEZ
5593         {
5594           emit_test(s1l,s1l);
5595           if(invert){
5596             nottaken=(int)out;
5597             emit_js(1);
5598           }else{
5599             add_to_linker((int)out,ba[i],internal);
5600             emit_jns(0);
5601           }
5602         }
5603       } // if(!only32)
5604           
5605       if(invert) {
5606         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5607         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5608           if(adj) {
5609             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5610             add_to_linker((int)out,ba[i],internal);
5611           }else{
5612             emit_addnop(13);
5613             add_to_linker((int)out,ba[i],internal*2);
5614           }
5615           emit_jmp(0);
5616         }else
5617         #endif
5618         {
5619           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5620           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5621           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5622           if(internal)
5623             assem_debug("branch: internal\n");
5624           else
5625             assem_debug("branch: external\n");
5626           if(internal&&is_ds[(ba[i]-start)>>2]) {
5627             ds_assemble_entry(i);
5628           }
5629           else {
5630             add_to_linker((int)out,ba[i],internal);
5631             emit_jmp(0);
5632           }
5633         }
5634         set_jump_target(nottaken,(int)out);
5635       }
5636
5637       if(adj) {
5638         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5639       }
5640     } // (!unconditional)
5641   } // if(ooo)
5642   else
5643   {
5644     // In-order execution (branch first)
5645     //printf("IOE\n");
5646     int nottaken=0;
5647     if(!unconditional) {
5648       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5649       if(!only32)
5650       {
5651         assert(s1h>=0);
5652         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5653         {
5654           emit_test(s1h,s1h);
5655           nottaken=(int)out;
5656           emit_jns(1);
5657         }
5658         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5659         {
5660           emit_test(s1h,s1h);
5661           nottaken=(int)out;
5662           emit_js(1);
5663         }
5664       } // if(!only32)
5665       else
5666       {
5667         assert(s1l>=0);
5668         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5669         {
5670           emit_test(s1l,s1l);
5671           nottaken=(int)out;
5672           emit_jns(1);
5673         }
5674         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5675         {
5676           emit_test(s1l,s1l);
5677           nottaken=(int)out;
5678           emit_js(1);
5679         }
5680       }
5681     } // if(!unconditional)
5682     int adj;
5683     uint64_t ds_unneeded=branch_regs[i].u;
5684     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5685     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5686     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5687     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5688     ds_unneeded|=1;
5689     ds_unneeded_upper|=1;
5690     // branch taken
5691     if(!nevertaken) {
5692       //assem_debug("1:\n");
5693       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5694                     ds_unneeded,ds_unneeded_upper);
5695       // load regs
5696       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5697       address_generation(i+1,&branch_regs[i],0);
5698       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5699       ds_assemble(i+1,&branch_regs[i]);
5700       cc=get_reg(branch_regs[i].regmap,CCREG);
5701       if(cc==-1) {
5702         emit_loadreg(CCREG,cc=HOST_CCREG);
5703         // CHECK: Is the following instruction (fall thru) allocated ok?
5704       }
5705       assert(cc==HOST_CCREG);
5706       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5707       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5708       assem_debug("cycle count (adj)\n");
5709       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5710       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5711       if(internal)
5712         assem_debug("branch: internal\n");
5713       else
5714         assem_debug("branch: external\n");
5715       if(internal&&is_ds[(ba[i]-start)>>2]) {
5716         ds_assemble_entry(i);
5717       }
5718       else {
5719         add_to_linker((int)out,ba[i],internal);
5720         emit_jmp(0);
5721       }
5722     }
5723     // branch not taken
5724     cop1_usable=prev_cop1_usable;
5725     if(!unconditional) {
5726       set_jump_target(nottaken,(int)out);
5727       assem_debug("1:\n");
5728       if(!likely[i]) {
5729         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5730                       ds_unneeded,ds_unneeded_upper);
5731         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5732         address_generation(i+1,&branch_regs[i],0);
5733         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5734         ds_assemble(i+1,&branch_regs[i]);
5735       }
5736       cc=get_reg(branch_regs[i].regmap,CCREG);
5737       if(cc==-1&&!likely[i]) {
5738         // Cycle count isn't in a register, temporarily load it then write it out
5739         emit_loadreg(CCREG,HOST_CCREG);
5740         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5741         int jaddr=(int)out;
5742         emit_jns(0);
5743         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5744         emit_storereg(CCREG,HOST_CCREG);
5745       }
5746       else{
5747         cc=get_reg(i_regmap,CCREG);
5748         assert(cc==HOST_CCREG);
5749         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5750         int jaddr=(int)out;
5751         emit_jns(0);
5752         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5753       }
5754     }
5755   }
5756 }
5757
5758 void fjump_assemble(int i,struct regstat *i_regs)
5759 {
5760   signed char *i_regmap=i_regs->regmap;
5761   int cc;
5762   int match;
5763   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5764   assem_debug("fmatch=%d\n",match);
5765   int fs,cs;
5766   int eaddr;
5767   int ooo=1;
5768   int invert=0;
5769   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5770   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5771   if(likely[i]) ooo=0;
5772   if(!match) invert=1;
5773   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5774   if(i>(ba[i]-start)>>2) invert=1;
5775   #endif
5776
5777   if(ooo)
5778     if(itype[i+1]==FCOMP)
5779   {
5780     // Write-after-read dependency prevents out of order execution
5781     // First test branch condition, then execute delay slot, then branch
5782     ooo=0;
5783   }
5784
5785   if(ooo) {
5786     fs=get_reg(branch_regs[i].regmap,FSREG);
5787     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5788   }
5789   else {
5790     fs=get_reg(i_regmap,FSREG);
5791   }
5792
5793   // Check cop1 unusable
5794   if(!cop1_usable) {
5795     cs=get_reg(i_regmap,CSREG);
5796     assert(cs>=0);
5797     emit_testimm(cs,0x20000000);
5798     eaddr=(int)out;
5799     emit_jeq(0);
5800     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5801     cop1_usable=1;
5802   }
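  /* The 0x20000000 mask tested above is the CU1 bit of the COP0 Status
   * register; if coprocessor 1 is disabled, the FP_STUB path raises the
   * coprocessor-unusable exception instead of executing the branch. */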
5803
5804   if(ooo) {
5805     // Out of order execution (delay slot first)
5806     //printf("OOOE\n");
5807     ds_assemble(i+1,i_regs);
5808     int adj;
5809     uint64_t bc_unneeded=branch_regs[i].u;
5810     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5811     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5812     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5813     bc_unneeded|=1;
5814     bc_unneeded_upper|=1;
5815     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5816                   bc_unneeded,bc_unneeded_upper);
5817     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5818     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5819     cc=get_reg(branch_regs[i].regmap,CCREG);
5820     assert(cc==HOST_CCREG);
5821     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5822     assem_debug("cycle count (adj)\n");
5823     if(1) {
5824       int nottaken=0;
5825       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5826       if(1) {
5827         assert(fs>=0);
5828         emit_testimm(fs,0x800000); // FCSR condition bit (bit 23)
5829         if(source[i]&0x10000) // BC1T
5830         {
5831           if(invert){
5832             nottaken=(int)out;
5833             emit_jeq(1);
5834           }else{
5835             add_to_linker((int)out,ba[i],internal);
5836             emit_jne(0);
5837           }
5838         }
5839         else // BC1F
5840         {
5841           if(invert){
5842             nottaken=(int)out;
5843             emit_jne(1);
5844           }else{
5845             add_to_linker((int)out,ba[i],internal);
5846             emit_jeq(0);
5847           }
5848         }
5849       } // if(!only32)
5850           
5851       if(invert) {
5852         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5853         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5854         else if(match) emit_addnop(13);
5855         #endif
5856         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5857         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5858         if(internal)
5859           assem_debug("branch: internal\n");
5860         else
5861           assem_debug("branch: external\n");
5862         if(internal&&is_ds[(ba[i]-start)>>2]) {
5863           ds_assemble_entry(i);
5864         }
5865         else {
5866           add_to_linker((int)out,ba[i],internal);
5867           emit_jmp(0);
5868         }
5869         set_jump_target(nottaken,(int)out);
5870       }
5871
5872       if(adj) {
5873         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5874       }
5875     } // (!unconditional)
5876   } // if(ooo)
5877   else
5878   {
5879     // In-order execution (branch first)
5880     //printf("IOE\n");
5881     int nottaken=0;
5882     if(1) {
5883       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5884       if(1) {
5885         assert(fs>=0);
5886         emit_testimm(fs,0x800000);
5887         if(source[i]&0x10000) // BC1T
5888         {
5889           nottaken=(int)out;
5890           emit_jeq(1);
5891         }
5892         else // BC1F
5893         {
5894           nottaken=(int)out;
5895           emit_jne(1);
5896         }
5897       }
5898     } // if(!unconditional)
5899     int adj;
5900     uint64_t ds_unneeded=branch_regs[i].u;
5901     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5902     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5903     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5904     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5905     ds_unneeded|=1;
5906     ds_unneeded_upper|=1;
5907     // branch taken
5908     //assem_debug("1:\n");
5909     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5910                   ds_unneeded,ds_unneeded_upper);
5911     // load regs
5912     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5913     address_generation(i+1,&branch_regs[i],0);
5914     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5915     ds_assemble(i+1,&branch_regs[i]);
5916     cc=get_reg(branch_regs[i].regmap,CCREG);
5917     if(cc==-1) {
5918       emit_loadreg(CCREG,cc=HOST_CCREG);
5919       // CHECK: Is the following instruction (fall thru) allocated ok?
5920     }
5921     assert(cc==HOST_CCREG);
5922     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5923     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5924     assem_debug("cycle count (adj)\n");
5925     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5926     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5927     if(internal)
5928       assem_debug("branch: internal\n");
5929     else
5930       assem_debug("branch: external\n");
5931     if(internal&&is_ds[(ba[i]-start)>>2]) {
5932       ds_assemble_entry(i);
5933     }
5934     else {
5935       add_to_linker((int)out,ba[i],internal);
5936       emit_jmp(0);
5937     }
5938
5939     // branch not taken
5940     if(1) { // <- FIXME (don't need this)
5941       set_jump_target(nottaken,(int)out);
5942       assem_debug("1:\n");
5943       if(!likely[i]) {
5944         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5945                       ds_unneeded,ds_unneeded_upper);
5946         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5947         address_generation(i+1,&branch_regs[i],0);
5948         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5949         ds_assemble(i+1,&branch_regs[i]);
5950       }
5951       cc=get_reg(branch_regs[i].regmap,CCREG);
5952       if(cc==-1&&!likely[i]) {
5953         // Cycle count isn't in a register, temporarily load it then write it out
5954         emit_loadreg(CCREG,HOST_CCREG);
5955         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5956         int jaddr=(int)out;
5957         emit_jns(0);
5958         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5959         emit_storereg(CCREG,HOST_CCREG);
5960       }
5961       else{
5962         cc=get_reg(i_regmap,CCREG);
5963         assert(cc==HOST_CCREG);
5964         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5965         int jaddr=(int)out;
5966         emit_jns(0);
5967         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5968       }
5969     }
5970   }
5971 }
5972
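/* Assemble a branch whose delay slot falls on the next virtual page.
 * The block ends at the branch itself; the computed branch target is left
 * in HOST_BTREG and the delay slot is compiled separately by pagespan_ds()
 * below, which then dispatches to either the branch target or the
 * fall-through address. */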
5973 static void pagespan_assemble(int i,struct regstat *i_regs)
5974 {
5975   int s1l=get_reg(i_regs->regmap,rs1[i]);
5976   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5977   int s2l=get_reg(i_regs->regmap,rs2[i]);
5978   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5979   void *nt_branch=NULL;
5980   int taken=0;
5981   int nottaken=0;
5982   int unconditional=0;
5983   if(rs1[i]==0)
5984   {
5985     s1l=s2l;s1h=s2h;
5986     s2l=s2h=-1;
5987   }
5988   else if(rs2[i]==0)
5989   {
5990     s2l=s2h=-1;
5991   }
5992   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5993     s1h=s2h=-1;
5994   }
5995   int hr=0;
5996   int addr,alt,ntaddr;
5997   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5998   else {
5999     while(hr<HOST_REGS)
6000     {
6001       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6002          (i_regs->regmap[hr]&63)!=rs1[i] &&
6003          (i_regs->regmap[hr]&63)!=rs2[i] )
6004       {
6005         addr=hr++;break;
6006       }
6007       hr++;
6008     }
6009   }
6010   while(hr<HOST_REGS)
6011   {
6012     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6013        (i_regs->regmap[hr]&63)!=rs1[i] &&
6014        (i_regs->regmap[hr]&63)!=rs2[i] )
6015     {
6016       alt=hr++;break;
6017     }
6018     hr++;
6019   }
6020   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6021   {
6022     while(hr<HOST_REGS)
6023     {
6024       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6025          (i_regs->regmap[hr]&63)!=rs1[i] &&
6026          (i_regs->regmap[hr]&63)!=rs2[i] )
6027       {
6028         ntaddr=hr;break;
6029       }
6030       hr++;
6031     }
6032   }
6033   assert(hr<HOST_REGS);
6034   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6035     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6036   }
6037   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6038   if(opcode[i]==2) // J
6039   {
6040     unconditional=1;
6041   }
6042   if(opcode[i]==3) // JAL
6043   {
6044     // TODO: mini_ht
6045     int rt=get_reg(i_regs->regmap,31);
6046     emit_movimm(start+i*4+8,rt);
6047     unconditional=1;
6048   }
6049   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6050   {
6051     emit_mov(s1l,addr);
6052     if(opcode2[i]==9) // JALR
6053     {
6054       int rt=get_reg(i_regs->regmap,31);
6055       emit_movimm(start+i*4+8,rt);
6056     }
6057   }
6058   if((opcode[i]&0x3f)==4) // BEQ
6059   {
6060     if(rs1[i]==rs2[i])
6061     {
6062       unconditional=1;
6063     }
6064     else
6065     #ifdef HAVE_CMOV_IMM
6066     if(s1h<0) {
6067       if(s2l>=0) emit_cmp(s1l,s2l);
6068       else emit_test(s1l,s1l);
6069       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6070     }
6071     else
6072     #endif
6073     {
6074       assert(s1l>=0);
6075       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6076       if(s1h>=0) {
6077         if(s2h>=0) emit_cmp(s1h,s2h);
6078         else emit_test(s1h,s1h);
6079         emit_cmovne_reg(alt,addr);
6080       }
6081       if(s2l>=0) emit_cmp(s1l,s2l);
6082       else emit_test(s1l,s1l);
6083       emit_cmovne_reg(alt,addr);
6084     }
6085   }
6086   if((opcode[i]&0x3f)==5) // BNE
6087   {
6088     #ifdef HAVE_CMOV_IMM
6089     if(s1h<0) {
6090       if(s2l>=0) emit_cmp(s1l,s2l);
6091       else emit_test(s1l,s1l);
6092       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6093     }
6094     else
6095     #endif
6096     {
6097       assert(s1l>=0);
6098       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6099       if(s1h>=0) {
6100         if(s2h>=0) emit_cmp(s1h,s2h);
6101         else emit_test(s1h,s1h);
6102         emit_cmovne_reg(alt,addr);
6103       }
6104       if(s2l>=0) emit_cmp(s1l,s2l);
6105       else emit_test(s1l,s1l);
6106       emit_cmovne_reg(alt,addr);
6107     }
6108   }
6109   if((opcode[i]&0x3f)==0x14) // BEQL
6110   {
6111     if(s1h>=0) {
6112       if(s2h>=0) emit_cmp(s1h,s2h);
6113       else emit_test(s1h,s1h);
6114       nottaken=(int)out;
6115       emit_jne(0);
6116     }
6117     if(s2l>=0) emit_cmp(s1l,s2l);
6118     else emit_test(s1l,s1l);
6119     if(nottaken) set_jump_target(nottaken,(int)out);
6120     nottaken=(int)out;
6121     emit_jne(0);
6122   }
6123   if((opcode[i]&0x3f)==0x15) // BNEL
6124   {
6125     if(s1h>=0) {
6126       if(s2h>=0) emit_cmp(s1h,s2h);
6127       else emit_test(s1h,s1h);
6128       taken=(int)out;
6129       emit_jne(0);
6130     }
6131     if(s2l>=0) emit_cmp(s1l,s2l);
6132     else emit_test(s1l,s1l);
6133     nottaken=(int)out;
6134     emit_jeq(0);
6135     if(taken) set_jump_target(taken,(int)out);
6136   }
6137   if((opcode[i]&0x3f)==6) // BLEZ
6138   {
6139     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6140     emit_cmpimm(s1l,1);
6141     if(s1h>=0) emit_mov(addr,ntaddr);
6142     emit_cmovl_reg(alt,addr);
6143     if(s1h>=0) {
6144       emit_test(s1h,s1h);
6145       emit_cmovne_reg(ntaddr,addr);
6146       emit_cmovs_reg(alt,addr);
6147     }
6148   }
6149   if((opcode[i]&0x3f)==7) // BGTZ
6150   {
6151     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6152     emit_cmpimm(s1l,1);
6153     if(s1h>=0) emit_mov(addr,alt);
6154     emit_cmovl_reg(ntaddr,addr);
6155     if(s1h>=0) {
6156       emit_test(s1h,s1h);
6157       emit_cmovne_reg(alt,addr);
6158       emit_cmovs_reg(ntaddr,addr);
6159     }
6160   }
6161   if((opcode[i]&0x3f)==0x16) // BLEZL
6162   {
6163     assert((opcode[i]&0x3f)!=0x16);
6164   }
6165   if((opcode[i]&0x3f)==0x17) // BGTZL
6166   {
6167     assert((opcode[i]&0x3f)!=0x17);
6168   }
6169   assert(opcode[i]!=1); // BLTZ/BGEZ
6170
6171   //FIXME: Check CSREG
6172   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6173     if((source[i]&0x30000)==0) // BC1F
6174     {
6175       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6176       emit_testimm(s1l,0x800000);
6177       emit_cmovne_reg(alt,addr);
6178     }
6179     if((source[i]&0x30000)==0x10000) // BC1T
6180     {
6181       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6182       emit_testimm(s1l,0x800000);
6183       emit_cmovne_reg(alt,addr);
6184     }
6185     if((source[i]&0x30000)==0x20000) // BC1FL
6186     {
6187       emit_testimm(s1l,0x800000);
6188       nottaken=(int)out;
6189       emit_jne(0);
6190     }
6191     if((source[i]&0x30000)==0x30000) // BC1TL
6192     {
6193       emit_testimm(s1l,0x800000);
6194       nottaken=(int)out;
6195       emit_jeq(0);
6196     }
6197   }
6198
6199   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6200   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6201   if(likely[i]||unconditional)
6202   {
6203     emit_movimm(ba[i],HOST_BTREG);
6204   }
6205   else if(addr!=HOST_BTREG)
6206   {
6207     emit_mov(addr,HOST_BTREG);
6208   }
6209   void *branch_addr=out;
6210   emit_jmp(0);
6211   int target_addr=start+i*4+5; // delay-slot address (start+i*4+4) with the low bit set to select the delay-slot entry (cf. pagespan_ds)
6212   void *stub=out;
6213   void *compiled_target_addr=check_addr(target_addr);
6214   emit_extjump_ds((int)branch_addr,target_addr);
6215   if(compiled_target_addr) {
6216     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6217     add_link(target_addr,stub);
6218   }
6219   else set_jump_target((int)branch_addr,(int)stub);
6220   if(likely[i]) {
6221     // Not-taken path
6222     set_jump_target((int)nottaken,(int)out);
6223     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6224     void *branch_addr=out;
6225     emit_jmp(0);
6226     int target_addr=start+i*4+8;
6227     void *stub=out;
6228     void *compiled_target_addr=check_addr(target_addr);
6229     emit_extjump_ds((int)branch_addr,target_addr);
6230     if(compiled_target_addr) {
6231       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6232       add_link(target_addr,stub);
6233     }
6234     else set_jump_target((int)branch_addr,(int)stub);
6235   }
6236 }
6237
6238 // Assemble the delay slot for the above
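// The branch target chosen by pagespan_assemble() arrives in HOST_BTREG (or
// spilled to branch_target); after the delay slot is executed it is compared
// against the fall-through address start+4, and control either continues in
// this block or jumps out via jump_vaddr_reg.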
6239 static void pagespan_ds()
6240 {
6241   assem_debug("initial delay slot:\n");
6242   u_int vaddr=start+1;
6243   u_int page=get_page(vaddr);
6244   u_int vpage=get_vpage(vaddr);
6245   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6246   do_dirty_stub_ds();
6247   ll_add(jump_in+page,vaddr,(void *)out);
6248   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6249   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6250     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6251   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6252     emit_writeword(HOST_BTREG,(int)&branch_target);
6253   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6254   address_generation(0,&regs[0],regs[0].regmap_entry);
6255   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39)
6256     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6257   cop1_usable=0;
6258   is_delayslot=0;
6259   switch(itype[0]) {
6260     case ALU:
6261       alu_assemble(0,&regs[0]);break;
6262     case IMM16:
6263       imm16_assemble(0,&regs[0]);break;
6264     case SHIFT:
6265       shift_assemble(0,&regs[0]);break;
6266     case SHIFTIMM:
6267       shiftimm_assemble(0,&regs[0]);break;
6268     case LOAD:
6269       load_assemble(0,&regs[0]);break;
6270     case LOADLR:
6271       loadlr_assemble(0,&regs[0]);break;
6272     case STORE:
6273       store_assemble(0,&regs[0]);break;
6274     case STORELR:
6275       storelr_assemble(0,&regs[0]);break;
6276     case COP0:
6277       cop0_assemble(0,&regs[0]);break;
6278     case COP1:
6279       cop1_assemble(0,&regs[0]);break;
6280     case C1LS:
6281       c1ls_assemble(0,&regs[0]);break;
6282     case FCONV:
6283       fconv_assemble(0,&regs[0]);break;
6284     case FLOAT:
6285       float_assemble(0,&regs[0]);break;
6286     case FCOMP:
6287       fcomp_assemble(0,&regs[0]);break;
6288     case MULTDIV:
6289       multdiv_assemble(0,&regs[0]);break;
6290     case MOV:
6291       mov_assemble(0,&regs[0]);break;
6292     case SYSCALL:
6293     case SPAN:
6294     case UJUMP:
6295     case RJUMP:
6296     case CJUMP:
6297     case SJUMP:
6298     case FJUMP:
6299       printf("Jump in the delay slot.  This is probably a bug.\n");
6300   }
6301   int btaddr=get_reg(regs[0].regmap,BTREG);
6302   if(btaddr<0) {
6303     btaddr=get_reg(regs[0].regmap,-1);
6304     emit_readword((int)&branch_target,btaddr);
6305   }
6306   assert(btaddr!=HOST_CCREG);
6307   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6308 #ifdef HOST_IMM8
6309   emit_movimm(start+4,HOST_TEMPREG);
6310   emit_cmp(btaddr,HOST_TEMPREG);
6311 #else
6312   emit_cmpimm(btaddr,start+4);
6313 #endif
6314   int branch=(int)out;
6315   emit_jeq(0);
6316   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6317   emit_jmp(jump_vaddr_reg[btaddr]);
6318   set_jump_target(branch,(int)out);
6319   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6320   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6321 }
6322
6323 // Basic liveness analysis for MIPS registers
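// The per-instruction transfer function applied below is, in essence:
//   u |=  (1LL<<rt1[i]) | (1LL<<rt2[i]);    // written registers become unneeded above
//   u &= ~((1LL<<rs1[i]) | (1LL<<rs2[i]));  // read registers become needed above
//   u |= 1;                                 // r0 is never needed
// walking backwards from the end of the block, with branches merging in the
// liveness of their target and of the delay slot.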
6324 void unneeded_registers(int istart,int iend,int r)
6325 {
6326   int i;
6327   uint64_t u,uu,b,bu;
6328   uint64_t temp_u,temp_uu;
6329   uint64_t tdep;
6330   if(iend==slen-1) {
6331     u=1;uu=1;
6332   }else{
6333     u=unneeded_reg[iend+1];
6334     uu=unneeded_reg_upper[iend+1];
6335     u=1;uu=1; // conservative override: treat all registers as needed past iend
6336   }
6337   for (i=iend;i>=istart;i--)
6338   {
6339     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6340     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6341     {
6342       // If subroutine call, flag return address as a possible branch target
6343       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6344       
6345       if(ba[i]<start || ba[i]>=(start+slen*4))
6346       {
6347         // Branch out of this block, flush all regs
6348         u=1;
6349         uu=1;
6350         /* Hexagon hack 
6351         if(itype[i]==UJUMP&&rt1[i]==31)
6352         {
6353           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6354         }
6355         if(itype[i]==RJUMP&&rs1[i]==31)
6356         {
6357           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6358         }
6359         if(start>0x80000400&&start<0x80800000) {
6360           if(itype[i]==UJUMP&&rt1[i]==31)
6361           {
6362             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6363             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6364           }
6365           if(itype[i]==RJUMP&&rs1[i]==31)
6366           {
6367             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6368             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6369           }
6370         }*/
6371         branch_unneeded_reg[i]=u;
6372         branch_unneeded_reg_upper[i]=uu;
6373         // Merge in delay slot
6374         tdep=(~uu>>rt1[i+1])&1;
6375         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6376         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6377         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6378         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6379         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6380         u|=1;uu|=1;
6381         // If branch is "likely" (and conditional)
6382         // then we skip the delay slot on the fall-thru path
6383         if(likely[i]) {
6384           if(i<slen-1) {
6385             u&=unneeded_reg[i+2];
6386             uu&=unneeded_reg_upper[i+2];
6387           }
6388           else
6389           {
6390             u=1;
6391             uu=1;
6392           }
6393         }
6394       }
6395       else
6396       {
6397         // Internal branch, flag target
6398         bt[(ba[i]-start)>>2]=1;
6399         if(ba[i]<=start+i*4) {
6400           // Backward branch
6401           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6402           {
6403             // Unconditional branch
6404             temp_u=1;temp_uu=1;
6405           } else {
6406             // Conditional branch (not taken case)
6407             temp_u=unneeded_reg[i+2];
6408             temp_uu=unneeded_reg_upper[i+2];
6409           }
6410           // Merge in delay slot
6411           tdep=(~temp_uu>>rt1[i+1])&1;
6412           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6413           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6414           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6415           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6416           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6417           temp_u|=1;temp_uu|=1;
6418           // If branch is "likely" (and conditional)
6419           // then we skip the delay slot on the fall-thru path
6420           if(likely[i]) {
6421             if(i<slen-1) {
6422               temp_u&=unneeded_reg[i+2];
6423               temp_uu&=unneeded_reg_upper[i+2];
6424             }
6425             else
6426             {
6427               temp_u=1;
6428               temp_uu=1;
6429             }
6430           }
6431           tdep=(~temp_uu>>rt1[i])&1;
6432           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6433           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6434           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6435           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6436           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6437           temp_u|=1;temp_uu|=1;
6438           unneeded_reg[i]=temp_u;
6439           unneeded_reg_upper[i]=temp_uu;
6440           // Only go three levels deep.  This recursion can take an
6441           // excessive amount of time if there are a lot of nested loops.
6442           if(r<2) {
6443             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6444           }else{
6445             unneeded_reg[(ba[i]-start)>>2]=1;
6446             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6447           }
6448         } /*else*/ if(1) {
6449           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6450           {
6451             // Unconditional branch
6452             u=unneeded_reg[(ba[i]-start)>>2];
6453             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6454             branch_unneeded_reg[i]=u;
6455             branch_unneeded_reg_upper[i]=uu;
6456         //u=1;
6457         //uu=1;
6458         //branch_unneeded_reg[i]=u;
6459         //branch_unneeded_reg_upper[i]=uu;
6460             // Merge in delay slot
6461             tdep=(~uu>>rt1[i+1])&1;
6462             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6463             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6464             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6465             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6466             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6467             u|=1;uu|=1;
6468           } else {
6469             // Conditional branch
6470             b=unneeded_reg[(ba[i]-start)>>2];
6471             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6472             branch_unneeded_reg[i]=b;
6473             branch_unneeded_reg_upper[i]=bu;
6474         //b=1;
6475         //bu=1;
6476         //branch_unneeded_reg[i]=b;
6477         //branch_unneeded_reg_upper[i]=bu;
6478             // Branch delay slot
6479             tdep=(~uu>>rt1[i+1])&1;
6480             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6481             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6482             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6483             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6484             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6485             b|=1;bu|=1;
6486             // If branch is "likely" then we skip the
6487             // delay slot on the fall-thru path
6488             if(likely[i]) {
6489               u=b;
6490               uu=bu;
6491               if(i<slen-1) {
6492                 u&=unneeded_reg[i+2];
6493                 uu&=unneeded_reg_upper[i+2];
6494         //u=1;
6495         //uu=1;
6496               }
6497             } else {
6498               u&=b;
6499               uu&=bu;
6500         //u=1;
6501         //uu=1;
6502             }
6503             if(i<slen-1) {
6504               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6505               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6506         //branch_unneeded_reg[i]=1;
6507         //branch_unneeded_reg_upper[i]=1;
6508             } else {
6509               branch_unneeded_reg[i]=1;
6510               branch_unneeded_reg_upper[i]=1;
6511             }
6512           }
6513         }
6514       }
6515     }
6516     else if(itype[i]==SYSCALL)
6517     {
6518       // SYSCALL instruction (software interrupt)
6519       u=1;
6520       uu=1;
6521     }
6522     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6523     {
6524       // ERET instruction (return from interrupt)
6525       u=1;
6526       uu=1;
6527     }
6528     //u=uu=1; // DEBUG
6529     tdep=(~uu>>rt1[i])&1;
6530     // Written registers are unneeded
6531     u|=1LL<<rt1[i];
6532     u|=1LL<<rt2[i];
6533     uu|=1LL<<rt1[i];
6534     uu|=1LL<<rt2[i];
6535     // Accessed registers are needed
6536     u&=~(1LL<<rs1[i]);
6537     u&=~(1LL<<rs2[i]);
6538     uu&=~(1LL<<us1[i]);
6539     uu&=~(1LL<<us2[i]);
6540     // Source-target dependencies
6541     uu&=~(tdep<<dep1[i]);
6542     uu&=~(tdep<<dep2[i]);
6543     // R0 is always unneeded
6544     u|=1;uu|=1;
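    /* Worked example (illustrative): for "addu $t0,$a0,$a1" with everything
     * unneeded afterwards, the steps above set bit 8 ($t0, written here) and
     * clear bits 4 and 5 ($a0/$a1, read here), so the preceding instruction
     * sees $a0/$a1 as needed and $t0 as dead. */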
6545     // Save it
6546     unneeded_reg[i]=u;
6547     unneeded_reg_upper[i]=uu;
6548 #ifdef FORCE32
6549     unneeded_reg_upper[i]=-1LL;
6550 #endif
6551     /*
6552     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6553     printf("U:");
6554     int r;
6555     for(r=1;r<=CCREG;r++) {
6556       if((unneeded_reg[i]>>r)&1) {
6557         if(r==HIREG) printf(" HI");
6558         else if(r==LOREG) printf(" LO");
6559         else printf(" r%d",r);
6560       }
6561     }
6562     printf(" UU:");
6563     for(r=1;r<=CCREG;r++) {
6564       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6565         if(r==HIREG) printf(" HI");
6566         else if(r==LOREG) printf(" LO");
6567         else printf(" r%d",r);
6568       }
6569     }
6570     printf("\n");*/
6571   }
6572 }
6573
6574 // Identify registers which are likely to contain 32-bit values
6575 // This is used to predict whether any branches will jump to a
6576 // location with 64-bit values in registers.
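// Convention: is32 is a bitmap over guest registers; bit r set means
// register r is expected (provisionally) to hold a sign-extended 32-bit
// value at this point.  Bit 0 (r0) is always set, and the per-instruction
// estimate is saved in p32[].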
6577 static void provisional_32bit()
6578 {
6579   int i,j;
6580   uint64_t is32=1;
6581   uint64_t lastbranch=1;
6582   
6583   for(i=0;i<slen;i++)
6584   {
6585     if(i>0) {
6586       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6587         if(i>1) is32=lastbranch;
6588         else is32=1;
6589       }
6590     }
6591     if(i>1)
6592     {
6593       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6594         if(likely[i-2]) {
6595           if(i>2) is32=lastbranch;
6596           else is32=1;
6597         }
6598       }
6599       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6600       {
6601         if(rs1[i-2]==0||rs2[i-2]==0)
6602         {
6603           if(rs1[i-2]) {
6604             is32|=1LL<<rs1[i-2];
6605           }
6606           if(rs2[i-2]) {
6607             is32|=1LL<<rs2[i-2];
6608           }
6609         }
6610       }
6611     }
6612     // If something jumps here with 64-bit values
6613     // then promote those registers to 64 bits
6614     if(bt[i])
6615     {
6616       uint64_t temp_is32=is32;
6617       for(j=i-1;j>=0;j--)
6618       {
6619         if(ba[j]==start+i*4) 
6620           //temp_is32&=branch_regs[j].is32;
6621           temp_is32&=p32[j];
6622       }
6623       for(j=i;j<slen;j++)
6624       {
6625         if(ba[j]==start+i*4) 
6626           temp_is32=1;
6627       }
6628       is32=temp_is32;
6629     }
6630     int type=itype[i];
6631     int op=opcode[i];
6632     int op2=opcode2[i];
6633     int rt=rt1[i];
6634     int s1=rs1[i];
6635     int s2=rs2[i];
6636     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6637       // Branches don't write registers, consider the delay slot instead.
6638       type=itype[i+1];
6639       op=opcode[i+1];
6640       op2=opcode2[i+1];
6641       rt=rt1[i+1];
6642       s1=rs1[i+1];
6643       s2=rs2[i+1];
6644       lastbranch=is32;
6645     }
6646     switch(type) {
6647       case LOAD:
6648         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6649            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6650           is32&=~(1LL<<rt);
6651         else
6652           is32|=1LL<<rt;
6653         break;
6654       case STORE:
6655       case STORELR:
6656         break;
6657       case LOADLR:
6658         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6659         if(op==0x22) is32|=1LL<<rt; // LWL
6660         break;
6661       case IMM16:
6662         if (op==0x08||op==0x09|| // ADDI/ADDIU
6663             op==0x0a||op==0x0b|| // SLTI/SLTIU
6664             op==0x0c|| // ANDI
6665             op==0x0f)  // LUI
6666         {
6667           is32|=1LL<<rt;
6668         }
6669         if(op==0x18||op==0x19) { // DADDI/DADDIU
6670           is32&=~(1LL<<rt);
6671           //if(imm[i]==0)
6672           //  is32|=((is32>>s1)&1LL)<<rt;
6673         }
6674         if(op==0x0d||op==0x0e) { // ORI/XORI
6675           uint64_t sr=((is32>>s1)&1LL);
6676           is32&=~(1LL<<rt);
6677           is32|=sr<<rt;
6678         }
6679         break;
6680       case UJUMP:
6681         break;
6682       case RJUMP:
6683         break;
6684       case CJUMP:
6685         break;
6686       case SJUMP:
6687         break;
6688       case FJUMP:
6689         break;
6690       case ALU:
6691         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6692           is32|=1LL<<rt;
6693         }
6694         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6695           is32|=1LL<<rt;
6696         }
6697         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6698           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6699           is32&=~(1LL<<rt);
6700           is32|=sr<<rt;
6701         }
6702         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6703           if(s1==0&&s2==0) {
6704             is32|=1LL<<rt;
6705           }
6706           else if(s2==0) {
6707             uint64_t sr=((is32>>s1)&1LL);
6708             is32&=~(1LL<<rt);
6709             is32|=sr<<rt;
6710           }
6711           else if(s1==0) {
6712             uint64_t sr=((is32>>s2)&1LL);
6713             is32&=~(1LL<<rt);
6714             is32|=sr<<rt;
6715           }
6716           else {
6717             is32&=~(1LL<<rt);
6718           }
6719         }
6720         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6721           if(s1==0&&s2==0) {
6722             is32|=1LL<<rt;
6723           }
6724           else if(s2==0) {
6725             uint64_t sr=((is32>>s1)&1LL);
6726             is32&=~(1LL<<rt);
6727             is32|=sr<<rt;
6728           }
6729           else {
6730             is32&=~(1LL<<rt);
6731           }
6732         }
6733         break;
6734       case MULTDIV:
6735         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6736           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6737         }
6738         else {
6739           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6740         }
6741         break;
6742       case MOV:
6743         {
6744           uint64_t sr=((is32>>s1)&1LL);
6745           is32&=~(1LL<<rt);
6746           is32|=sr<<rt;
6747         }
6748         break;
6749       case SHIFT:
6750         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6751         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6752         break;
6753       case SHIFTIMM:
6754         is32|=1LL<<rt;
6755         // DSLL/DSRL/DSRA/DSLL32/DSRL32 give a 64-bit result; DSRA32 does not (its result fits in 32 bits, sign-extended)
6756         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6757         break;
6758       case COP0:
6759         if(op2==0) is32|=1LL<<rt; // MFC0
6760         break;
6761       case COP1:
6762         if(op2==0) is32|=1LL<<rt; // MFC1
6763         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6764         if(op2==2) is32|=1LL<<rt; // CFC1
6765         break;
6766       case C1LS:
6767         break;
6768       case FLOAT:
6769       case FCONV:
6770         break;
6771       case FCOMP:
6772         break;
6773       case SYSCALL:
6774         break;
6775       default:
6776         break;
6777     }
6778     is32|=1;
6779     p32[i]=is32;
6780
6781     if(i>0)
6782     {
6783       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6784       {
6785         if(rt1[i-1]==31) // JAL/JALR
6786         {
6787           // Subroutine call will return here, don't alloc any registers
6788           is32=1;
6789         }
6790         else if(i+1<slen)
6791         {
6792           // Internal branch will jump here, match registers to caller
6793           is32=0x3FFFFFFFFLL;
6794         }
6795       }
6796     }
6797   }
6798 }
6799
6800 // Identify registers which may be assumed to contain 32-bit values
6801 // and where optimizations will rely on this.
6802 // This is used to determine whether backward branches can safely
6803 // jump to a location with 64-bit values in registers.
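// Here r32/pr32[] is a bitmap of registers that code at or after
// instruction i relies on being 32-bit (sign-extended); a branch into i
// therefore has to provide those registers in 32-bit form.  The scan runs
// backwards so pr32[] of later instructions is already known.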
6804 static void provisional_r32()
6805 {
6806   u_int r32=0;
6807   int i;
6808   
6809   for (i=slen-1;i>=0;i--)
6810   {
6811     int hr;
6812     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6813     {
6814       if(ba[i]<start || ba[i]>=(start+slen*4))
6815       {
6816         // Branch out of this block, don't need anything
6817         r32=0;
6818       }
6819       else
6820       {
6821         // Internal branch
6822         // Need whatever matches the target
6823         // (and doesn't get overwritten by the delay slot instruction)
6824         r32=0;
6825         int t=(ba[i]-start)>>2;
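        // Because of the backward scan, a forward target already has
        // pr32[t] computed, while a backward target does not; for the
        // latter we approximate with the registers that were 32-bit on
        // entry to t, minus those whose upper halves are unneeded there.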
6826         if(ba[i]>start+i*4) {
6827           // Forward branch
6828           //if(!(requires_32bit[t]&~regs[i].was32))
6829           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6830           if(!(pr32[t]&~regs[i].was32))
6831             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6832         }else{
6833           // Backward branch
6834           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
6835             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
6836         }
6837       }
6838       // Conditional branch may need registers for following instructions
6839       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6840       {
6841         if(i<slen-2) {
6842           //r32|=requires_32bit[i+2];
6843           r32|=pr32[i+2];
6844           r32&=regs[i].was32;
6845           // Mark this address as a branch target since it may be called
6846           // upon return from interrupt
6847           //bt[i+2]=1;
6848         }
6849       }
6850       // Merge in delay slot
6851       if(!likely[i]) {
6852         // These are overwritten unless the branch is "likely"
6853         // and the delay slot is nullified if not taken
6854         r32&=~(1LL<<rt1[i+1]);
6855         r32&=~(1LL<<rt2[i+1]);
6856       }
6857       // Assume these are needed (delay slot)
6858       if(us1[i+1]>0)
6859       {
6860         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
6861       }
6862       if(us2[i+1]>0)
6863       {
6864         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
6865       }
6866       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
6867       {
6868         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
6869       }
6870       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
6871       {
6872         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
6873       }
6874     }
6875     else if(itype[i]==SYSCALL)
6876     {
6877       // SYSCALL instruction (software interrupt)
6878       r32=0;
6879     }
6880     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6881     {
6882       // ERET instruction (return from interrupt)
6883       r32=0;
6884     }
6885     // Update 32-bit requirements for this instruction: overwritten registers drop out, 64-bit sources assumed to be 32-bit are added
6886     r32&=~(1LL<<rt1[i]);
6887     r32&=~(1LL<<rt2[i]);
6888     if(us1[i]>0)
6889     {
6890       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
6891     }
6892     if(us2[i]>0)
6893     {
6894       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
6895     }
6896     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
6897     {
6898       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
6899     }
6900     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
6901     {
6902       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
6903     }
6904     //requires_32bit[i]=r32;
6905     pr32[i]=r32;
6906     
6907     // Dirty registers which are 32-bit require 32-bit input,
6908     // as they will be written back as 32-bit values
6909     for(hr=0;hr<HOST_REGS;hr++)
6910     {
6911       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
6912         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
6913           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
6914               pr32[i]|=1LL<<regs[i].regmap_entry[hr];
6915           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
6916         }
6917       }
6918     }
6919   }
6920 }
6921
6922 // Write back dirty registers as soon as we will no longer modify them,
6923 // so that we don't end up with lots of writes at the branches.
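// Note: will_dirty/wont_dirty here are bitmaps indexed by host register
// (1<<hr for host register hr), unlike the guest-register bitmaps used by
// the passes above.  The walk runs backwards from iend so state from later
// instructions propagates upward; with wr==0 the pass only analyzes, with
// wr!=0 it also updates the dirty bits in regs[] and branch_regs[].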
6924 void clean_registers(int istart,int iend,int wr)
6925 {
6926   int i;
6927   int r;
6928   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6929   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6930   if(iend==slen-1) {
6931     will_dirty_i=will_dirty_next=0;
6932     wont_dirty_i=wont_dirty_next=0;
6933   }else{
6934     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6935     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6936   }
6937   for (i=iend;i>=istart;i--)
6938   {
6939     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6940     {
6941       if(ba[i]<start || ba[i]>=(start+slen*4))
6942       {
6943         // Branch out of this block, flush all regs
6944         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6945         {
6946           // Unconditional branch
6947           will_dirty_i=0;
6948           wont_dirty_i=0;
6949           // Merge in delay slot (will dirty)
6950           for(r=0;r<HOST_REGS;r++) {
6951             if(r!=EXCLUDE_REG) {
6952               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6953               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6954               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6955               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6956               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6957               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6958               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6959               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6960               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6961               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6962               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6963               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6964               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6965               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6966             }
6967           }
6968         }
6969         else
6970         {
6971           // Conditional branch
6972           will_dirty_i=0;
6973           wont_dirty_i=wont_dirty_next;
6974           // Merge in delay slot (will dirty)
6975           for(r=0;r<HOST_REGS;r++) {
6976             if(r!=EXCLUDE_REG) {
6977               if(!likely[i]) {
6978                 // Might not dirty if likely branch is not taken
6979                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6980                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6981                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6982                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6983                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6984                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6985                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6986                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6987                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6988                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6989                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6990                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6991                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6992                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6993               }
6994             }
6995           }
6996         }
6997         // Merge in delay slot (wont dirty)
6998         for(r=0;r<HOST_REGS;r++) {
6999           if(r!=EXCLUDE_REG) {
7000             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7001             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7002             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7003             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7004             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7005             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7006             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7007             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7008             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7009             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7010           }
7011         }
7012         if(wr) {
7013           #ifndef DESTRUCTIVE_WRITEBACK
7014           branch_regs[i].dirty&=wont_dirty_i;
7015           #endif
7016           branch_regs[i].dirty|=will_dirty_i;
7017         }
7018       }
7019       else
7020       {
7021         // Internal branch
7022         if(ba[i]<=start+i*4) {
7023           // Backward branch
7024           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7025           {
7026             // Unconditional branch
7027             temp_will_dirty=0;
7028             temp_wont_dirty=0;
7029             // Merge in delay slot (will dirty)
7030             for(r=0;r<HOST_REGS;r++) {
7031               if(r!=EXCLUDE_REG) {
7032                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7033                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7034                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7035                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7036                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7037                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7038                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7039                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7040                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7041                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7042                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7043                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7044                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7045                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7046               }
7047             }
7048           } else {
7049             // Conditional branch (not taken case)
7050             temp_will_dirty=will_dirty_next;
7051             temp_wont_dirty=wont_dirty_next;
7052             // Merge in delay slot (will dirty)
7053             for(r=0;r<HOST_REGS;r++) {
7054               if(r!=EXCLUDE_REG) {
7055                 if(!likely[i]) {
7056                   // Will not dirty if likely branch is not taken
7057                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7058                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7059                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7060                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7061                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7062                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7063                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7064                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7065                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7066                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7067                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7068                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7069                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7070                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7071                 }
7072               }
7073             }
7074           }
7075           // Merge in delay slot (wont dirty)
7076           for(r=0;r<HOST_REGS;r++) {
7077             if(r!=EXCLUDE_REG) {
7078               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7079               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7080               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7081               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7082               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7083               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7084               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7085               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7086               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7087               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7088             }
7089           }
7090           // Deal with changed mappings
7091           if(i<iend) {
7092             for(r=0;r<HOST_REGS;r++) {
7093               if(r!=EXCLUDE_REG) {
7094                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7095                   temp_will_dirty&=~(1<<r);
7096                   temp_wont_dirty&=~(1<<r);
7097                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7098                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7099                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7100                   } else {
7101                     temp_will_dirty|=1<<r;
7102                     temp_wont_dirty|=1<<r;
7103                   }
7104                 }
7105               }
7106             }
7107           }
7108           if(wr) {
7109             will_dirty[i]=temp_will_dirty;
7110             wont_dirty[i]=temp_wont_dirty;
7111             clean_registers((ba[i]-start)>>2,i-1,0);
7112           }else{
7113             // Limit recursion.  It can take an excessive amount
7114             // of time if there are a lot of nested loops.
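            // Instead of recursing into the target, fall back to defaults:
            // will_dirty cleared, every bit of wont_dirty set (-1).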
7115             will_dirty[(ba[i]-start)>>2]=0;
7116             wont_dirty[(ba[i]-start)>>2]=-1;
7117           }
7118         }
7119         /*else*/ if(1)
7120         {
7121           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7122           {
7123             // Unconditional branch
7124             will_dirty_i=0;
7125             wont_dirty_i=0;
7126           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7127             for(r=0;r<HOST_REGS;r++) {
7128               if(r!=EXCLUDE_REG) {
7129                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7130                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7131                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7132                 }
7133               }
7134             }
7135           //}
7136             // Merge in delay slot
7137             for(r=0;r<HOST_REGS;r++) {
7138               if(r!=EXCLUDE_REG) {
7139                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7140                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7141                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7142                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7143                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7144                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7145                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7146                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7147                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7148                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7149                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7150                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7151                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7152                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7153               }
7154             }
7155           } else {
7156             // Conditional branch
7157             will_dirty_i=will_dirty_next;
7158             wont_dirty_i=wont_dirty_next;
7159           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7160             for(r=0;r<HOST_REGS;r++) {
7161               if(r!=EXCLUDE_REG) {
7162                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7163                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7164                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7165                 }
7166                 else
7167                 {
7168                   will_dirty_i&=~(1<<r);
7169                 }
7170                 // Treat delay slot as part of branch too
7171                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7172                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7173                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7174                 }
7175                 else
7176                 {
7177                   will_dirty[i+1]&=~(1<<r);
7178                 }*/
7179               }
7180             }
7181           //}
7182             // Merge in delay slot
7183             for(r=0;r<HOST_REGS;r++) {
7184               if(r!=EXCLUDE_REG) {
7185                 if(!likely[i]) {
7186                   // Might not dirty if likely branch is not taken
7187                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7188                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7189                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7190                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7191                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7192                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7193                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7194                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7195                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7196                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7197                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7198                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7199                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7200                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7201                 }
7202               }
7203             }
7204           }
7205           // Merge in delay slot
7206           for(r=0;r<HOST_REGS;r++) {
7207             if(r!=EXCLUDE_REG) {
7208               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7209               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7210               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7211               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7212               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7213               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7214               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7215               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7216               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7217               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7218             }
7219           }
7220           if(wr) {
7221             #ifndef DESTRUCTIVE_WRITEBACK
7222             branch_regs[i].dirty&=wont_dirty_i;
7223             #endif
7224             branch_regs[i].dirty|=will_dirty_i;
7225           }
7226         }
7227       }
7228     }
7229     else if(itype[i]==SYSCALL)
7230     {
7231       // SYSCALL instruction (software interrupt)
7232       will_dirty_i=0;
7233       wont_dirty_i=0;
7234     }
7235     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7236     {
7237       // ERET instruction (return from interrupt)
7238       will_dirty_i=0;
7239       wont_dirty_i=0;
7240     }
7241     will_dirty_next=will_dirty_i;
7242     wont_dirty_next=wont_dirty_i;
7243     for(r=0;r<HOST_REGS;r++) {
7244       if(r!=EXCLUDE_REG) {
7245         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7246         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7247         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7248         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7249         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7250         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7251         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7252         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7253         if(i>istart) {
7254           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7255           {
7256             // Don't store a register immediately after writing it,
7257             // may prevent dual-issue.
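            // (i.e. on a dual-issue host, storing a value in the very next
            //  instruction after computing it would presumably prevent the
            //  two from pairing, so the writeback is deferred instead)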
7258             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7259             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7260           }
7261         }
7262       }
7263     }
7264     // Save it
7265     will_dirty[i]=will_dirty_i;
7266     wont_dirty[i]=wont_dirty_i;
7267     // Mark registers that won't be dirtied as not dirty
7268     if(wr) {
7269       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7270       for(r=0;r<HOST_REGS;r++) {
7271         if((will_dirty_i>>r)&1) {
7272           printf(" r%d",r);
7273         }
7274       }
7275       printf("\n");*/
7276
7277       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7278         regs[i].dirty|=will_dirty_i;
7279         #ifndef DESTRUCTIVE_WRITEBACK
7280         regs[i].dirty&=wont_dirty_i;
7281         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7282         {
7283           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7284             for(r=0;r<HOST_REGS;r++) {
7285               if(r!=EXCLUDE_REG) {
7286                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7287                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7288                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7289               }
7290             }
7291           }
7292         }
7293         else
7294         {
7295           if(i<iend) {
7296             for(r=0;r<HOST_REGS;r++) {
7297               if(r!=EXCLUDE_REG) {
7298                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7299                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7300                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7301               }
7302             }
7303           }
7304         }
7305         #endif
7306       //}
7307     }
7308     // Deal with changed mappings
7309     temp_will_dirty=will_dirty_i;
7310     temp_wont_dirty=wont_dirty_i;
7311     for(r=0;r<HOST_REGS;r++) {
7312       if(r!=EXCLUDE_REG) {
7313         int nr;
7314         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7315           if(wr) {
7316             #ifndef DESTRUCTIVE_WRITEBACK
7317             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7318             #endif
7319             regs[i].wasdirty|=will_dirty_i&(1<<r);
7320           }
7321         }
7322         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7323           // Register moved to a different register
7324           will_dirty_i&=~(1<<r);
7325           wont_dirty_i&=~(1<<r);
7326           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7327           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7328           if(wr) {
7329             #ifndef DESTRUCTIVE_WRITEBACK
7330             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7331             #endif
7332             regs[i].wasdirty|=will_dirty_i&(1<<r);
7333           }
7334         }
7335         else {
7336           will_dirty_i&=~(1<<r);
7337           wont_dirty_i&=~(1<<r);
7338           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7339             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7340             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7341           } else {
7342             wont_dirty_i|=1<<r;
7343                     /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty_i>>r)&1));*/
7344           }
7345         }
7346       }
7347     }
7348   }
7349 }
7350
7351   /* disassembly */
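  /* Each instruction is printed as "  <pc>: <mnemonic> <operands>"; the
     leading '*' marks instructions that are known branch targets (bt[]). */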
7352 void disassemble_inst(int i)
7353 {
7354     if (bt[i]) printf("*"); else printf(" ");
7355     switch(itype[i]) {
7356       case UJUMP:
7357         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7358       case CJUMP:
7359         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7360       case SJUMP:
7361         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7362       case FJUMP:
7363         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7364       case RJUMP:
7365         printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
7366       case SPAN:
7367         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7368       case IMM16:
7369         if(opcode[i]==0xf) //LUI
7370           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7371         else
7372           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7373         break;
7374       case LOAD:
7375       case LOADLR:
7376         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7377         break;
7378       case STORE:
7379       case STORELR:
7380         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7381         break;
7382       case ALU:
7383       case SHIFT:
7384         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7385         break;
7386       case MULTDIV:
7387         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7388         break;
7389       case SHIFTIMM:
7390         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7391         break;
7392       case MOV:
7393         if((opcode2[i]&0x1d)==0x10)
7394           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7395         else if((opcode2[i]&0x1d)==0x11)
7396           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7397         else
7398           printf (" %x: %s\n",start+i*4,insn[i]);
7399         break;
7400       case COP0:
7401         if(opcode2[i]==0)
7402           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7403         else if(opcode2[i]==4)
7404           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7405         else printf (" %x: %s\n",start+i*4,insn[i]);
7406         break;
7407       case COP1:
7408         if(opcode2[i]<3)
7409           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7410         else if(opcode2[i]>3)
7411           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7412         else printf (" %x: %s\n",start+i*4,insn[i]);
7413         break;
7414       case C1LS:
7415         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7416         break;
7417       default:
7418         //printf (" %s %8x\n",insn[i],source[i]);
7419         printf (" %x: %s\n",start+i*4,insn[i]);
7420     }
7421 }
7422
7423 void new_dynarec_init()
7424 {
7425   printf("Init new dynarec\n");
7426   out=(u_char *)BASE_ADDR;
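  // Map the translation cache at the fixed address BASE_ADDR with
  // read/write/execute permission so generated code can run in place.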
7427   if (mmap (out, 1<<TARGET_SIZE_2,
7428             PROT_READ | PROT_WRITE | PROT_EXEC,
7429             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7430             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7431 #ifdef MUPEN64
7432   rdword=&readmem_dword;
7433   fake_pc.f.r.rs=&readmem_dword;
7434   fake_pc.f.r.rt=&readmem_dword;
7435   fake_pc.f.r.rd=&readmem_dword;
7436 #endif
7437   int n;
7438   for(n=0x80000;n<0x80800;n++)
7439     invalid_code[n]=1;
7440   for(n=0;n<65536;n++)
7441     hash_table[n][0]=hash_table[n][2]=-1;
7442   memset(mini_ht,-1,sizeof(mini_ht));
7443   memset(restore_candidate,0,sizeof(restore_candidate));
7444   copy=shadow;
7445   expirep=16384; // Expiry pointer, +2 blocks
7446   pending_exception=0;
7447   literalcount=0;
7448 #ifdef HOST_IMM8
7449   // Copy this into local area so we don't have to put it in every literal pool
7450   invc_ptr=invalid_code;
7451 #endif
7452   stop_after_jal=0;
7453   // TLB
7454   using_tlb=0;
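  // memory_map[] holds one entry per 4KB guest page; the entry shifted
  // left by 2 is added to the guest address to form the host address,
  // and -1 marks an unmapped page.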
7455   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7456     memory_map[n]=-1;
7457   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7458     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7459   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7460     memory_map[n]=-1;
7461 #ifdef MUPEN64
7462   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7463     writemem[n] = write_nomem_new;
7464     writememb[n] = write_nomemb_new;
7465     writememh[n] = write_nomemh_new;
7466 #ifndef FORCE32
7467     writememd[n] = write_nomemd_new;
7468 #endif
7469     readmem[n] = read_nomem_new;
7470     readmemb[n] = read_nomemb_new;
7471     readmemh[n] = read_nomemh_new;
7472 #ifndef FORCE32
7473     readmemd[n] = read_nomemd_new;
7474 #endif
7475   }
7476   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7477     writemem[n] = write_rdram_new;
7478     writememb[n] = write_rdramb_new;
7479     writememh[n] = write_rdramh_new;
7480 #ifndef FORCE32
7481     writememd[n] = write_rdramd_new;
7482 #endif
7483   }
7484   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7485     writemem[n] = write_nomem_new;
7486     writememb[n] = write_nomemb_new;
7487     writememh[n] = write_nomemh_new;
7488 #ifndef FORCE32
7489     writememd[n] = write_nomemd_new;
7490 #endif
7491     readmem[n] = read_nomem_new;
7492     readmemb[n] = read_nomemb_new;
7493     readmemh[n] = read_nomemh_new;
7494 #ifndef FORCE32
7495     readmemd[n] = read_nomemd_new;
7496 #endif
7497   }
7498 #endif
7499   tlb_hacks();
7500   arch_init();
7501 }
7502
7503 void new_dynarec_cleanup()
7504 {
7505   int n;
7506   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7507   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7508   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7509   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7510   #ifdef ROM_COPY
7511   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7512   #endif
7513 }
7514
7515 int new_recompile_block(int addr)
7516 {
7517 /*
7518   if(addr==0x800cd050) {
7519     int block;
7520     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7521     int n;
7522     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7523   }
7524 */
7525   //if(Count==365117028) tracedebug=1;
7526   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7527   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7528   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7529   //if(debug) 
7530   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7531   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7532   /*if(Count>=312978186) {
7533     rlist();
7534   }*/
7535   //rlist();
7536   start = (u_int)addr&~3;
7537   //assert(((u_int)addr&1)==0);
7538 #ifdef MUPEN64
7539   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7540     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7541     pagelimit = 0xa4001000;
7542   }
7543   else
7544 #endif
7545   if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) {
7546     source = (u_int *)((u_int)rdram+start-0x80000000);
7547     pagelimit = 0x80800000;
7548   }
7549 #ifndef DISABLE_TLB
7550   else if ((signed int)addr >= (signed int)0xC0000000) {
7551     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7552     //if(tlb_LUT_r[start>>12])
7553       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7554     if((signed int)memory_map[start>>12]>=0) {
7555       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7556       pagelimit=(start+4096)&0xFFFFF000;
7557       int map=memory_map[start>>12];
7558       int i;
7559       for(i=0;i<5;i++) {
7560         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7561         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7562       }
7563       assem_debug("pagelimit=%x\n",pagelimit);
7564       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7565     }
7566     else {
7567       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7568       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7569       return 1; // Caller will invoke exception handler
7570     }
7571     //printf("source= %x\n",(int)source);
7572   }
7573 #endif
7574   else {
7575     printf("Compile at bogus memory address: %x \n", (int)addr);
7576     exit(1);
7577   }
7578
7579   /* Pass 1: disassemble */
7580   /* Pass 2: register dependencies, branch targets */
7581   /* Pass 3: register allocation */
7582   /* Pass 4: branch dependencies */
7583   /* Pass 5: pre-alloc */
7584   /* Pass 6: optimize clean/dirty state */
7585   /* Pass 7: flag 32-bit registers */
7586   /* Pass 8: assembly */
7587   /* Pass 9: linker */
7588   /* Pass 10: garbage collection / free memory */
7589
7590   int i,j;
7591   int done=0;
7592   unsigned int type,op,op2;
7593
7594   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7595   
7596   /* Pass 1 disassembly */
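  // For each word: the primary opcode is bits 31..26; op2 holds the
  // secondary field for that opcode class (function field for SPECIAL,
  // rt field for REGIMM, format/rs field for COP0/COP1).  insn[] gets the
  // mnemonic and itype[] the instruction class used by the later passes.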
7597
7598   for(i=0;!done;i++) {
7599     bt[i]=0;likely[i]=0;op2=0;
7600     opcode[i]=op=source[i]>>26;
7601     switch(op)
7602     {
7603       case 0x00: strcpy(insn[i],"special"); type=NI;
7604         op2=source[i]&0x3f;
7605         switch(op2)
7606         {
7607           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7608           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7609           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7610           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7611           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7612           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7613           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7614           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7615           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7616           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7617           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7618           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7619           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7620           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7621           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7622           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7623           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7624           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7625           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7626           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7627           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7628           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7629           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7630           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7631           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7632           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7633           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7634           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7635           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7636           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7637           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7638           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7639           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7640           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7641           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7642           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7643           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7644           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7645           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7646           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7647           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7648           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7649           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7650           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7651           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7652           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7653           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7654           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7655           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7656           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7657           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7658           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7659         }
7660         break;
7661       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7662         op2=(source[i]>>16)&0x1f;
7663         switch(op2)
7664         {
7665           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7666           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7667           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7668           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7669           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7670           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7671           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7672           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7673           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7674           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7675           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7676           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7677           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7678           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7679         }
7680         break;
7681       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7682       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7683       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7684       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7685       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7686       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7687       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7688       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7689       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7690       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7691       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7692       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7693       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7694       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7695       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7696         op2=(source[i]>>21)&0x1f;
7697         switch(op2)
7698         {
7699           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7700           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7701           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7702           switch(source[i]&0x3f)
7703           {
7704             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7705             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7706             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7707             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7708             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7709           }
7710         }
7711         break;
7712       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7713         op2=(source[i]>>21)&0x1f;
7714         switch(op2)
7715         {
7716           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7717           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7718           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7719           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7720           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7721           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7722           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7723           switch((source[i]>>16)&0x3)
7724           {
7725             case 0x00: strcpy(insn[i],"BC1F"); break;
7726             case 0x01: strcpy(insn[i],"BC1T"); break;
7727             case 0x02: strcpy(insn[i],"BC1FL"); break;
7728             case 0x03: strcpy(insn[i],"BC1TL"); break;
7729           }
7730           break;
7731           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7732           switch(source[i]&0x3f)
7733           {
7734             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7735             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7736             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7737             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7738             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7739             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7740             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7741             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7742             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7743             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7744             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7745             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7746             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7747             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7748             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7749             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7750             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7751             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7752             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7753             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7754             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7755             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7756             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7757             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7758             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7759             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7760             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7761             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7762             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7763             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7764             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7765             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7766             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7767             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7768             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7769           }
7770           break;
7771           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7772           switch(source[i]&0x3f)
7773           {
7774             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7775             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7776             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7777             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7778             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7779             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7780             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7781             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7782             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7783             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7784             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7785             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7786             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7787             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7788             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7789             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7790             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7791             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7792             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7793             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7794             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7795             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7796             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7797             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7798             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7799             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7800             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7801             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7802             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7803             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7804             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7805             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7806             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7807             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7808             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7809           }
7810           break;
7811           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7812           switch(source[i]&0x3f)
7813           {
7814             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7815             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7816           }
7817           break;
7818           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7819           switch(source[i]&0x3f)
7820           {
7821             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7822             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7823           }
7824           break;
7825         }
7826         break;
7827       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7828       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7829       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7830       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7831       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7832       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7833       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7834       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7835       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7836       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7837       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7838       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7839       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7840       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7841       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7842       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7843       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7844       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7845       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7846       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7847       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7848       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7849       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7850       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7851       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7852       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7853       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7854       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7855       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7856       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7857       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7858       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7859       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7860       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7861       default: strcpy(insn[i],"???"); type=NI;
7862         assem_debug("NI %08x @%08x\n", source[i], addr + i*4);
7863         break;
7864     }
7865     itype[i]=type;
7866     opcode2[i]=op2;
7867     /* Get registers/immediates */
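         /*
          * Quick reference for the field extraction below, assuming the
          * standard MIPS encoding (which is what the shifts and masks in
          * this switch rely on):
          *   bits 31..26  opcode
          *   bits 25..21  rs     -> (source[i]>>21)&0x1f
          *   bits 20..16  rt     -> (source[i]>>16)&0x1f
          *   bits 15..11  rd     -> (source[i]>>11)&0x1f
          *   bits 10..6   sa     -> (source[i]>>6)&0x1f
          *   bits 5..0    funct  -> source[i]&0x3f
          *   bits 15..0   imm    -> (short)source[i] (sign-extended)
          * Example: 0x8c430004 (LW v1,4(v0)) gives rs=2, rt=3, imm=4.
          */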
7868     lt1[i]=0;
7869     us1[i]=0;
7870     us2[i]=0;
7871     dep1[i]=0;
7872     dep2[i]=0;
7873     switch(type) {
7874       case LOAD:
7875         rs1[i]=(source[i]>>21)&0x1f;
7876         rs2[i]=0;
7877         rt1[i]=(source[i]>>16)&0x1f;
7878         rt2[i]=0;
7879         imm[i]=(short)source[i];
7880         break;
7881       case STORE:
7882       case STORELR:
7883         rs1[i]=(source[i]>>21)&0x1f;
7884         rs2[i]=(source[i]>>16)&0x1f;
7885         rt1[i]=0;
7886         rt2[i]=0;
7887         imm[i]=(short)source[i];
7888         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7889         break;
7890       case LOADLR:
7891         // LWL/LWR only load part of the register,
7892         // therefore the target register must be treated as a source too
7893         rs1[i]=(source[i]>>21)&0x1f;
7894         rs2[i]=(source[i]>>16)&0x1f;
7895         rt1[i]=(source[i]>>16)&0x1f;
7896         rt2[i]=0;
7897         imm[i]=(short)source[i];
7898         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7899         if(op==0x26) dep1[i]=rt1[i]; // LWR
7900         break;
7901       case IMM16:
7902         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7903         else rs1[i]=(source[i]>>21)&0x1f;
7904         rs2[i]=0;
7905         rt1[i]=(source[i]>>16)&0x1f;
7906         rt2[i]=0;
7907         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7908           imm[i]=(unsigned short)source[i];
7909         }else{
7910           imm[i]=(short)source[i];
7911         }
7912         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7913         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7914         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
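             /*
              * Note on imm[i] above: ANDI/ORI/XORI take the immediate as a
              * 16-bit unsigned value, everything else sign-extends it.  For
              * example ORI r2,r0,0x8000 ends up with imm=0x00008000, while
              * ADDIU r2,r0,0x8000 ends up with imm=-32768 (0xffff8000), as
              * the MIPS architecture requires.
              */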
7915         break;
7916       case UJUMP:
7917         rs1[i]=0;
7918         rs2[i]=0;
7919         rt1[i]=0;
7920         rt2[i]=0;
7921         // The JAL instruction writes to r31.
7922         if (op&1) {
7923           rt1[i]=31;
7924         }
7925         rs2[i]=CCREG;
7926         break;
7927       case RJUMP:
7928         rs1[i]=(source[i]>>21)&0x1f;
7929         rs2[i]=0;
7930         rt1[i]=0;
7931         rt2[i]=0;
7932         // The JALR instruction writes to r31.
7933         if (op2&1) {
7934           rt1[i]=31;   
7935         }
7936         rs2[i]=CCREG;
7937         break;
7938       case CJUMP:
7939         rs1[i]=(source[i]>>21)&0x1f;
7940         rs2[i]=(source[i]>>16)&0x1f;
7941         rt1[i]=0;
7942         rt2[i]=0;
7943         if(op&2) { // BGTZ/BLEZ
7944           rs2[i]=0;
7945         }
7946         us1[i]=rs1[i];
7947         us2[i]=rs2[i];
7948         likely[i]=op>>4;
7949         break;
7950       case SJUMP:
7951         rs1[i]=(source[i]>>21)&0x1f;
7952         rs2[i]=CCREG;
7953         rt1[i]=0;
7954         rt2[i]=0;
7955         us1[i]=rs1[i];
7956         if(op2&0x10) { // BxxAL
7957           rt1[i]=31;
7958           // NOTE: If the branch is not taken, r31 is still overwritten
7959         }
7960         likely[i]=(op2&2)>>1;
7961         break;
7962       case FJUMP:
7963         rs1[i]=FSREG;
7964         rs2[i]=CSREG;
7965         rt1[i]=0;
7966         rt2[i]=0;
7967         likely[i]=((source[i])>>17)&1;
7968         break;
7969       case ALU:
7970         rs1[i]=(source[i]>>21)&0x1f; // source
7971         rs2[i]=(source[i]>>16)&0x1f; // second source operand
7972         rt1[i]=(source[i]>>11)&0x1f; // destination
7973         rt2[i]=0;
7974         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7975           us1[i]=rs1[i];us2[i]=rs2[i];
7976         }
7977         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7978           dep1[i]=rs1[i];dep2[i]=rs2[i];
7979         }
7980         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7981           dep1[i]=rs1[i];dep2[i]=rs2[i];
7982         }
7983         break;
7984       case MULTDIV:
7985         rs1[i]=(source[i]>>21)&0x1f; // source
7986         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
7987         rt1[i]=HIREG;
7988         rt2[i]=LOREG;
7989         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7990           us1[i]=rs1[i];us2[i]=rs2[i];
7991         }
7992         break;
7993       case MOV:
7994         rs1[i]=0;
7995         rs2[i]=0;
7996         rt1[i]=0;
7997         rt2[i]=0;
7998         if(op2==0x10) rs1[i]=HIREG; // MFHI
7999         if(op2==0x11) rt1[i]=HIREG; // MTHI
8000         if(op2==0x12) rs1[i]=LOREG; // MFLO
8001         if(op2==0x13) rt1[i]=LOREG; // MTLO
8002         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8003         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8004         dep1[i]=rs1[i];
8005         break;
8006       case SHIFT:
8007         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted
8008         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8009         rt1[i]=(source[i]>>11)&0x1f; // destination
8010         rt2[i]=0;
8011         // DSLLV/DSRLV/DSRAV are 64-bit
8012         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8013         break;
8014       case SHIFTIMM:
8015         rs1[i]=(source[i]>>16)&0x1f;
8016         rs2[i]=0;
8017         rt1[i]=(source[i]>>11)&0x1f;
8018         rt2[i]=0;
8019         imm[i]=(source[i]>>6)&0x1f;
8020         // DSxx32 instructions
8021         if(op2>=0x3c) imm[i]|=0x20;
8022         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8023         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8024         break;
8025       case COP0:
8026         rs1[i]=0;
8027         rs2[i]=0;
8028         rt1[i]=0;
8029         rt2[i]=0;
8030         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8031         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8032         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8033         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8034         break;
8035       case COP1:
8036         rs1[i]=0;
8037         rs2[i]=0;
8038         rt1[i]=0;
8039         rt2[i]=0;
8040         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8041         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8042         if(op2==5) us1[i]=rs1[i]; // DMTC1
8043         rs2[i]=CSREG;
8044         break;
8045       case C1LS:
8046         rs1[i]=(source[i]>>21)&0x1F;
8047         rs2[i]=CSREG;
8048         rt1[i]=0;
8049         rt2[i]=0;
8050         imm[i]=(short)source[i];
8051         break;
8052       case FLOAT:
8053       case FCONV:
8054         rs1[i]=0;
8055         rs2[i]=CSREG;
8056         rt1[i]=0;
8057         rt2[i]=0;
8058         break;
8059       case FCOMP:
8060         rs1[i]=FSREG;
8061         rs2[i]=CSREG;
8062         rt1[i]=FSREG;
8063         rt2[i]=0;
8064         break;
8065       case SYSCALL:
8066         rs1[i]=CCREG;
8067         rs2[i]=0;
8068         rt1[i]=0;
8069         rt2[i]=0;
8070         break;
8071       default:
8072         rs1[i]=0;
8073         rs2[i]=0;
8074         rt1[i]=0;
8075         rt2[i]=0;
8076     }
8077     /* Calculate branch target addresses */
8078     if(type==UJUMP)
8079       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8080     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8081       ba[i]=start+i*4+8; // Ignore never taken branch
8082     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8083       ba[i]=start+i*4+8; // Ignore never taken branch
8084     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8085       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8086     else ba[i]=-1;
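         /*
          * Worked examples for the target math above:
          * - J/JAL: <<6 then >>4 isolates the 26-bit instr_index and leaves it
          *   multiplied by 4 in the low 28 bits; the top 4 bits come from the
          *   delay slot address.  A J with instr_index 0x10000 in a block in
          *   KSEG0 (0x8xxxxxxx) therefore targets 0x80040000.
          * - Conditional branches: <<16 then arithmetic >>14 sign-extends the
          *   16-bit offset and scales it by 4, relative to the delay slot
          *   address, so an offset of -1 branches back to the branch itself.
          */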
8087     /* Is this the end of the block? */
8088     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8089       if(rt1[i-1]!=31) { // Continue past subroutine call (JAL)
8090         done=1;
8091         // Does the block continue due to a branch?
8092         for(j=i-1;j>=0;j--)
8093         {
8094           if(ba[j]==start+i*4+4) done=j=0;
8095           if(ba[j]==start+i*4+8) done=j=0;
8096         }
8097       }
8098       else {
8099         if(stop_after_jal) done=1;
8100         // Stop on BREAK
8101         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8102       }
8103       // Don't recompile stuff that's already compiled
8104       if(check_addr(start+i*4+4)) done=1;
8105       // Don't get too close to the limit
8106       if(i>MAXBLOCK/2) done=1;
8107     }
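         /*
          * In short: the block ends after an unconditional J/JR or after the
          * "branch always" pattern (source>>16)==0x1000, unless the jump was a
          * call (rt1==31, execution returns here) or an earlier branch in this
          * block targets the code just past it, in which case decoding keeps
          * going.  It also stops early if the following code is already
          * compiled or the block is getting close to MAXBLOCK.
          */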
8108     if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
8109     assert(i<MAXBLOCK-1);
8110     if(start+i*4==pagelimit-4) done=1;
8111     assert(start+i*4<pagelimit);
8112     if (i==MAXBLOCK-1) done=1;
8113     // Stop if we're compiling junk
8114     if(itype[i]==NI&&opcode[i]==0x11) {
8115       done=stop_after_jal=1;
8116       printf("Disabled speculative precompilation\n");
8117     }
8118   }
8119   slen=i;
8120   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8121     if(start+i*4==pagelimit) {
8122       itype[i-1]=SPAN;
8123     }
8124   }
8125   assert(slen>0);
8126
8127   /* Pass 2 - Register dependencies and branch targets */
8128
8129   unneeded_registers(0,slen-1,0);
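       /*
        * unneeded_registers() fills unneeded_reg[]/unneeded_reg_upper[] (and
        * the branch_* variants) with, per instruction, a bitmask of guest
        * registers (and their upper 32-bit halves) whose value is never read
        * again before being overwritten.  Pass 3 consults these masks so that
        * dead values are neither kept allocated nor written back.
        */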
8130   
8131   /* Pass 3 - Register allocation */
8132
8133   struct regstat current; // Current register allocations/status
8134   current.is32=1;
8135   current.dirty=0;
8136   current.u=unneeded_reg[0];
8137   current.uu=unneeded_reg_upper[0];
8138   clear_all_regs(current.regmap);
8139   alloc_reg(&current,0,CCREG);
8140   dirty_reg(&current,CCREG);
8141   current.isconst=0;
8142   current.wasconst=0;
8143   int ds=0;
8144   int cc=0;
8145   int hr;
8146   
8147   provisional_32bit();
8148   
8149   if((u_int)addr&1) {
8150     // First instruction is delay slot
8151     cc=-1;
8152     bt[1]=1;
8153     ds=1;
8154     unneeded_reg[0]=1;
8155     unneeded_reg_upper[0]=1;
8156     current.regmap[HOST_BTREG]=BTREG;
8157   }
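       /*
        * Bit 0 of addr doubles as a flag: a block whose start address has it
        * set begins in a branch delay slot, so the instruction after the slot
        * is marked as a branch target (bt[1]) and HOST_BTREG is reserved for
        * BTREG, which presumably carries the target chosen by the branch that
        * owns the slot.
        */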
8158   
8159   for(i=0;i<slen;i++)
8160   {
8161     if(bt[i])
8162     {
8163       int hr;
8164       for(hr=0;hr<HOST_REGS;hr++)
8165       {
8166         // Is this really necessary?
8167         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8168       }
8169       current.isconst=0;
8170     }
8171     if(i>1)
8172     {
8173       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8174       {
8175         if(rs1[i-2]==0||rs2[i-2]==0)
8176         {
8177           if(rs1[i-2]) {
8178             current.is32|=1LL<<rs1[i-2];
8179             int hr=get_reg(current.regmap,rs1[i-2]|64);
8180             if(hr>=0) current.regmap[hr]=-1;
8181           }
8182           if(rs2[i-2]) {
8183             current.is32|=1LL<<rs2[i-2];
8184             int hr=get_reg(current.regmap,rs2[i-2]|64);
8185             if(hr>=0) current.regmap[hr]=-1;
8186           }
8187         }
8188       }
8189     }
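         /*
          * Rationale for the BNE/BNEL special case above: if the branch two
          * instructions back compared a register against $zero and we are on
          * its fall-through path, that register held zero, which is a proper
          * 32-bit (sign-extended) value, so it can be narrowed here.  Should a
          * branch from elsewhere land on this instruction with a 64-bit value,
          * the bt[] merge just below widens it again.
          */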
8190     // If something jumps here with 64-bit values
8191     // then promote those registers to 64 bits
8192     if(bt[i])
8193     {
8194       uint64_t temp_is32=current.is32;
8195       for(j=i-1;j>=0;j--)
8196       {
8197         if(ba[j]==start+i*4) 
8198           temp_is32&=branch_regs[j].is32;
8199       }
8200       for(j=i;j<slen;j++)
8201       {
8202         if(ba[j]==start+i*4) 
8203           //temp_is32=1;
8204           temp_is32&=p32[j];
8205       }
8206       if(temp_is32!=current.is32) {
8207         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8208         #ifdef DESTRUCTIVE_WRITEBACK
8209         for(hr=0;hr<HOST_REGS;hr++)
8210         {
8211           int r=current.regmap[hr];
8212           if(r>0&&r<64)
8213           {
8214             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8215               temp_is32|=1LL<<r;
8216               //printf("restore %d\n",r);
8217             }
8218           }
8219         }
8220         #endif
8221         current.is32=temp_is32;
8222       }
8223     }
8224 #ifdef FORCE32
8225     memset(p32, 0xff, sizeof(p32));
8226     current.is32=-1LL;
8227 #endif
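         /*
          * For the 32-bit PSX CPU (the FORCE32 build) every guest register is
          * permanently treated as 32-bit, so the narrow/widen bookkeeping
          * above and below effectively becomes a no-op.
          */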
8228
8229     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8230     regs[i].wasconst=current.isconst;
8231     regs[i].was32=current.is32;
8232     regs[i].wasdirty=current.dirty;
8233     #ifdef DESTRUCTIVE_WRITEBACK
8234     // To change a dirty register from 32 to 64 bits, we must write
8235     // it out during the previous cycle (for branches, 2 cycles)
8236     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8237     {
8238       uint64_t temp_is32=current.is32;
8239       for(j=i-1;j>=0;j--)
8240       {
8241         if(ba[j]==start+i*4+4) 
8242           temp_is32&=branch_regs[j].is32;
8243       }
8244       for(j=i;j<slen;j++)
8245       {
8246         if(ba[j]==start+i*4+4) 
8247           //temp_is32=1;
8248           temp_is32&=p32[j];
8249       }
8250       if(temp_is32!=current.is32) {
8251         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8252         for(hr=0;hr<HOST_REGS;hr++)
8253         {
8254           int r=current.regmap[hr];
8255           if(r>0)
8256           {
8257             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8258               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8259               {
8260                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8261                 {
8262                   //printf("dump %d/r%d\n",hr,r);
8263                   current.regmap[hr]=-1;
8264                   if(get_reg(current.regmap,r|64)>=0) 
8265                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8266                 }
8267               }
8268             }
8269           }
8270         }
8271       }
8272     }
8273     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8274     {
8275       uint64_t temp_is32=current.is32;
8276       for(j=i-1;j>=0;j--)
8277       {
8278         if(ba[j]==start+i*4+8) 
8279           temp_is32&=branch_regs[j].is32;
8280       }
8281       for(j=i;j<slen;j++)
8282       {
8283         if(ba[j]==start+i*4+8) 
8284           //temp_is32=1;
8285           temp_is32&=p32[j];
8286       }
8287       if(temp_is32!=current.is32) {
8288         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8289         for(hr=0;hr<HOST_REGS;hr++)
8290         {
8291           int r=current.regmap[hr];
8292           if(r>0)
8293           {
8294             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8295               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8296               {
8297                 //printf("dump %d/r%d\n",hr,r);
8298                 current.regmap[hr]=-1;
8299                 if(get_reg(current.regmap,r|64)>=0) 
8300                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8301               }
8302             }
8303           }
8304         }
8305       }
8306     }
8307     #endif
8308     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8309       if(i+1<slen) {
8310         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8311         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8312         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8313         current.u|=1;
8314         current.uu|=1;
8315       } else {
8316         current.u=1;
8317         current.uu=1;
8318       }
8319     } else {
8320       if(i+1<slen) {
8321         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8322         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8323         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8324         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8325         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8326         current.u|=1;
8327         current.uu|=1;
8328       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8329     }
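         /*
          * Reading the masks above: current.u / current.uu hold one bit per
          * guest register (and per upper half) whose value may be discarded at
          * this point.  They start from the next instruction's unneeded set
          * and this instruction's own sources are then removed, e.g. if r8 is
          * dead after this instruction but read by it, the &~(1LL<<rs1[i])
          * term keeps r8 live.  Bit 0 is forced on because $zero never has to
          * be preserved, and if the destination's upper half is live, the
          * upper halves of the registers it is derived from (dep1/dep2) stay
          * live as well.
          */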
8330     is_ds[i]=ds;
8331     if(ds) {
8332       ds=0; // Skip delay slot, already allocated as part of branch
8333       // ...but we need to alloc it in case something jumps here
8334       if(i+1<slen) {
8335         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8336         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8337       }else{
8338         current.u=branch_unneeded_reg[i-1];
8339         current.uu=branch_unneeded_reg_upper[i-1];
8340       }
8341       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8342       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8343       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8344       current.u|=1;
8345       current.uu|=1;
8346       struct regstat temp;
8347       memcpy(&temp,&current,sizeof(current));
8348       temp.wasdirty=temp.dirty;
8349       temp.was32=temp.is32;
8350       // TODO: Take into account unconditional branches, as below
8351       delayslot_alloc(&temp,i);
8352       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8353       regs[i].wasdirty=temp.wasdirty;
8354       regs[i].was32=temp.was32;
8355       regs[i].dirty=temp.dirty;
8356       regs[i].is32=temp.is32;
8357       regs[i].isconst=0;
8358       regs[i].wasconst=0;
8359       current.isconst=0;
8360       // Create entry (branch target) regmap
8361       for(hr=0;hr<HOST_REGS;hr++)
8362       {
8363         int r=temp.regmap[hr];
8364         if(r>=0) {
8365           if(r!=regmap_pre[i][hr]) {
8366             regs[i].regmap_entry[hr]=-1;
8367           }
8368           else
8369           {
8370             if(r<64){
8371               if((current.u>>r)&1) {
8372                 regs[i].regmap_entry[hr]=-1;
8373                 regs[i].regmap[hr]=-1;
8374                 //Don't clear regs in the delay slot as the branch might need them
8375                 //current.regmap[hr]=-1;
8376               }else
8377                 regs[i].regmap_entry[hr]=r;
8378             }
8379             else {
8380               if((current.uu>>(r&63))&1) {
8381                 regs[i].regmap_entry[hr]=-1;
8382                 regs[i].regmap[hr]=-1;
8383                 //Don't clear regs in the delay slot as the branch might need them
8384                 //current.regmap[hr]=-1;
8385               }else
8386                 regs[i].regmap_entry[hr]=r;
8387             }
8388           }
8389         } else {
8390           // First instruction expects CCREG to be allocated
8391           if(i==0&&hr==HOST_CCREG) 
8392             regs[i].regmap_entry[hr]=CCREG;
8393           else
8394             regs[i].regmap_entry[hr]=-1;
8395         }
8396       }
8397     }
8398     else { // Not delay slot
8399       switch(itype[i]) {
8400         case UJUMP:
8401           //current.isconst=0; // DEBUG
8402           //current.wasconst=0; // DEBUG
8403           //regs[i].wasconst=0; // DEBUG
8404           clear_const(&current,rt1[i]);
8405           alloc_cc(&current,i);
8406           dirty_reg(&current,CCREG);
8407           if (rt1[i]==31) {
8408             alloc_reg(&current,i,31);
8409             dirty_reg(&current,31);
8410             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8411             #ifdef REG_PREFETCH
8412             alloc_reg(&current,i,PTEMP);
8413             #endif
8414             //current.is32|=1LL<<rt1[i];
8415           }
8416           delayslot_alloc(&current,i+1);
8417           //current.isconst=0; // DEBUG
8418           ds=1;
8419           //printf("i=%d, isconst=%x\n",i,current.isconst);
8420           break;
8421         case RJUMP:
8422           //current.isconst=0;
8423           //current.wasconst=0;
8424           //regs[i].wasconst=0;
8425           clear_const(&current,rs1[i]);
8426           clear_const(&current,rt1[i]);
8427           alloc_cc(&current,i);
8428           dirty_reg(&current,CCREG);
8429           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8430             alloc_reg(&current,i,rs1[i]);
8431             if (rt1[i]==31) {
8432               alloc_reg(&current,i,31);
8433               dirty_reg(&current,31);
8434               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8435               #ifdef REG_PREFETCH
8436               alloc_reg(&current,i,PTEMP);
8437               #endif
8438             }
8439             #ifdef USE_MINI_HT
8440             if(rs1[i]==31) { // JALR
8441               alloc_reg(&current,i,RHASH);
8442               #ifndef HOST_IMM_ADDR32
8443               alloc_reg(&current,i,RHTBL);
8444               #endif
8445             }
8446             #endif
8447             delayslot_alloc(&current,i+1);
8448           } else {
8449             // The delay slot overwrites our source register,
8450             // allocate a temporary register to hold the old value.
8451             current.isconst=0;
8452             current.wasconst=0;
8453             regs[i].wasconst=0;
8454             delayslot_alloc(&current,i+1);
8455             current.isconst=0;
8456             alloc_reg(&current,i,RTEMP);
8457           }
8458           //current.isconst=0; // DEBUG
8459           ds=1;
8460           break;
8461         case CJUMP:
8462           //current.isconst=0;
8463           //current.wasconst=0;
8464           //regs[i].wasconst=0;
8465           clear_const(&current,rs1[i]);
8466           clear_const(&current,rs2[i]);
8467           if((opcode[i]&0x3E)==4) // BEQ/BNE
8468           {
8469             alloc_cc(&current,i);
8470             dirty_reg(&current,CCREG);
8471             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8472             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8473             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8474             {
8475               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8476               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8477             }
8478             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8479                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8480               // The delay slot overwrites one of our conditions.
8481               // Allocate the branch condition registers instead.
8482               // Note that such a sequence of instructions could
8483               // be considered a bug since the branch cannot be
8484               // re-executed if an exception occurs.
8485               current.isconst=0;
8486               current.wasconst=0;
8487               regs[i].wasconst=0;
8488               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8489               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8490               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8491               {
8492                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8493                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8494               }
8495             }
8496             else delayslot_alloc(&current,i+1);
8497           }
8498           else
8499           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8500           {
8501             alloc_cc(&current,i);
8502             dirty_reg(&current,CCREG);
8503             alloc_reg(&current,i,rs1[i]);
8504             if(!(current.is32>>rs1[i]&1))
8505             {
8506               alloc_reg64(&current,i,rs1[i]);
8507             }
8508             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8509               // The delay slot overwrites one of our conditions.
8510               // Allocate the branch condition registers instead.
8511               // Note that such a sequence of instructions could
8512               // be considered a bug since the branch cannot be
8513               // re-executed if an exception occurs.
8514               current.isconst=0;
8515               current.wasconst=0;
8516               regs[i].wasconst=0;
8517               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8518               if(!((current.is32>>rs1[i])&1))
8519               {
8520                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8521               }
8522             }
8523             else delayslot_alloc(&current,i+1);
8524           }
8525           else
8526           // Don't alloc the delay slot yet because we might not execute it
8527           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8528           {
8529             current.isconst=0;
8530             current.wasconst=0;
8531             regs[i].wasconst=0;
8532             alloc_cc(&current,i);
8533             dirty_reg(&current,CCREG);
8534             alloc_reg(&current,i,rs1[i]);
8535             alloc_reg(&current,i,rs2[i]);
8536             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8537             {
8538               alloc_reg64(&current,i,rs1[i]);
8539               alloc_reg64(&current,i,rs2[i]);
8540             }
8541           }
8542           else
8543           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8544           {
8545             current.isconst=0;
8546             current.wasconst=0;
8547             regs[i].wasconst=0;
8548             alloc_cc(&current,i);
8549             dirty_reg(&current,CCREG);
8550             alloc_reg(&current,i,rs1[i]);
8551             if(!(current.is32>>rs1[i]&1))
8552             {
8553               alloc_reg64(&current,i,rs1[i]);
8554             }
8555           }
8556           ds=1;
8557           //current.isconst=0;
8558           break;
8559         case SJUMP:
8560           //current.isconst=0;
8561           //current.wasconst=0;
8562           //regs[i].wasconst=0;
8563           clear_const(&current,rs1[i]);
8564           clear_const(&current,rt1[i]);
8565           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8566           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8567           {
8568             alloc_cc(&current,i);
8569             dirty_reg(&current,CCREG);
8570             alloc_reg(&current,i,rs1[i]);
8571             if(!(current.is32>>rs1[i]&1))
8572             {
8573               alloc_reg64(&current,i,rs1[i]);
8574             }
8575             if (rt1[i]==31) { // BLTZAL/BGEZAL
8576               alloc_reg(&current,i,31);
8577               dirty_reg(&current,31);
8578               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8579               //#ifdef REG_PREFETCH
8580               //alloc_reg(&current,i,PTEMP);
8581               //#endif
8582               //current.is32|=1LL<<rt1[i];
8583             }
8584             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8585               // The delay slot overwrites the branch condition.
8586               // Allocate the branch condition registers instead.
8587               // Note that such a sequence of instructions could
8588               // be considered a bug since the branch cannot be
8589               // re-executed if an exception occurs.
8590               current.isconst=0;
8591               current.wasconst=0;
8592               regs[i].wasconst=0;
8593               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8594               if(!((current.is32>>rs1[i])&1))
8595               {
8596                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8597               }
8598             }
8599             else delayslot_alloc(&current,i+1);
8600           }
8601           else
8602           // Don't alloc the delay slot yet because we might not execute it
8603           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8604           {
8605             current.isconst=0;
8606             current.wasconst=0;
8607             regs[i].wasconst=0;
8608             alloc_cc(&current,i);
8609             dirty_reg(&current,CCREG);
8610             alloc_reg(&current,i,rs1[i]);
8611             if(!(current.is32>>rs1[i]&1))
8612             {
8613               alloc_reg64(&current,i,rs1[i]);
8614             }
8615           }
8616           ds=1;
8617           //current.isconst=0;
8618           break;
8619         case FJUMP:
8620           current.isconst=0;
8621           current.wasconst=0;
8622           regs[i].wasconst=0;
8623           if(likely[i]==0) // BC1F/BC1T
8624           {
8625             // TODO: Theoretically we can run out of registers here on x86.
8626             // The delay slot can allocate up to six, and we need to check
8627             // CSREG before executing the delay slot.  Possibly we can drop
8628             // the cycle count and then reload it after checking that the
8629             // FPU is in a usable state, or don't do out-of-order execution.
8630             alloc_cc(&current,i);
8631             dirty_reg(&current,CCREG);
8632             alloc_reg(&current,i,FSREG);
8633             alloc_reg(&current,i,CSREG);
8634             if(itype[i+1]==FCOMP) {
8635               // The delay slot overwrites the branch condition.
8636               // Allocate the branch condition registers instead.
8637               // Note that such a sequence of instructions could
8638               // be considered a bug since the branch cannot be
8639               // re-executed if an exception occurs.
8640               alloc_cc(&current,i);
8641               dirty_reg(&current,CCREG);
8642               alloc_reg(&current,i,CSREG);
8643               alloc_reg(&current,i,FSREG);
8644             }
8645             else {
8646               delayslot_alloc(&current,i+1);
8647               alloc_reg(&current,i+1,CSREG);
8648             }
8649           }
8650           else
8651           // Don't alloc the delay slot yet because we might not execute it
8652           if(likely[i]) // BC1FL/BC1TL
8653           {
8654             alloc_cc(&current,i);
8655             dirty_reg(&current,CCREG);
8656             alloc_reg(&current,i,CSREG);
8657             alloc_reg(&current,i,FSREG);
8658           }
8659           ds=1;
8660           current.isconst=0;
8661           break;
8662         case IMM16:
8663           imm16_alloc(&current,i);
8664           break;
8665         case LOAD:
8666         case LOADLR:
8667           load_alloc(&current,i);
8668           break;
8669         case STORE:
8670         case STORELR:
8671           store_alloc(&current,i);
8672           break;
8673         case ALU:
8674           alu_alloc(&current,i);
8675           break;
8676         case SHIFT:
8677           shift_alloc(&current,i);
8678           break;
8679         case MULTDIV:
8680           multdiv_alloc(&current,i);
8681           break;
8682         case SHIFTIMM:
8683           shiftimm_alloc(&current,i);
8684           break;
8685         case MOV:
8686           mov_alloc(&current,i);
8687           break;
8688         case COP0:
8689           cop0_alloc(&current,i);
8690           break;
8691         case COP1:
8692           cop1_alloc(&current,i);
8693           break;
8694         case C1LS:
8695           c1ls_alloc(&current,i);
8696           break;
8697         case FCONV:
8698           fconv_alloc(&current,i);
8699           break;
8700         case FLOAT:
8701           float_alloc(&current,i);
8702           break;
8703         case FCOMP:
8704           fcomp_alloc(&current,i);
8705           break;
8706         case SYSCALL:
8707           syscall_alloc(&current,i);
8708           break;
8709         case SPAN:
8710           pagespan_alloc(&current,i);
8711           break;
8712       }
8713       
8714       // Drop the upper half of registers that have become 32-bit
8715       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8716       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8717         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8718         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8719         current.uu|=1;
8720       } else {
8721         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8722         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8723         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8724         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8725         current.uu|=1;
8726       }
8727
8728       // Create entry (branch target) regmap
8729       for(hr=0;hr<HOST_REGS;hr++)
8730       {
8731         int r,or,er;
8732         r=current.regmap[hr];
8733         if(r>=0) {
8734           if(r!=regmap_pre[i][hr]) {
8735             // TODO: delay slot (?)
8736             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8737             if(or<0||(r&63)>=TEMPREG){
8738               regs[i].regmap_entry[hr]=-1;
8739             }
8740             else
8741             {
8742               // Just move it to a different register
8743               regs[i].regmap_entry[hr]=r;
8744               // If it was dirty before, it's still dirty
8745               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8746             }
8747           }
8748           else
8749           {
8750             // Unneeded
8751             if(r==0){
8752               regs[i].regmap_entry[hr]=0;
8753             }
8754             else
8755             if(r<64){
8756               if((current.u>>r)&1) {
8757                 regs[i].regmap_entry[hr]=-1;
8758                 //regs[i].regmap[hr]=-1;
8759                 current.regmap[hr]=-1;
8760               }else
8761                 regs[i].regmap_entry[hr]=r;
8762             }
8763             else {
8764               if((current.uu>>(r&63))&1) {
8765                 regs[i].regmap_entry[hr]=-1;
8766                 //regs[i].regmap[hr]=-1;
8767                 current.regmap[hr]=-1;
8768               }else
8769                 regs[i].regmap_entry[hr]=r;
8770             }
8771           }
8772         } else {
8773           // Branches expect CCREG to be allocated at the target
8774           if(regmap_pre[i][hr]==CCREG) 
8775             regs[i].regmap_entry[hr]=CCREG;
8776           else
8777             regs[i].regmap_entry[hr]=-1;
8778         }
8779       }
8780       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8781     }
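         /*
          * At this point regs[i].regmap_entry describes what a jump into this
          * instruction must already have in each host register (-1 meaning
          * "don't care"), while regs[i].regmap is what the straight-line code
          * actually uses.  For example, if host reg 3 held guest r5 before
          * this instruction and still does, regmap_entry[3]=5 (unless r5 is
          * known to be dead here); if this instruction is where r5 is first
          * loaded, regmap_entry[3]=-1 since an incoming branch need not
          * provide it.
          */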
8782     /* Branch post-alloc */
8783     if(i>0)
8784     {
8785       current.was32=current.is32;
8786       current.wasdirty=current.dirty;
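           /*
            * branch_regs[i-1] records the register state that the taken path
            * of the branch at i-1 delivers to its target, while `current`
            * keeps tracking the fall-through path.  Each case below builds it
            * from `current`, re-applying the branch's own requirements
            * (condition registers, CCREG, r31 for the linking forms).
            */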
8787       switch(itype[i-1]) {
8788         case UJUMP:
8789           memcpy(&branch_regs[i-1],&current,sizeof(current));
8790           branch_regs[i-1].isconst=0;
8791           branch_regs[i-1].wasconst=0;
8792           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8793           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8794           alloc_cc(&branch_regs[i-1],i-1);
8795           dirty_reg(&branch_regs[i-1],CCREG);
8796           if(rt1[i-1]==31) { // JAL
8797             alloc_reg(&branch_regs[i-1],i-1,31);
8798             dirty_reg(&branch_regs[i-1],31);
8799             branch_regs[i-1].is32|=1LL<<31;
8800           }
8801           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8802           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8803           break;
8804         case RJUMP:
8805           memcpy(&branch_regs[i-1],&current,sizeof(current));
8806           branch_regs[i-1].isconst=0;
8807           branch_regs[i-1].wasconst=0;
8808           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8809           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8810           alloc_cc(&branch_regs[i-1],i-1);
8811           dirty_reg(&branch_regs[i-1],CCREG);
8812           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8813           if(rt1[i-1]==31) { // JALR
8814             alloc_reg(&branch_regs[i-1],i-1,31);
8815             dirty_reg(&branch_regs[i-1],31);
8816             branch_regs[i-1].is32|=1LL<<31;
8817           }
8818           #ifdef USE_MINI_HT
8819           if(rs1[i-1]==31) { // JALR
8820             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8821             #ifndef HOST_IMM_ADDR32
8822             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8823             #endif
8824           }
8825           #endif
8826           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8827           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8828           break;
8829         case CJUMP:
8830           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8831           {
8832             alloc_cc(&current,i-1);
8833             dirty_reg(&current,CCREG);
8834             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8835                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8836               // The delay slot overwrote one of our conditions
8837               // Delay slot goes after the test (in order)
8838               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8839               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8840               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8841               current.u|=1;
8842               current.uu|=1;
8843               delayslot_alloc(&current,i);
8844               current.isconst=0;
8845             }
8846             else
8847             {
8848               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8849               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8850               // Alloc the branch condition registers
8851               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8852               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8853               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8854               {
8855                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8856                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8857               }
8858             }
8859             memcpy(&branch_regs[i-1],&current,sizeof(current));
8860             branch_regs[i-1].isconst=0;
8861             branch_regs[i-1].wasconst=0;
8862             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8863             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8864           }
8865           else
8866           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8867           {
8868             alloc_cc(&current,i-1);
8869             dirty_reg(&current,CCREG);
8870             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8871               // The delay slot overwrote the branch condition
8872               // Delay slot goes after the test (in order)
8873               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8874               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8875               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8876               current.u|=1;
8877               current.uu|=1;
8878               delayslot_alloc(&current,i);
8879               current.isconst=0;
8880             }
8881             else
8882             {
8883               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8884               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8885               // Alloc the branch condition register
8886               alloc_reg(&current,i-1,rs1[i-1]);
8887               if(!(current.is32>>rs1[i-1]&1))
8888               {
8889                 alloc_reg64(&current,i-1,rs1[i-1]);
8890               }
8891             }
8892             memcpy(&branch_regs[i-1],&current,sizeof(current));
8893             branch_regs[i-1].isconst=0;
8894             branch_regs[i-1].wasconst=0;
8895             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8896             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8897           }
8898           else
8899           // Alloc the delay slot in case the branch is taken
8900           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8901           {
8902             memcpy(&branch_regs[i-1],&current,sizeof(current));
8903             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8904             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8905             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8906             alloc_cc(&branch_regs[i-1],i);
8907             dirty_reg(&branch_regs[i-1],CCREG);
8908             delayslot_alloc(&branch_regs[i-1],i);
8909             branch_regs[i-1].isconst=0;
8910             alloc_reg(&current,i,CCREG); // Not taken path
8911             dirty_reg(&current,CCREG);
8912             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8913           }
8914           else
8915           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8916           {
8917             memcpy(&branch_regs[i-1],&current,sizeof(current));
8918             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8919             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8920             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8921             alloc_cc(&branch_regs[i-1],i);
8922             dirty_reg(&branch_regs[i-1],CCREG);
8923             delayslot_alloc(&branch_regs[i-1],i);
8924             branch_regs[i-1].isconst=0;
8925             alloc_reg(&current,i,CCREG); // Not taken path
8926             dirty_reg(&current,CCREG);
8927             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8928           }
8929           break;
8930         case SJUMP:
8931           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8932           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8933           {
8934             alloc_cc(&current,i-1);
8935             dirty_reg(&current,CCREG);
8936             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8937               // The delay slot overwrote the branch condition
8938               // Delay slot goes after the test (in order)
8939               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8940               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8941               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8942               current.u|=1;
8943               current.uu|=1;
8944               delayslot_alloc(&current,i);
8945               current.isconst=0;
8946             }
8947             else
8948             {
8949               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8950               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8951               // Alloc the branch condition register
8952               alloc_reg(&current,i-1,rs1[i-1]);
8953               if(!(current.is32>>rs1[i-1]&1))
8954               {
8955                 alloc_reg64(&current,i-1,rs1[i-1]);
8956               }
8957             }
8958             memcpy(&branch_regs[i-1],&current,sizeof(current));
8959             branch_regs[i-1].isconst=0;
8960             branch_regs[i-1].wasconst=0;
8961             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8962             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
8963           }
8964           else
8965           // Alloc the delay slot in case the branch is taken
8966           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8967           {
8968             memcpy(&branch_regs[i-1],&current,sizeof(current));
8969             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8970             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8971             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8972             alloc_cc(&branch_regs[i-1],i);
8973             dirty_reg(&branch_regs[i-1],CCREG);
8974             delayslot_alloc(&branch_regs[i-1],i);
8975             branch_regs[i-1].isconst=0;
8976             alloc_reg(&current,i,CCREG); // Not taken path
8977             dirty_reg(&current,CCREG);
8978             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8979           }
8980           // FIXME: BLTZAL/BGEZAL
8981           if(opcode2[i-1]&0x10) { // BxxZAL
8982             alloc_reg(&branch_regs[i-1],i-1,31);
8983             dirty_reg(&branch_regs[i-1],31);
8984             branch_regs[i-1].is32|=1LL<<31;
8985           }
8986           break;
8987         case FJUMP:
8988           if(likely[i-1]==0) // BC1F/BC1T
8989           {
8990             alloc_cc(&current,i-1);
8991             dirty_reg(&current,CCREG);
8992             if(itype[i]==FCOMP) {
8993               // The delay slot overwrote the branch condition
8994               // Delay slot goes after the test (in order)
8995               delayslot_alloc(&current,i);
8996               current.isconst=0;
8997             }
8998             else
8999             {
9000               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9001               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9002               // Alloc the branch condition register
9003               alloc_reg(&current,i-1,FSREG);
9004             }
9005             memcpy(&branch_regs[i-1],&current,sizeof(current));
9006             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9007           }
9008           else // BC1FL/BC1TL
9009           {
9010             // Alloc the delay slot in case the branch is taken
9011             memcpy(&branch_regs[i-1],&current,sizeof(current));
9012             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9013             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9014             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9015             alloc_cc(&branch_regs[i-1],i);
9016             dirty_reg(&branch_regs[i-1],CCREG);
9017             delayslot_alloc(&branch_regs[i-1],i);
9018             branch_regs[i-1].isconst=0;
9019             alloc_reg(&current,i,CCREG); // Not taken path
9020             dirty_reg(&current,CCREG);
9021             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9022           }
9023           break;
9024       }
9025
9026       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9027       {
9028         if(rt1[i-1]==31) // JAL/JALR
9029         {
9030           // Subroutine call will return here, don't alloc any registers
9031           current.is32=1;
9032           current.dirty=0;
9033           clear_all_regs(current.regmap);
9034           alloc_reg(&current,i,CCREG);
9035           dirty_reg(&current,CCREG);
9036         }
9037         else if(i+1<slen)
9038         {
9039           // Internal branch will jump here, match registers to caller
9040           current.is32=0x3FFFFFFFFLL;
9041           current.dirty=0;
9042           clear_all_regs(current.regmap);
9043           alloc_reg(&current,i,CCREG);
9044           dirty_reg(&current,CCREG);
9045           for(j=i-1;j>=0;j--)
9046           {
9047             if(ba[j]==start+i*4+4) {
9048               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9049               current.is32=branch_regs[j].is32;
9050               current.dirty=branch_regs[j].dirty;
9051               break;
9052             }
9053           }
9054           while(j>=0) {
9055             if(ba[j]==start+i*4+4) {
9056               for(hr=0;hr<HOST_REGS;hr++) {
9057                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9058                   current.regmap[hr]=-1;
9059                 }
9060                 current.is32&=branch_regs[j].is32;
9061                 current.dirty&=branch_regs[j].dirty;
9062               }
9063             }
9064             j--;
9065           }
9066         }
9067       }
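           /*
            * Summary of the block above: past an unconditional jump the
            * fall-through state must be rebuilt.  After a call (rt1==31) the
            * callee may clobber anything, so only CCREG survives; otherwise
            * this point is only reachable through internal branches, so
            * `current` becomes the intersection of every branch_regs[] that
            * targets it: host registers that disagree between paths are
            * dropped and is32/dirty are ANDed conservatively.
            */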
9068     }
9069
9070     // Count cycles in between branches
9071     ccadj[i]=cc;
9072     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL))
9073     {
9074       cc=0;
9075     }
9076     else
9077     {
9078       cc++;
9079     }
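         /*
          * ccadj[i] is the number of instructions since the last branch or
          * syscall boundary, so the emulated cycle counter presumably only
          * needs to be adjusted where control can actually leave the block.
          */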
9080
9081     flush_dirty_uppers(&current);
9082     if(!is_ds[i]) {
9083       regs[i].is32=current.is32;
9084       regs[i].dirty=current.dirty;
9085       regs[i].isconst=current.isconst;
9086       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9087     }
9088     for(hr=0;hr<HOST_REGS;hr++) {
9089       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9090         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9091           regs[i].wasconst&=~(1<<hr);
9092         }
9093       }
9094     }
9095     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9096   }
9097   
9098   /* Pass 4 - Cull unused host registers */
9099   
9100   uint64_t nr=0;
9101   
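       /*
        * Pass 4 is a backward liveness scan over host registers.  nr has one
        * bit per host register; walking from the last instruction up, each
        * step applies the usual dataflow rule
        *
        *     needed_before = (needed_after & ~overwritten_here) | read_here
        *
        * where "read_here" also covers the delay slot's sources when the
        * instruction is a branch.  Host registers whose bit stays clear are
        * then dropped from the maps further down.
        */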
9102   for (i=slen-1;i>=0;i--)
9103   {
9104     int hr;
9105     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9106     {
9107       if(ba[i]<start || ba[i]>=(start+slen*4))
9108       {
9109         // Branch out of this block, don't need anything
9110         nr=0;
9111       }
9112       else
9113       {
9114         // Internal branch
9115         // Need whatever matches the target
9116         nr=0;
9117         int t=(ba[i]-start)>>2;
9118         for(hr=0;hr<HOST_REGS;hr++)
9119         {
9120           if(regs[i].regmap_entry[hr]>=0) {
9121             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9122           }
9123         }
9124       }
9125       // Conditional branch may need registers for following instructions
9126       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9127       {
9128         if(i<slen-2) {
9129           nr|=needed_reg[i+2];
9130           for(hr=0;hr<HOST_REGS;hr++)
9131           {
9132             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9133             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9134           }
9135         }
9136       }
9137       // Merge in delay slot
9138       for(hr=0;hr<HOST_REGS;hr++)
9139       {
9140         // Don't need stuff which is overwritten
9141         if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9142         if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9143         if(!likely[i]) {
9144           // These are overwritten unless the branch is "likely"
9145           // and the delay slot is nullified if not taken
9146           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9147           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9148         }
9149         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9150         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9151         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9152         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9153         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9154         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9155         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9156         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9157         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9158           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9159           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9160         }
9161         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9162           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9163           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9164         }
9165         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9166           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9167           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9168         }
9169       }
9170     }
9171     else if(itype[i]==SYSCALL)
9172     {
9173       // SYSCALL instruction (software interrupt)
9174       nr=0;
9175     }
9176     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9177     {
9178       // ERET instruction (return from interrupt)
9179       nr=0;
9180     }
9181     else // Non-branch
9182     {
9183       if(i<slen-1) {
9184         for(hr=0;hr<HOST_REGS;hr++) {
9185           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9186           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9187           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9188           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9189         }
9190       }
9191     }
9192     for(hr=0;hr<HOST_REGS;hr++)
9193     {
9194       // Overwritten registers are not needed
9195       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9196       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9197       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9198       // Source registers are needed
9199       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9200       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9201       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9202       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9203       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9204       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9205       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9206       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9207       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9208         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9209         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9210       }
9211       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9212         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9213         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9214       }
9215       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9216         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9217         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9218       }
9219       // Don't store a register immediately after writing it,
9220       // may prevent dual-issue.
9221       // But do so if this is a branch target, otherwise we
9222       // might have to load the register before the branch.
9223       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9224         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9225            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9226           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9227           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9228         }
9229         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9230            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9231           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9232           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9233         }
9234       }
9235     }
9236     // Cycle count is needed at branches.  Assume it is needed at the target too.
9237     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9238       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9239       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9240     }
9241     // Save it
9242     needed_reg[i]=nr;
9243     
9244     // Deallocate unneeded registers
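    // (A host register is freed only if it does not hold a source, target,
    // or temporary register that this instruction, or the delay slot of a
    // branch, still relies on; the checks below enumerate those cases.)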
9245     for(hr=0;hr<HOST_REGS;hr++)
9246     {
9247       if(!((nr>>hr)&1)) {
9248         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9249         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9250            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9251            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9252         {
9253           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9254           {
9255             if(likely[i]) {
9256               regs[i].regmap[hr]=-1;
9257               regs[i].isconst&=~(1<<hr);
9258               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9259             }
9260           }
9261         }
9262         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9263         {
9264           int d1=0,d2=0,map=0,temp=0;
9265           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9266           {
9267             d1=dep1[i+1];
9268             d2=dep2[i+1];
9269           }
9270           if(using_tlb) {
9271             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9272                itype[i+1]==STORE || itype[i+1]==STORELR ||
9273                itype[i+1]==C1LS )
9274             map=TLREG;
9275           } else
9276           if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) {
9277             map=INVCP;
9278           }
9279           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9280              itype[i+1]==C1LS )
9281             temp=FTEMP;
9282           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9283              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9284              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9285              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9286              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9287              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9288              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9289              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9290              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9291              regs[i].regmap[hr]!=map )
9292           {
9293             regs[i].regmap[hr]=-1;
9294             regs[i].isconst&=~(1<<hr);
9295             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9296                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9297                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9298                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9299                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9300                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9301                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9302                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9303                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9304                branch_regs[i].regmap[hr]!=map)
9305             {
9306               branch_regs[i].regmap[hr]=-1;
9307               branch_regs[i].regmap_entry[hr]=-1;
9308               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9309               {
9310                 if(!likely[i]&&i<slen-2) {
9311                   regmap_pre[i+2][hr]=-1;
9312                 }
9313               }
9314             }
9315           }
9316         }
9317         else
9318         {
9319           // Non-branch
9320           if(i>0)
9321           {
9322             int d1=0,d2=0,map=-1,temp=-1;
9323             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9324             {
9325               d1=dep1[i];
9326               d2=dep2[i];
9327             }
9328             if(using_tlb) {
9329               if(itype[i]==LOAD || itype[i]==LOADLR ||
9330                  itype[i]==STORE || itype[i]==STORELR ||
9331                  itype[i]==C1LS )
9332               map=TLREG;
9333             } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) {
9334               map=INVCP;
9335             }
9336             if(itype[i]==LOADLR || itype[i]==STORELR ||
9337                itype[i]==C1LS )
9338               temp=FTEMP;
9339             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9340                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9341                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9342                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9343                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9344                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9345             {
9346               if(i<slen-1&&!is_ds[i]) {
9347                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9348                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9349                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9350                 {
9351                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9352                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9353                 }
9354                 regmap_pre[i+1][hr]=-1;
9355                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9356               }
9357               regs[i].regmap[hr]=-1;
9358               regs[i].isconst&=~(1<<hr);
9359             }
9360           }
9361         }
9362       }
9363     }
9364   }
9365   
9366   /* Pass 5 - Pre-allocate registers */
9367   
9368   // If a register is allocated during a loop, try to allocate it for the
9369   // entire loop, if possible.  This avoids loading/storing registers
9370   // inside the loop.
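  // Roughly: for each backward branch whose target lies earlier in this
  // block, f_regmap records the current register mapping at the branch;
  // if the same mapping can be carried from the branch target up to the
  // branch without conflicts, regs[], regmap_pre[] and the entry maps are
  // filled in so the value stays in a host register across the loop body.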
9371
9372   signed char f_regmap[HOST_REGS];
9373   clear_all_regs(f_regmap);
9374   for(i=0;i<slen-1;i++)
9375   {
9376     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9377     {
9378       if(ba[i]>=start && ba[i]<(start+i*4)) 
9379       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9380       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9381       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9382       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9383       ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9384       {
9385         int t=(ba[i]-start)>>2;
9386         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9387         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9388         for(hr=0;hr<HOST_REGS;hr++)
9389         {
9390           if(regs[i].regmap[hr]>64) {
9391             if(!((regs[i].dirty>>hr)&1))
9392               f_regmap[hr]=regs[i].regmap[hr];
9393             else f_regmap[hr]=-1;
9394           }
9395           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9396           if(branch_regs[i].regmap[hr]>64) {
9397             if(!((branch_regs[i].dirty>>hr)&1))
9398               f_regmap[hr]=branch_regs[i].regmap[hr];
9399             else f_regmap[hr]=-1;
9400           }
9401           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9402           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9403           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9404           ||itype[i+1]==FCOMP||itype[i+1]==FCONV)
9405           {
9406             // Test both in case the delay slot is out-of-order;
9407             // this could be done better...
9408             if(count_free_regs(branch_regs[i].regmap)<2
9409              ||count_free_regs(regs[i].regmap)<2) 
9410               f_regmap[hr]=branch_regs[i].regmap[hr];
9411           }
9412           // Avoid dirty->clean transition
9413           // #ifdef DESTRUCTIVE_WRITEBACK here?
9414           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9415           if(f_regmap[hr]>0) {
9416             if(regs[t].regmap_entry[hr]<0) {
9417               int r=f_regmap[hr];
9418               for(j=t;j<=i;j++)
9419               {
9420                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9421                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9422                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9423                 if(r>63) {
9424                   // NB This can exclude the case where the upper-half
9425                   // register is lower numbered than the lower-half
9426                   // register.  Not sure if it's worth fixing...
9427                   if(get_reg(regs[j].regmap,r&63)<0) break;
9428                   if(regs[j].is32&(1LL<<(r&63))) break;
9429                 }
9430                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9431                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9432                   int k;
9433                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9434                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9435                     if(r>63) {
9436                       if(get_reg(regs[i].regmap,r&63)<0) break;
9437                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9438                     }
9439                     k=i;
9440                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9441                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9442                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9443                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV
9444                       ||itype[k-1]==FCOMP) {
9445                         if(count_free_regs(regs[k-1].regmap)<2) {
9446                           //printf("no free regs for store %x\n",start+(k-1)*4);
9447                           break;
9448                         }
9449                       }
9450                       else
9451                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9452                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9453                         //printf("no-match due to different register\n");
9454                         break;
9455                       }
9456                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9457                         //printf("no-match due to branch\n");
9458                         break;
9459                       }
9460                       // call/ret fast path assumes no registers allocated
9461                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9462                         break;
9463                       }
9464                       if(r>63) {
9465                         // NB This can exclude the case where the upper-half
9466                         // register is lower numbered than the lower-half
9467                         // register.  Not sure if it's worth fixing...
9468                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9469                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9470                       }
9471                       k--;
9472                     }
9473                     if(i<slen-1) {
9474                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9475                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9476                         //printf("bad match after branch\n");
9477                         break;
9478                       }
9479                     }
9480                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9481                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9482                       while(k<i) {
9483                         regs[k].regmap_entry[hr]=f_regmap[hr];
9484                         regs[k].regmap[hr]=f_regmap[hr];
9485                         regmap_pre[k+1][hr]=f_regmap[hr];
9486                         regs[k].wasdirty&=~(1<<hr);
9487                         regs[k].dirty&=~(1<<hr);
9488                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9489                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9490                         regs[k].wasconst&=~(1<<hr);
9491                         regs[k].isconst&=~(1<<hr);
9492                         k++;
9493                       }
9494                     }
9495                     else {
9496                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9497                       break;
9498                     }
9499                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9500                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9501                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9502                       regs[i].regmap_entry[hr]=f_regmap[hr];
9503                       regs[i].regmap[hr]=f_regmap[hr];
9504                       regs[i].wasdirty&=~(1<<hr);
9505                       regs[i].dirty&=~(1<<hr);
9506                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9507                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9508                       regs[i].wasconst&=~(1<<hr);
9509                       regs[i].isconst&=~(1<<hr);
9510                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9511                       branch_regs[i].wasdirty&=~(1<<hr);
9512                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9513                       branch_regs[i].regmap[hr]=f_regmap[hr];
9514                       branch_regs[i].dirty&=~(1<<hr);
9515                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9516                       branch_regs[i].wasconst&=~(1<<hr);
9517                       branch_regs[i].isconst&=~(1<<hr);
9518                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9519                         regmap_pre[i+2][hr]=f_regmap[hr];
9520                         regs[i+2].wasdirty&=~(1<<hr);
9521                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9522                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9523                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9524                       }
9525                     }
9526                   }
9527                   for(k=t;k<j;k++) {
9528                     regs[k].regmap_entry[hr]=f_regmap[hr];
9529                     regs[k].regmap[hr]=f_regmap[hr];
9530                     regmap_pre[k+1][hr]=f_regmap[hr];
9531                     regs[k+1].wasdirty&=~(1<<hr);
9532                     regs[k].dirty&=~(1<<hr);
9533                     regs[k].wasconst&=~(1<<hr);
9534                     regs[k].isconst&=~(1<<hr);
9535                   }
9536                   if(regs[j].regmap[hr]==f_regmap[hr])
9537                     regs[j].regmap_entry[hr]=f_regmap[hr];
9538                   break;
9539                 }
9540                 if(j==i) break;
9541                 if(regs[j].regmap[hr]>=0)
9542                   break;
9543                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9544                   //printf("no-match due to different register\n");
9545                   break;
9546                 }
9547                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9548                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9549                   break;
9550                 }
9551                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9552                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9553                 ||itype[j]==FCOMP||itype[j]==FCONV) {
9554                   if(count_free_regs(regs[j].regmap)<2) {
9555                     //printf("No free regs for store %x\n",start+j*4);
9556                     break;
9557                   }
9558                 }
9559                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9560                 if(f_regmap[hr]>=64) {
9561                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9562                     break;
9563                   }
9564                   else
9565                   {
9566                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9567                       break;
9568                     }
9569                   }
9570                 }
9571               }
9572             }
9573           }
9574         }
9575       }
9576     }else{
9577       int count=0;
9578       for(hr=0;hr<HOST_REGS;hr++)
9579       {
9580         if(hr!=EXCLUDE_REG) {
9581           if(regs[i].regmap[hr]>64) {
9582             if(!((regs[i].dirty>>hr)&1))
9583               f_regmap[hr]=regs[i].regmap[hr];
9584           }
9585           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9586           else if(regs[i].regmap[hr]<0) count++;
9587         }
9588       }
9589       // Try to restore cycle count at branch targets
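      // Scan forward to the next instruction where HOST_CCREG is mapped;
      // if the cycle count (CCREG) is still held there, extend that mapping
      // back to this branch target (and, further down, backwards past it)
      // so the count does not have to be reloaded.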
9590       if(bt[i]) {
9591         for(j=i;j<slen-1;j++) {
9592           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9593           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9594           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9595           ||itype[j]==FCOMP||itype[j]==FCONV) {
9596             if(count_free_regs(regs[j].regmap)<2) {
9597               //printf("no free regs for store %x\n",start+j*4);
9598               break;
9599             }
9600           }
9601           else
9602           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9603         }
9604         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9605           int k=i;
9606           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9607           while(k<j) {
9608             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9609             regs[k].regmap[HOST_CCREG]=CCREG;
9610             regmap_pre[k+1][HOST_CCREG]=CCREG;
9611             regs[k+1].wasdirty|=1<<HOST_CCREG;
9612             regs[k].dirty|=1<<HOST_CCREG;
9613             regs[k].wasconst&=~(1<<HOST_CCREG);
9614             regs[k].isconst&=~(1<<HOST_CCREG);
9615             k++;
9616           }
9617           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9618         }
9619         // Work backwards from the branch target
9620         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9621         {
9622           //printf("Extend backwards\n");
9623           int k;
9624           k=i;
9625           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9626             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9627             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9628             ||itype[k-1]==FCONV||itype[k-1]==FCOMP) {
9629               if(count_free_regs(regs[k-1].regmap)<2) {
9630                 //printf("no free regs for store %x\n",start+(k-1)*4);
9631                 break;
9632               }
9633             }
9634             else
9635             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9636             k--;
9637           }
9638           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9639             //printf("Extend CC, %x ->\n",start+k*4);
9640             while(k<=i) {
9641               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9642               regs[k].regmap[HOST_CCREG]=CCREG;
9643               regmap_pre[k+1][HOST_CCREG]=CCREG;
9644               regs[k+1].wasdirty|=1<<HOST_CCREG;
9645               regs[k].dirty|=1<<HOST_CCREG;
9646               regs[k].wasconst&=~(1<<HOST_CCREG);
9647               regs[k].isconst&=~(1<<HOST_CCREG);
9648               k++;
9649             }
9650           }
9651           else {
9652             //printf("Fail Extend CC, %x ->\n",start+k*4);
9653           }
9654         }
9655       }
9656       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9657          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9658          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9659          itype[i]!=FCONV&&itype[i]!=FCOMP)
9660       {
9661         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9662       }
9663     }
9664   }
9665   
9666   // This allocates registers (if possible) one instruction prior
9667   // to use, which can avoid a load-use penalty on certain CPUs.
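  // Concretely: if the next instruction reads rs1/rs2 from host register hr
  // and hr is unused by the current instruction, the allocation (and any
  // known constant) is copied back one slot so the value is loaded early.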
9668   for(i=0;i<slen-1;i++)
9669   {
9670     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9671     {
9672       if(!bt[i+1])
9673       {
9674         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3))
9675         {
9676           if(rs1[i+1]) {
9677             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9678             {
9679               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9680               {
9681                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9682                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9683                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9684                 regs[i].isconst&=~(1<<hr);
9685                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9686                 constmap[i][hr]=constmap[i+1][hr];
9687                 regs[i+1].wasdirty&=~(1<<hr);
9688                 regs[i].dirty&=~(1<<hr);
9689               }
9690             }
9691           }
9692           if(rs2[i+1]) {
9693             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9694             {
9695               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9696               {
9697                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9698                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9699                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9700                 regs[i].isconst&=~(1<<hr);
9701                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9702                 constmap[i][hr]=constmap[i+1][hr];
9703                 regs[i+1].wasdirty&=~(1<<hr);
9704                 regs[i].dirty&=~(1<<hr);
9705               }
9706             }
9707           }
9708           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9709             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9710             {
9711               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9712               {
9713                 regs[i].regmap[hr]=rs1[i+1];
9714                 regmap_pre[i+1][hr]=rs1[i+1];
9715                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9716                 regs[i].isconst&=~(1<<hr);
9717                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9718                 constmap[i][hr]=constmap[i+1][hr];
9719                 regs[i+1].wasdirty&=~(1<<hr);
9720                 regs[i].dirty&=~(1<<hr);
9721               }
9722             }
9723           }
9724           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9725             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9726             {
9727               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9728               {
9729                 regs[i].regmap[hr]=rs1[i+1];
9730                 regmap_pre[i+1][hr]=rs1[i+1];
9731                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9732                 regs[i].isconst&=~(1<<hr);
9733                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9734                 constmap[i][hr]=constmap[i+1][hr];
9735                 regs[i+1].wasdirty&=~(1<<hr);
9736                 regs[i].dirty&=~(1<<hr);
9737               }
9738             }
9739           }
9740           #ifndef HOST_IMM_ADDR32
9741           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) {
9742             hr=get_reg(regs[i+1].regmap,TLREG);
9743             if(hr>=0) {
9744               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
9745               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
9746                 int nr;
9747                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9748                 {
9749                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
9750                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
9751                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
9752                   regs[i].isconst&=~(1<<hr);
9753                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9754                   constmap[i][hr]=constmap[i+1][hr];
9755                   regs[i+1].wasdirty&=~(1<<hr);
9756                   regs[i].dirty&=~(1<<hr);
9757                 }
9758                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9759                 {
9760                   // move it to another register
9761                   regs[i+1].regmap[hr]=-1;
9762                   regmap_pre[i+2][hr]=-1;
9763                   regs[i+1].regmap[nr]=TLREG;
9764                   regmap_pre[i+2][nr]=TLREG;
9765                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
9766                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
9767                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
9768                   regs[i].isconst&=~(1<<nr);
9769                   regs[i+1].isconst&=~(1<<nr);
9770                   regs[i].dirty&=~(1<<nr);
9771                   regs[i+1].wasdirty&=~(1<<nr);
9772                   regs[i+1].dirty&=~(1<<nr);
9773                   regs[i+2].wasdirty&=~(1<<nr);
9774                 }
9775               }
9776             }
9777           }
9778           #endif
9779           if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1
9780             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9781               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9782               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9783               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9784               assert(hr>=0);
9785               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9786               {
9787                 regs[i].regmap[hr]=rs1[i+1];
9788                 regmap_pre[i+1][hr]=rs1[i+1];
9789                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9790                 regs[i].isconst&=~(1<<hr);
9791                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9792                 constmap[i][hr]=constmap[i+1][hr];
9793                 regs[i+1].wasdirty&=~(1<<hr);
9794                 regs[i].dirty&=~(1<<hr);
9795               }
9796             }
9797           }
9798           if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1
9799             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9800               int nr;
9801               hr=get_reg(regs[i+1].regmap,FTEMP);
9802               assert(hr>=0);
9803               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9804               {
9805                 regs[i].regmap[hr]=rs1[i+1];
9806                 regmap_pre[i+1][hr]=rs1[i+1];
9807                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9808                 regs[i].isconst&=~(1<<hr);
9809                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9810                 constmap[i][hr]=constmap[i+1][hr];
9811                 regs[i+1].wasdirty&=~(1<<hr);
9812                 regs[i].dirty&=~(1<<hr);
9813               }
9814               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9815               {
9816                 // move it to another register
9817                 regs[i+1].regmap[hr]=-1;
9818                 regmap_pre[i+2][hr]=-1;
9819                 regs[i+1].regmap[nr]=FTEMP;
9820                 regmap_pre[i+2][nr]=FTEMP;
9821                 regs[i].regmap[nr]=rs1[i+1];
9822                 regmap_pre[i+1][nr]=rs1[i+1];
9823                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9824                 regs[i].isconst&=~(1<<nr);
9825                 regs[i+1].isconst&=~(1<<nr);
9826                 regs[i].dirty&=~(1<<nr);
9827                 regs[i+1].wasdirty&=~(1<<nr);
9828                 regs[i+1].dirty&=~(1<<nr);
9829                 regs[i+2].wasdirty&=~(1<<nr);
9830               }
9831             }
9832           }
9833           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) {
9834             if(itype[i+1]==LOAD) 
9835               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9836             if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1
9837               hr=get_reg(regs[i+1].regmap,FTEMP);
9838             if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1
9839               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9840               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9841             }
9842             if(hr>=0&&regs[i].regmap[hr]<0) {
9843               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9844               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9845                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9846                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9847                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9848                 regs[i].isconst&=~(1<<hr);
9849                 regs[i+1].wasdirty&=~(1<<hr);
9850                 regs[i].dirty&=~(1<<hr);
9851               }
9852             }
9853           }
9854         }
9855       }
9856     }
9857   }
9858   
9859   /* Pass 6 - Optimize clean/dirty state */
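  // clean_registers() (defined earlier in this file) recomputes which
  // registers actually need to be written back, so writebacks are only
  // emitted where a later use or a block exit requires the value in memory.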
9860   clean_registers(0,slen-1,1);
9861   
9862   /* Pass 7 - Identify 32-bit registers */
9863   
9864   provisional_r32();
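  // requires_32bit[] is then computed by walking the block backwards: r32
  // accumulates registers whose upper 32 bits must remain valid (branch
  // targets, 32-bit source operands, dirty 32-bit entry registers), and
  // registers that get overwritten drop out of the set.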
9865
9866   u_int r32=0;
9867   
9868   for (i=slen-1;i>=0;i--)
9869   {
9870     int hr;
9871     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9872     {
9873       if(ba[i]<start || ba[i]>=(start+slen*4))
9874       {
9875         // Branch out of this block; don't need anything
9876         r32=0;
9877       }
9878       else
9879       {
9880         // Internal branch
9881         // Need whatever matches the target
9882         // (and doesn't get overwritten by the delay slot instruction)
9883         r32=0;
9884         int t=(ba[i]-start)>>2;
9885         if(ba[i]>start+i*4) {
9886           // Forward branch
9887           if(!(requires_32bit[t]&~regs[i].was32))
9888             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9889         }else{
9890           // Backward branch
9891           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
9892           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9893           if(!(pr32[t]&~regs[i].was32))
9894             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
9895         }
9896       }
9897       // Conditional branch may need registers for following instructions
9898       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9899       {
9900         if(i<slen-2) {
9901           r32|=requires_32bit[i+2];
9902           r32&=regs[i].was32;
9903           // Mark this address as a branch target since it may be called
9904           // upon return from an interrupt
9905           bt[i+2]=1;
9906         }
9907       }
9908       // Merge in delay slot
9909       if(!likely[i]) {
9910         // These are overwritten unless the branch is "likely"
9911         // and the delay slot is nullified if not taken
9912         r32&=~(1LL<<rt1[i+1]);
9913         r32&=~(1LL<<rt2[i+1]);
9914       }
9915       // Assume these are needed (delay slot)
9916       if(us1[i+1]>0)
9917       {
9918         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
9919       }
9920       if(us2[i+1]>0)
9921       {
9922         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
9923       }
9924       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
9925       {
9926         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
9927       }
9928       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
9929       {
9930         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
9931       }
9932     }
9933     else if(itype[i]==SYSCALL)
9934     {
9935       // SYSCALL instruction (software interrupt)
9936       r32=0;
9937     }
9938     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9939     {
9940       // ERET instruction (return from interrupt)
9941       r32=0;
9942     }
9943     // Check 32 bits
9944     r32&=~(1LL<<rt1[i]);
9945     r32&=~(1LL<<rt2[i]);
9946     if(us1[i]>0)
9947     {
9948       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
9949     }
9950     if(us2[i]>0)
9951     {
9952       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
9953     }
9954     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
9955     {
9956       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
9957     }
9958     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
9959     {
9960       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
9961     }
9962     requires_32bit[i]=r32;
9963     
9964     // Dirty registers which are 32-bit require 32-bit input,
9965     // as they will be written back as 32-bit values
9966     for(hr=0;hr<HOST_REGS;hr++)
9967     {
9968       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
9969         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
9970           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
9971           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
9972         }
9973       }
9974     }
9975     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
9976   }
9977
9978   if(itype[slen-1]==SPAN) {
9979     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9980   }
9981   
9982   /* Debug/disassembly */
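  // Only active when assem_debug is routed to printf: dumps the unneeded
  // and 32-bit register sets, the pre/entry/exit register maps, dirty bits
  // and known constants for every instruction, alongside its disassembly.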
9983   if((void*)assem_debug==(void*)printf) 
9984   for(i=0;i<slen;i++)
9985   {
9986     printf("U:");
9987     int r;
9988     for(r=1;r<=CCREG;r++) {
9989       if((unneeded_reg[i]>>r)&1) {
9990         if(r==HIREG) printf(" HI");
9991         else if(r==LOREG) printf(" LO");
9992         else printf(" r%d",r);
9993       }
9994     }
9995 #ifndef FORCE32
9996     printf(" UU:");
9997     for(r=1;r<=CCREG;r++) {
9998       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
9999         if(r==HIREG) printf(" HI");
10000         else if(r==LOREG) printf(" LO");
10001         else printf(" r%d",r);
10002       }
10003     }
10004     printf(" 32:");
10005     for(r=0;r<=CCREG;r++) {
10006       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10007       if((regs[i].was32>>r)&1) {
10008         if(r==CCREG) printf(" CC");
10009         else if(r==HIREG) printf(" HI");
10010         else if(r==LOREG) printf(" LO");
10011         else printf(" r%d",r);
10012       }
10013     }
10014 #endif
10015     printf("\n");
10016     #if defined(__i386__) || defined(__x86_64__)
10017     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10018     #endif
10019     #ifdef __arm__
10020     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10021     #endif
10022     printf("needs: ");
10023     if(needed_reg[i]&1) printf("eax ");
10024     if((needed_reg[i]>>1)&1) printf("ecx ");
10025     if((needed_reg[i]>>2)&1) printf("edx ");
10026     if((needed_reg[i]>>3)&1) printf("ebx ");
10027     if((needed_reg[i]>>5)&1) printf("ebp ");
10028     if((needed_reg[i]>>6)&1) printf("esi ");
10029     if((needed_reg[i]>>7)&1) printf("edi ");
10030     printf("r:");
10031     for(r=0;r<=CCREG;r++) {
10032       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10033       if((requires_32bit[i]>>r)&1) {
10034         if(r==CCREG) printf(" CC");
10035         else if(r==HIREG) printf(" HI");
10036         else if(r==LOREG) printf(" LO");
10037         else printf(" r%d",r);
10038       }
10039     }
10040     printf("\n");
10041     /*printf("pr:");
10042     for(r=0;r<=CCREG;r++) {
10043       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10044       if((pr32[i]>>r)&1) {
10045         if(r==CCREG) printf(" CC");
10046         else if(r==HIREG) printf(" HI");
10047         else if(r==LOREG) printf(" LO");
10048         else printf(" r%d",r);
10049       }
10050     }
10051     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10052     printf("\n");*/
10053     #if defined(__i386__) || defined(__x86_64__)
10054     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10055     printf("dirty: ");
10056     if(regs[i].wasdirty&1) printf("eax ");
10057     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10058     if((regs[i].wasdirty>>2)&1) printf("edx ");
10059     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10060     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10061     if((regs[i].wasdirty>>6)&1) printf("esi ");
10062     if((regs[i].wasdirty>>7)&1) printf("edi ");
10063     #endif
10064     #ifdef __arm__
10065     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10066     printf("dirty: ");
10067     if(regs[i].wasdirty&1) printf("r0 ");
10068     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10069     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10070     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10071     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10072     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10073     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10074     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10075     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10076     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10077     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10078     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10079     #endif
10080     printf("\n");
10081     disassemble_inst(i);
10082     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10083     #if defined(__i386__) || defined(__x86_64__)
10084     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10085     if(regs[i].dirty&1) printf("eax ");
10086     if((regs[i].dirty>>1)&1) printf("ecx ");
10087     if((regs[i].dirty>>2)&1) printf("edx ");
10088     if((regs[i].dirty>>3)&1) printf("ebx ");
10089     if((regs[i].dirty>>5)&1) printf("ebp ");
10090     if((regs[i].dirty>>6)&1) printf("esi ");
10091     if((regs[i].dirty>>7)&1) printf("edi ");
10092     #endif
10093     #ifdef __arm__
10094     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10095     if(regs[i].dirty&1) printf("r0 ");
10096     if((regs[i].dirty>>1)&1) printf("r1 ");
10097     if((regs[i].dirty>>2)&1) printf("r2 ");
10098     if((regs[i].dirty>>3)&1) printf("r3 ");
10099     if((regs[i].dirty>>4)&1) printf("r4 ");
10100     if((regs[i].dirty>>5)&1) printf("r5 ");
10101     if((regs[i].dirty>>6)&1) printf("r6 ");
10102     if((regs[i].dirty>>7)&1) printf("r7 ");
10103     if((regs[i].dirty>>8)&1) printf("r8 ");
10104     if((regs[i].dirty>>9)&1) printf("r9 ");
10105     if((regs[i].dirty>>10)&1) printf("r10 ");
10106     if((regs[i].dirty>>12)&1) printf("r12 ");
10107     #endif
10108     printf("\n");
10109     if(regs[i].isconst) {
10110       printf("constants: ");
10111       #if defined(__i386__) || defined(__x86_64__)
10112       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10113       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10114       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10115       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10116       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10117       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10118       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10119       #endif
10120       #ifdef __arm__
10121       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10122       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10123       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10124       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10125       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10126       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10127       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10128       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10129       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10130       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10131       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10132       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10133       #endif
10134       printf("\n");
10135     }
10136 #ifndef FORCE32
10137     printf(" 32:");
10138     for(r=0;r<=CCREG;r++) {
10139       if((regs[i].is32>>r)&1) {
10140         if(r==CCREG) printf(" CC");
10141         else if(r==HIREG) printf(" HI");
10142         else if(r==LOREG) printf(" LO");
10143         else printf(" r%d",r);
10144       }
10145     }
10146     printf("\n");
10147 #endif
10148     /*printf(" p32:");
10149     for(r=0;r<=CCREG;r++) {
10150       if((p32[i]>>r)&1) {
10151         if(r==CCREG) printf(" CC");
10152         else if(r==HIREG) printf(" HI");
10153         else if(r==LOREG) printf(" LO");
10154         else printf(" r%d",r);
10155       }
10156     }
10157     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10158     else printf("\n");*/
10159     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10160       #if defined(__i386__) || defined(__x86_64__)
10161       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10162       if(branch_regs[i].dirty&1) printf("eax ");
10163       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10164       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10165       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10166       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10167       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10168       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10169       #endif
10170       #ifdef __arm__
10171       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10172       if(branch_regs[i].dirty&1) printf("r0 ");
10173       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10174       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10175       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10176       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10177       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10178       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10179       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10180       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10181       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10182       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10183       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10184       #endif
10185 #ifndef FORCE32
10186       printf(" 32:");
10187       for(r=0;r<=CCREG;r++) {
10188         if((branch_regs[i].is32>>r)&1) {
10189           if(r==CCREG) printf(" CC");
10190           else if(r==HIREG) printf(" HI");
10191           else if(r==LOREG) printf(" LO");
10192           else printf(" r%d",r);
10193         }
10194       }
10195       printf("\n");
10196 #endif
10197     }
10198   }
10199
10200   /* Pass 8 - Assembly */
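  // For each instruction: write back or invalidate stale host registers,
  // record the branch-target entry point in instr_addr[], load the guest
  // registers and constants that are needed, then dispatch to the
  // per-instruction-type assembler in the switch below.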
10201   linkcount=0;stubcount=0;
10202   ds=0;is_delayslot=0;
10203   cop1_usable=0;
10204   uint64_t is32_pre=0;
10205   u_int dirty_pre=0;
10206   u_int beginning=(u_int)out;
10207   if((u_int)addr&1) {
10208     ds=1;
10209     pagespan_ds();
10210   }
10211   for(i=0;i<slen;i++)
10212   {
10213     //if(ds) printf("ds: ");
10214     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10215     if(ds) {
10216       ds=0; // Skip delay slot
10217       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10218       instr_addr[i]=0;
10219     } else {
10220       #ifndef DESTRUCTIVE_WRITEBACK
10221       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10222       {
10223         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10224               unneeded_reg[i],unneeded_reg_upper[i]);
10225         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10226               unneeded_reg[i],unneeded_reg_upper[i]);
10227       }
10228       is32_pre=regs[i].is32;
10229       dirty_pre=regs[i].dirty;
10230       #endif
10231       // write back
10232       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10233       {
10234         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10235                       unneeded_reg[i],unneeded_reg_upper[i]);
10236         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10237       }
10238       // branch target entry point
10239       instr_addr[i]=(u_int)out;
10240       assem_debug("<->\n");
10241       // load regs
10242       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10243         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10244       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10245       address_generation(i,&regs[i],regs[i].regmap_entry);
10246       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10247       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10248       {
10249         // Load the delay slot registers if necessary
10250         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10251           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10252         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10253           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10254         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39)
10255           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10256       }
10257       else if(i+1<slen)
10258       {
10259         // Preload registers for following instruction
10260         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10261           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10262             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10263         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10264           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10265             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10266       }
10267       // TODO: if(is_ooo(i)) address_generation(i+1);
10268       if(itype[i]==CJUMP||itype[i]==FJUMP)
10269         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10270       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39)
10271         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10272       if(bt[i]) cop1_usable=0;
10273       // assemble
10274       switch(itype[i]) {
10275         case ALU:
10276           alu_assemble(i,&regs[i]);break;
10277         case IMM16:
10278           imm16_assemble(i,&regs[i]);break;
10279         case SHIFT:
10280           shift_assemble(i,&regs[i]);break;
10281         case SHIFTIMM:
10282           shiftimm_assemble(i,&regs[i]);break;
10283         case LOAD:
10284           load_assemble(i,&regs[i]);break;
10285         case LOADLR:
10286           loadlr_assemble(i,&regs[i]);break;
10287         case STORE:
10288           store_assemble(i,&regs[i]);break;
10289         case STORELR:
10290           storelr_assemble(i,&regs[i]);break;
10291         case COP0:
10292           cop0_assemble(i,&regs[i]);break;
10293         case COP1:
10294           cop1_assemble(i,&regs[i]);break;
10295         case C1LS:
10296           c1ls_assemble(i,&regs[i]);break;
10297         case FCONV:
10298           fconv_assemble(i,&regs[i]);break;
10299         case FLOAT:
10300           float_assemble(i,&regs[i]);break;
10301         case FCOMP:
10302           fcomp_assemble(i,&regs[i]);break;
10303         case MULTDIV:
10304           multdiv_assemble(i,&regs[i]);break;
10305         case MOV:
10306           mov_assemble(i,&regs[i]);break;
10307         case SYSCALL:
10308           syscall_assemble(i,&regs[i]);break;
10309         case UJUMP:
10310           ujump_assemble(i,&regs[i]);ds=1;break;
10311         case RJUMP:
10312           rjump_assemble(i,&regs[i]);ds=1;break;
10313         case CJUMP:
10314           cjump_assemble(i,&regs[i]);ds=1;break;
10315         case SJUMP:
10316           sjump_assemble(i,&regs[i]);ds=1;break;
10317         case FJUMP:
10318           fjump_assemble(i,&regs[i]);ds=1;break;
10319         case SPAN:
10320           pagespan_assemble(i,&regs[i]);break;
10321       }
10322       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10323         literal_pool(1024);
10324       else
10325         literal_pool_jumpover(256);
10326     }
10327   }
10328   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10329   // If the block did not end with an unconditional branch,
10330   // add a jump to the next instruction.
10331   if(i>1) {
10332     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10333       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10334       assert(i==slen);
10335       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10336         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10337         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10338           emit_loadreg(CCREG,HOST_CCREG);
10339         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10340       }
10341       else if(!likely[i-2])
10342       {
10343         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10344         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10345       }
10346       else
10347       {
10348         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10349         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10350       }
10351       add_to_linker((int)out,start+i*4,0);
10352       emit_jmp(0);
10353     }
10354   }
10355   else
10356   {
10357     assert(i>0);
10358     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10359     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10360     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10361       emit_loadreg(CCREG,HOST_CCREG);
10362     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10363     add_to_linker((int)out,start+i*4,0);
10364     emit_jmp(0);
10365   }
10366
10367   // TODO: delay slot stubs?
10368   // Stubs
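  // Emit the out-of-line slow paths recorded during assembly: memory
  // access handlers, cycle-count/interrupt checks, invalidation checks,
  // coprocessor-unusable traps and unaligned store handlers.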
10369   for(i=0;i<stubcount;i++)
10370   {
10371     switch(stubs[i][0])
10372     {
10373       case LOADB_STUB:
10374       case LOADH_STUB:
10375       case LOADW_STUB:
10376       case LOADD_STUB:
10377       case LOADBU_STUB:
10378       case LOADHU_STUB:
10379         do_readstub(i);break;
10380       case STOREB_STUB:
10381       case STOREH_STUB:
10382       case STOREW_STUB:
10383       case STORED_STUB:
10384         do_writestub(i);break;
10385       case CC_STUB:
10386         do_ccstub(i);break;
10387       case INVCODE_STUB:
10388         do_invstub(i);break;
10389       case FP_STUB:
10390         do_cop1stub(i);break;
10391       case STORELR_STUB:
10392         do_unalignedwritestub(i);break;
10393     }
10394   }
10395
10396   /* Pass 9 - Linker */
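  // Resolve the branches recorded with add_to_linker(): targets outside the
  // block get an exit stub via emit_extjump() (and a direct link if the
  // target is already compiled), while internal targets are patched
  // straight to instr_addr[].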
10397   for(i=0;i<linkcount;i++)
10398   {
10399     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10400     literal_pool(64);
10401     if(!link_addr[i][2])
10402     {
10403       void *stub=out;
10404       void *addr=check_addr(link_addr[i][1]);
10405       emit_extjump(link_addr[i][0],link_addr[i][1]);
10406       if(addr) {
10407         set_jump_target(link_addr[i][0],(int)addr);
10408         add_link(link_addr[i][1],stub);
10409       }
10410       else set_jump_target(link_addr[i][0],(int)stub);
10411     }
10412     else
10413     {
10414       // Internal branch
10415       int target=(link_addr[i][1]-start)>>2;
10416       assert(target>=0&&target<slen);
10417       assert(instr_addr[target]);
10418       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10419       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10420       //#else
10421       set_jump_target(link_addr[i][0],instr_addr[target]);
10422       //#endif
10423     }
10424   }
10425   // External Branch Targets (jump_in)
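  // Register every branch-target address of this block in jump_in and
  // jump_dirty through a dirty stub, and refresh any matching hash table
  // entry so future lookups reach the newly compiled code.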
10426   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10427   for(i=0;i<slen;i++)
10428   {
10429     if(bt[i]||i==0)
10430     {
10431       if(instr_addr[i]) // TODO - delay slots (=null)
10432       {
10433         u_int vaddr=start+i*4;
10434         u_int page=get_page(vaddr);
10435         u_int vpage=get_vpage(vaddr);
10436         literal_pool(256);
10437         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10438         if(!requires_32bit[i])
10439         {
10440           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10441           assem_debug("jump_in: %x\n",start+i*4);
10442           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10443           int entry_point=do_dirty_stub(i);
10444           ll_add(jump_in+page,vaddr,(void *)entry_point);
10445           // If there was an existing entry in the hash table,
10446           // replace it with the new address.
10447           // Don't add new entries.  We'll insert the
10448           // ones that actually get used in check_addr().
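                // Each hash_table bin caches two (vaddr, entry point) pairs:
                // slots [0]/[1] and [2]/[3].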
10449           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10450           if(ht_bin[0]==vaddr) {
10451             ht_bin[1]=entry_point;
10452           }
10453           if(ht_bin[2]==vaddr) {
10454             ht_bin[3]=entry_point;
10455           }
10456         }
10457         else
10458         {
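                // Fold the 64-bit requires_32bit mask into a u_int: the low
                // word, with bit 0 also set if any of the upper 32 bits is set,
                // presumably so a requirement only in the upper half still
                // produces a nonzero restriction value.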
10459           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10460           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10461           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10462           //int entry_point=(int)out;
10463           ////assem_debug("entry_point: %x\n",entry_point);
10464           //load_regs_entry(i);
10465           //if(entry_point==(int)out)
10466           //  entry_point=instr_addr[i];
10467           //else
10468           //  emit_jmp(instr_addr[i]);
10469           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10470           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10471           int entry_point=do_dirty_stub(i);
10472           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10473         }
10474       }
10475     }
10476   }
10477   // Write out the literal pool if necessary
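        // (On ARM, constants referenced by the emitted code accumulate in
        // literal pools that must stay within load-instruction range; passing
        // 0 dumps any pending constants unconditionally.)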
10478   literal_pool(0);
10479   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10480   // Pad so the output is 8-byte aligned (Cortex-A8 branch predictor workaround)
10481   if(((u_int)out)&7) emit_addnop(13);
10482   #endif
10483   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10484   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
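        // Keep a copy of the original MIPS code in the shadow buffer so the
        // dirty-check stubs emitted above can later verify that the source
        // has not been overwritten before re-entering this block.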
10485   memcpy(copy,source,slen*4);
10486   copy+=slen*4;
10487   
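        // The freshly written code sits in the data cache; flush it so the
        // instruction cache sees it before we ever jump into this block.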
10488   #ifdef __arm__
10489   __clear_cache((void *)beginning,out);
10490   #endif
10491   
10492   // If we're within 256K (MAX_OUTPUT_BLOCK_SIZE) of the end of the translation
10493   // cache, start over from the beginning. (Is 256K enough?)
10494   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10495   
10496   // Trap writes to any of the pages we compiled
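        // Clearing invalid_code marks each page as containing compiled code;
        // with the TLB path enabled, the extra bit set in memory_map appears
        // to act as a write-protect flag so that stores to these pages go
        // through the invalidation path instead of writing directly.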
10497   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10498     invalid_code[i]=0;
10499 #ifndef DISABLE_TLB
10500     memory_map[i]|=0x40000000;
10501     if((signed int)start>=(signed int)0xC0000000) {
10502       assert(using_tlb);
10503       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
10504       invalid_code[j]=0;
10505       memory_map[j]|=0x40000000;
10506       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
10507     }
10508 #endif
10509   }
10510   
10511   /* Pass 10 - Free memory by expiring oldest blocks */
10512   
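        // The translation cache is treated as a circular buffer. expirep is a
        // 16-bit phase counter that sweeps the region roughly a quarter of the
        // cache ahead of the current output pointer (hence the +16384 below),
        // clearing the lookup structures for blocks that are about to be
        // overwritten.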
10513   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10514   while(expirep!=end)
10515   {
10516     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10517     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
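          // expirep decodes as: bits 13-15 select one of the 8 cache slices,
          // bits 11-12 select which structure to clean (the switch below),
          // and bits 0-10 walk the pages (or groups of 32 hash bins) of that
          // structure.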
10518     inv_debug("EXP: Phase %d\n",expirep);
10519     switch((expirep>>11)&3)
10520     {
10521       case 0:
10522         // Clear jump_in and jump_dirty
10523         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10524         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10525         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10526         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10527         break;
10528       case 1:
10529         // Unlink jumps from other blocks that point into the expiring region
10530         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10531         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10532         break;
10533       case 2:
10534         // Clear hash table
10535         for(i=0;i<32;i++) {
10536           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10537           if((ht_bin[3]>>shift)==(base>>shift) ||
10538              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10539             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10540             ht_bin[2]=ht_bin[3]=-1;
10541           }
10542           if((ht_bin[1]>>shift)==(base>>shift) ||
10543              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10544             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10545             ht_bin[0]=ht_bin[2];
10546             ht_bin[1]=ht_bin[3];
10547             ht_bin[2]=ht_bin[3]=-1;
10548           }
10549         }
10550         break;
10551       case 3:
10552         // Clear jump_out
10553         #ifdef __arm__
10554         if((expirep&2047)==0)
10555           __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
10556         #endif
10557         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10558         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10559         break;
10560     }
10561     expirep=(expirep+1)&65535;
10562   }
10563   return 0;
10564 }