/* libpcsxcore/new_dynarec/new_dynarec.c */
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> // for uint64_t
23 #include <assert.h>
24 #include <sys/mman.h>
25
26 #include "emu_if.h" //emulator interface
27
28 //#define DISASM
29 //#define assem_debug printf
30 //#define inv_debug printf
31 #define assem_debug(...)
32 #define inv_debug(...)
33
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43
44 #define MAXBLOCK 4096
45 #define MAX_OUTPUT_BLOCK_SIZE 262144
46 #define CLOCK_DIVIDER 2
47
48 struct regstat
49 {
50   signed char regmap_entry[HOST_REGS];
51   signed char regmap[HOST_REGS];
52   uint64_t was32;
53   uint64_t is32;
54   uint64_t wasdirty;
55   uint64_t dirty;
56   uint64_t u;
57   uint64_t uu;
58   u_int wasconst;
59   u_int isconst;
60   uint64_t constmap[HOST_REGS];
61 };
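/* Illustrative sketch, not part of the original source: how the per-host-register
   bitfields above are read.  regmap[hr] names the guest register held in host
   register hr (-1 = unmapped), and bit hr of 'dirty' means that value has not
   yet been written back to the guest register file. */
__attribute__((unused)) static int host_reg_is_dirty(const struct regstat *rs, int hr)
{
  return rs->regmap[hr] >= 0 && ((rs->dirty >> hr) & 1);
}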
62
63 struct ll_entry
64 {
65   u_int vaddr;
66   u_int reg32;
67   void *addr;
68   struct ll_entry *next;
69 };
70
71   u_int start;
72   u_int *source;
73   u_int pagelimit;
74   char insn[MAXBLOCK][10];
75   u_char itype[MAXBLOCK];
76   u_char opcode[MAXBLOCK];
77   u_char opcode2[MAXBLOCK];
78   u_char bt[MAXBLOCK];
79   u_char rs1[MAXBLOCK];
80   u_char rs2[MAXBLOCK];
81   u_char rt1[MAXBLOCK];
82   u_char rt2[MAXBLOCK];
83   u_char us1[MAXBLOCK];
84   u_char us2[MAXBLOCK];
85   u_char dep1[MAXBLOCK];
86   u_char dep2[MAXBLOCK];
87   u_char lt1[MAXBLOCK];
88   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
89   static uint64_t gte_rt[MAXBLOCK];
90   static uint64_t gte_unneeded[MAXBLOCK];
91   static int gte_reads_flags; // gte flag read encountered
92   int imm[MAXBLOCK];
93   u_int ba[MAXBLOCK];
94   char likely[MAXBLOCK];
95   char is_ds[MAXBLOCK];
96   char ooo[MAXBLOCK];
97   uint64_t unneeded_reg[MAXBLOCK];
98   uint64_t unneeded_reg_upper[MAXBLOCK];
99   uint64_t branch_unneeded_reg[MAXBLOCK];
100   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
101   uint64_t p32[MAXBLOCK];
102   uint64_t pr32[MAXBLOCK];
103   signed char regmap_pre[MAXBLOCK][HOST_REGS];
104   signed char regmap[MAXBLOCK][HOST_REGS];
105   signed char regmap_entry[MAXBLOCK][HOST_REGS];
106   uint64_t constmap[MAXBLOCK][HOST_REGS];
107   struct regstat regs[MAXBLOCK];
108   struct regstat branch_regs[MAXBLOCK];
109   signed char minimum_free_regs[MAXBLOCK];
110   u_int needed_reg[MAXBLOCK];
111   uint64_t requires_32bit[MAXBLOCK];
112   u_int wont_dirty[MAXBLOCK];
113   u_int will_dirty[MAXBLOCK];
114   int ccadj[MAXBLOCK];
115   int slen;
116   u_int instr_addr[MAXBLOCK];
117   u_int link_addr[MAXBLOCK][3];
118   int linkcount;
119   u_int stubs[MAXBLOCK*3][8];
120   int stubcount;
121   u_int literals[1024][2];
122   int literalcount;
123   int is_delayslot;
124   int cop1_usable;
125   u_char *out;
126   struct ll_entry *jump_in[4096];
127   struct ll_entry *jump_out[4096];
128   struct ll_entry *jump_dirty[4096];
129   u_int hash_table[65536][4]  __attribute__((aligned(16)));
130   char shadow[1048576]  __attribute__((aligned(16)));
131   void *copy;
132   int expirep;
133 #ifndef PCSX
134   u_int using_tlb;
135 #else
136   static const u_int using_tlb=0;
137 #endif
138   static u_int sp_in_mirror;
139   int new_dynarec_did_compile;
140   u_int stop_after_jal;
141   extern u_char restore_candidate[512];
142   extern int cycle_count;
143
144   /* registers that may be allocated */
145   /* 1-31 gpr */
146 #define HIREG 32 // hi
147 #define LOREG 33 // lo
148 #define FSREG 34 // FPU status (FCSR)
149 #define CSREG 35 // Coprocessor status
150 #define CCREG 36 // Cycle count
151 #define INVCP 37 // Pointer to invalid_code
152 #define MMREG 38 // Pointer to memory_map
153 #define ROREG 39 // ram offset (if rdram!=0x80000000)
154 #define TEMPREG 40
155 #define FTEMP 40 // FPU temporary register
156 #define PTEMP 41 // Prefetch temporary register
157 #define TLREG 42 // TLB mapping offset
158 #define RHASH 43 // Return address hash
159 #define RHTBL 44 // Return address hash table address
160 #define RTEMP 45 // JR/JALR address register
161 #define MAXREG 45
162 #define AGEN1 46 // Address generation temporary register
163 #define AGEN2 47 // Address generation temporary register
164 #define MGEN1 48 // Maptable address generation temporary register
165 #define MGEN2 49 // Maptable address generation temporary register
166 #define BTREG 50 // Branch target temporary register
167
168   /* instruction types */
169 #define NOP 0     // No operation
170 #define LOAD 1    // Load
171 #define STORE 2   // Store
172 #define LOADLR 3  // Unaligned load
173 #define STORELR 4 // Unaligned store
174 #define MOV 5     // Move 
175 #define ALU 6     // Arithmetic/logic
176 #define MULTDIV 7 // Multiply/divide
177 #define SHIFT 8   // Shift by register
178 #define SHIFTIMM 9 // Shift by immediate
179 #define IMM16 10  // 16-bit immediate
180 #define RJUMP 11  // Unconditional jump to register
181 #define UJUMP 12  // Unconditional jump
182 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
183 #define SJUMP 14  // Conditional branch (regimm format)
184 #define COP0 15   // Coprocessor 0
185 #define COP1 16   // Coprocessor 1
186 #define C1LS 17   // Coprocessor 1 load/store
187 #define FJUMP 18  // Conditional branch (floating point)
188 #define FLOAT 19  // Floating point unit
189 #define FCONV 20  // Convert integer to float
190 #define FCOMP 21  // Floating point compare (sets FSREG)
191 #define SYSCALL 22 // SYSCALL
192 #define OTHER 23  // Other
193 #define SPAN 24   // Branch/delay slot spans 2 pages
194 #define NI 25     // Not implemented
195 #define HLECALL 26 // PCSX fake opcodes for HLE
196 #define COP2 27   // Coprocessor 2 move
197 #define C2LS 28   // Coprocessor 2 load/store
198 #define C2OP 29   // Coprocessor 2 operation
199 #define INTCALL 30 // Call interpreter to handle rare corner cases
200
201   /* stubs */
202 #define CC_STUB 1
203 #define FP_STUB 2
204 #define LOADB_STUB 3
205 #define LOADH_STUB 4
206 #define LOADW_STUB 5
207 #define LOADD_STUB 6
208 #define LOADBU_STUB 7
209 #define LOADHU_STUB 8
210 #define STOREB_STUB 9
211 #define STOREH_STUB 10
212 #define STOREW_STUB 11
213 #define STORED_STUB 12
214 #define STORELR_STUB 13
215 #define INVCODE_STUB 14
216
217   /* branch codes */
218 #define TAKEN 1
219 #define NOTTAKEN 2
220 #define NULLDS 3
221
222 // asm linkage
223 int new_recompile_block(int addr);
224 void *get_addr_ht(u_int vaddr);
225 void invalidate_block(u_int block);
226 void invalidate_addr(u_int addr);
227 void remove_hash(int vaddr);
228 void jump_vaddr();
229 void dyna_linker();
230 void dyna_linker_ds();
231 void verify_code();
232 void verify_code_vm();
233 void verify_code_ds();
234 void cc_interrupt();
235 void fp_exception();
236 void fp_exception_ds();
237 void jump_syscall();
238 void jump_syscall_hle();
239 void jump_eret();
240 void jump_hlecall();
241 void jump_intcall();
242 void new_dyna_leave();
243
244 // TLB
245 void TLBWI_new();
246 void TLBWR_new();
247 void read_nomem_new();
248 void read_nomemb_new();
249 void read_nomemh_new();
250 void read_nomemd_new();
251 void write_nomem_new();
252 void write_nomemb_new();
253 void write_nomemh_new();
254 void write_nomemd_new();
255 void write_rdram_new();
256 void write_rdramb_new();
257 void write_rdramh_new();
258 void write_rdramd_new();
259 extern u_int memory_map[1048576];
260
261 // Needed by assembler
262 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
263 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
264 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
265 void load_all_regs(signed char i_regmap[]);
266 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
267 void load_regs_entry(int t);
268 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
269
270 int tracedebug=0;
271
272 //#define DEBUG_CYCLE_COUNT 1
273
274 static void tlb_hacks()
275 {
276 #ifndef DISABLE_TLB
277   // Goldeneye hack
278   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
279   {
280     u_int addr;
281     int n;
282     switch (ROM_HEADER->Country_code&0xFF) 
283     {
284       case 0x45: // U
285         addr=0x34b30;
286         break;                   
287       case 0x4A: // J 
288         addr=0x34b70;    
289         break;    
290       case 0x50: // E 
291         addr=0x329f0;
292         break;                        
293       default: 
294         // Unknown country code
295         addr=0;
296         break;
297     }
298     u_int rom_addr=(u_int)rom;
299     #ifdef ROM_COPY
300     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
301     // in the lower 4G of memory to use this hack.  Copy it if necessary.
302     if((void *)rom>(void *)0xffffffff) {
303       munmap(ROM_COPY, 67108864);
304       if(mmap(ROM_COPY, 12582912,
305               PROT_READ | PROT_WRITE,
306               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
307             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
308       memcpy(ROM_COPY,rom,12582912);
309       rom_addr=(u_int)ROM_COPY;
310     }
311     #endif
312     if(addr) {
313       for(n=0x7F000;n<0x80000;n++) {
314         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
315       }
316     }
317   }
318 #endif
319 }
320
321 static u_int get_page(u_int vaddr)
322 {
323 #ifndef PCSX
324   u_int page=(vaddr^0x80000000)>>12;
325 #else
326   u_int page=vaddr&~0xe0000000;
327   if (page < 0x1000000)
328     page &= ~0x0e00000; // RAM mirrors
329   page>>=12;
330 #endif
331 #ifndef DISABLE_TLB
332   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
333 #endif
334   if(page>2048) page=2048+(page&2047);
335   return page;
336 }
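/* Illustrative example, not part of the original source, assuming the PCSX
   build: the KUSEG/KSEG0/KSEG1 mirrors of the same RAM word fold to the same
   page, and everything above the 2048 RAM pages is hashed into 2048..4095. */
#ifdef PCSX
__attribute__((unused)) static void get_page_example(void)
{
  assert(get_page(0x00030000) == get_page(0x80030000)); // KUSEG vs KSEG0
  assert(get_page(0x80030000) == get_page(0xa0030000)); // KSEG0 vs KSEG1 (uncached)
  assert(get_page(0x80030000) == 0x30);                 // RAM page 0x30
  assert(get_page(0xbfc00000) >= 2048);                 // BIOS falls in the hashed range
}
#endif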
337
338 static u_int get_vpage(u_int vaddr)
339 {
340   u_int vpage=(vaddr^0x80000000)>>12;
341 #ifndef DISABLE_TLB
342   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
343 #endif
344   if(vpage>2048) vpage=2048+(vpage&2047);
345   return vpage;
346 }
347
348 // Get address from virtual address
349 // This is called from the recompiled JR/JALR instructions
350 void *get_addr(u_int vaddr)
351 {
352   u_int page=get_page(vaddr);
353   u_int vpage=get_vpage(vaddr);
354   struct ll_entry *head;
355   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
356   head=jump_in[page];
357   while(head!=NULL) {
358     if(head->vaddr==vaddr&&head->reg32==0) {
359   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
360       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
361       ht_bin[3]=ht_bin[1];
362       ht_bin[2]=ht_bin[0];
363       ht_bin[1]=(int)head->addr;
364       ht_bin[0]=vaddr;
365       return head->addr;
366     }
367     head=head->next;
368   }
369   head=jump_dirty[vpage];
370   while(head!=NULL) {
371     if(head->vaddr==vaddr&&head->reg32==0) {
372       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
373       // Don't restore blocks which are about to expire from the cache
374       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
375       if(verify_dirty(head->addr)) {
376         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
377         invalid_code[vaddr>>12]=0;
378         inv_code_start=inv_code_end=~0;
379         memory_map[vaddr>>12]|=0x40000000;
380         if(vpage<2048) {
381 #ifndef DISABLE_TLB
382           if(tlb_LUT_r[vaddr>>12]) {
383             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
384             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
385           }
386 #endif
387           restore_candidate[vpage>>3]|=1<<(vpage&7);
388         }
389         else restore_candidate[page>>3]|=1<<(page&7);
390         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
391         if(ht_bin[0]==vaddr) {
392           ht_bin[1]=(int)head->addr; // Replace existing entry
393         }
394         else
395         {
396           ht_bin[3]=ht_bin[1];
397           ht_bin[2]=ht_bin[0];
398           ht_bin[1]=(int)head->addr;
399           ht_bin[0]=vaddr;
400         }
401         return head->addr;
402       }
403     }
404     head=head->next;
405   }
406   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
407   int r=new_recompile_block(vaddr);
408   if(r==0) return get_addr(vaddr);
409   // Execute in unmapped page, generate page fault exception
410   Status|=2;
411   Cause=(vaddr<<31)|0x8;
412   EPC=(vaddr&1)?vaddr-5:vaddr;
413   BadVAddr=(vaddr&~1);
414   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
415   EntryHi=BadVAddr&0xFFFFE000;
416   return get_addr_ht(0x80000000);
417 }
418 // Look up address in hash table first
419 void *get_addr_ht(u_int vaddr)
420 {
421   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
422   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
423   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
424   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
425   return get_addr(vaddr);
426 }
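/* Illustrative sketch, not part of the original source: each hash_table bin
   holds two (vaddr, compiled-code address) pairs, and get_addr() above inserts
   new matches at the front of the bin, demoting the previous front entry, so
   the most recently used block is probed first.  A 32-bit host is assumed, as
   it is elsewhere in this file. */
__attribute__((unused)) static void ht_insert_front_sketch(u_int vaddr, void *addr)
{
  u_int *ht_bin = hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  ht_bin[3] = ht_bin[1]; // demote the old front entry to the second slot
  ht_bin[2] = ht_bin[0];
  ht_bin[1] = (u_int)addr;
  ht_bin[0] = vaddr;
}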
427
428 void *get_addr_32(u_int vaddr,u_int flags)
429 {
430 #ifdef FORCE32
431   return get_addr(vaddr);
432 #else
433   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
434   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
435   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
436   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
437   u_int page=get_page(vaddr);
438   u_int vpage=get_vpage(vaddr);
439   struct ll_entry *head;
440   head=jump_in[page];
441   while(head!=NULL) {
442     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
443       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
444       if(head->reg32==0) {
445         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
446         if(ht_bin[0]==-1) {
447           ht_bin[1]=(int)head->addr;
448           ht_bin[0]=vaddr;
449         }else if(ht_bin[2]==-1) {
450           ht_bin[3]=(int)head->addr;
451           ht_bin[2]=vaddr;
452         }
453         //ht_bin[3]=ht_bin[1];
454         //ht_bin[2]=ht_bin[0];
455         //ht_bin[1]=(int)head->addr;
456         //ht_bin[0]=vaddr;
457       }
458       return head->addr;
459     }
460     head=head->next;
461   }
462   head=jump_dirty[vpage];
463   while(head!=NULL) {
464     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
465       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
466       // Don't restore blocks which are about to expire from the cache
467       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
468       if(verify_dirty(head->addr)) {
469         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
470         invalid_code[vaddr>>12]=0;
471         inv_code_start=inv_code_end=~0;
472         memory_map[vaddr>>12]|=0x40000000;
473         if(vpage<2048) {
474 #ifndef DISABLE_TLB
475           if(tlb_LUT_r[vaddr>>12]) {
476             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
477             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
478           }
479 #endif
480           restore_candidate[vpage>>3]|=1<<(vpage&7);
481         }
482         else restore_candidate[page>>3]|=1<<(page&7);
483         if(head->reg32==0) {
484           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
485           if(ht_bin[0]==-1) {
486             ht_bin[1]=(int)head->addr;
487             ht_bin[0]=vaddr;
488           }else if(ht_bin[2]==-1) {
489             ht_bin[3]=(int)head->addr;
490             ht_bin[2]=vaddr;
491           }
492           //ht_bin[3]=ht_bin[1];
493           //ht_bin[2]=ht_bin[0];
494           //ht_bin[1]=(int)head->addr;
495           //ht_bin[0]=vaddr;
496         }
497         return head->addr;
498       }
499     }
500     head=head->next;
501   }
502   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
503   int r=new_recompile_block(vaddr);
504   if(r==0) return get_addr(vaddr);
505   // Execute in unmapped page, generate page fault exception
506   Status|=2;
507   Cause=(vaddr<<31)|0x8;
508   EPC=(vaddr&1)?vaddr-5:vaddr;
509   BadVAddr=(vaddr&~1);
510   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
511   EntryHi=BadVAddr&0xFFFFE000;
512   return get_addr_ht(0x80000000);
513 #endif
514 }
515
516 void clear_all_regs(signed char regmap[])
517 {
518   int hr;
519   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
520 }
521
522 signed char get_reg(signed char regmap[],int r)
523 {
524   int hr;
525   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
526   return -1;
527 }
528
529 // Find a register that is available for two consecutive cycles
530 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
531 {
532   int hr;
533   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
534   return -1;
535 }
536
537 int count_free_regs(signed char regmap[])
538 {
539   int count=0;
540   int hr;
541   for(hr=0;hr<HOST_REGS;hr++)
542   {
543     if(hr!=EXCLUDE_REG) {
544       if(regmap[hr]<0) count++;
545     }
546   }
547   return count;
548 }
549
550 void dirty_reg(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if((cur->regmap[hr]&63)==reg) {
556       cur->dirty|=1<<hr;
557     }
558   }
559 }
560
561 // If we dirty the lower half of a 64 bit register which is now being
562 // sign-extended, we need to dump the upper half.
563 // Note: Do this only after completion of the instruction, because
564 // some instructions may need to read the full 64-bit value even if
565 // overwriting it (eg SLTI, DSRA32).
566 static void flush_dirty_uppers(struct regstat *cur)
567 {
568   int hr,reg;
569   for (hr=0;hr<HOST_REGS;hr++) {
570     if((cur->dirty>>hr)&1) {
571       reg=cur->regmap[hr];
572       if(reg>=64) 
573         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
574     }
575   }
576 }
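/* Illustrative sketch, not part of the original source: regmap values 1..63
   name the low 32 bits of a guest register and the value r|64 names its upper
   half, which is why flush_dirty_uppers() checks reg>=64 above.  Bit r of is32
   set means register r currently holds a sign-extended 32-bit value, so its
   upper half need not stay mapped. */
__attribute__((unused)) static int upper_half_host_reg(struct regstat *rs, int r)
{
  return get_reg(rs->regmap, r|64); // host reg holding the upper half, or -1
}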
577
578 void set_const(struct regstat *cur,signed char reg,uint64_t value)
579 {
580   int hr;
581   if(!reg) return;
582   for (hr=0;hr<HOST_REGS;hr++) {
583     if(cur->regmap[hr]==reg) {
584       cur->isconst|=1<<hr;
585       cur->constmap[hr]=value;
586     }
587     else if((cur->regmap[hr]^64)==reg) {
588       cur->isconst|=1<<hr;
589       cur->constmap[hr]=value>>32;
590     }
591   }
592 }
593
594 void clear_const(struct regstat *cur,signed char reg)
595 {
596   int hr;
597   if(!reg) return;
598   for (hr=0;hr<HOST_REGS;hr++) {
599     if((cur->regmap[hr]&63)==reg) {
600       cur->isconst&=~(1<<hr);
601     }
602   }
603 }
604
605 int is_const(struct regstat *cur,signed char reg)
606 {
607   int hr;
608   if(reg<0) return 0;
609   if(!reg) return 1;
610   for (hr=0;hr<HOST_REGS;hr++) {
611     if((cur->regmap[hr]&63)==reg) {
612       return (cur->isconst>>hr)&1;
613     }
614   }
615   return 0;
616 }
617 uint64_t get_const(struct regstat *cur,signed char reg)
618 {
619   int hr;
620   if(!reg) return 0;
621   for (hr=0;hr<HOST_REGS;hr++) {
622     if(cur->regmap[hr]==reg) {
623       return cur->constmap[hr];
624     }
625   }
626   printf("Unknown constant in r%d\n",reg);
627   exit(1);
628 }
629
630 // Least soon needed registers
631 // Look at the next ten instructions and see which registers
632 // will be used.  Try not to reallocate these.
633 void lsn(u_char hsn[], int i, int *preferred_reg)
634 {
635   int j;
636   int b=-1;
637   for(j=0;j<9;j++)
638   {
639     if(i+j>=slen) {
640       j=slen-i-1;
641       break;
642     }
643     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
644     {
645       // Don't go past an unconditional jump
646       j++;
647       break;
648     }
649   }
650   for(;j>=0;j--)
651   {
652     if(rs1[i+j]) hsn[rs1[i+j]]=j;
653     if(rs2[i+j]) hsn[rs2[i+j]]=j;
654     if(rt1[i+j]) hsn[rt1[i+j]]=j;
655     if(rt2[i+j]) hsn[rt2[i+j]]=j;
656     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
657       // Stores can allocate zero
658       hsn[rs1[i+j]]=j;
659       hsn[rs2[i+j]]=j;
660     }
661     // On some architectures stores need invc_ptr
662     #if defined(HOST_IMM8)
663     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
664       hsn[INVCP]=j;
665     }
666     #endif
667     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
668     {
669       hsn[CCREG]=j;
670       b=j;
671     }
672   }
673   if(b>=0)
674   {
675     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
676     {
677       // Follow first branch
678       int t=(ba[i+b]-start)>>2;
679       j=7-b;if(t+j>=slen) j=slen-t-1;
680       for(;j>=0;j--)
681       {
682         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
683         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
684         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
685         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
686       }
687     }
688     // TODO: preferred register based on backward branch
689   }
690   // Delay slot should preferably not overwrite branch conditions or cycle count
691   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
692     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
693     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
694     hsn[CCREG]=1;
695     // ...or hash tables
696     hsn[RHASH]=1;
697     hsn[RHTBL]=1;
698   }
699   // Coprocessor load/store needs FTEMP, even if not declared
700   if(itype[i]==C1LS||itype[i]==C2LS) {
701     hsn[FTEMP]=0;
702   }
703   // Load L/R also uses FTEMP as a temporary register
704   if(itype[i]==LOADLR) {
705     hsn[FTEMP]=0;
706   }
707   // Also SWL/SWR/SDL/SDR
708   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
709     hsn[FTEMP]=0;
710   }
711   // Don't remove the TLB registers either
712   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
713     hsn[TLREG]=0;
714   }
715   // Don't remove the miniht registers
716   if(itype[i]==UJUMP||itype[i]==RJUMP)
717   {
718     hsn[RHASH]=0;
719     hsn[RHTBL]=0;
720   }
721 }
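/* Hypothetical helper, not part of the original source: how the hsn[] ("how
   soon needed") values filled in above are meant to be consumed.  Smaller
   values mean the guest register is referenced sooner, so an allocator looking
   for a victim prefers the register with the largest value. */
__attribute__((unused)) static int pick_spill_candidate(u_char hsn[], int nregs)
{
  int r, best = -1, best_score = -1;
  for (r = 1; r < nregs; r++) { // skip r0
    if (hsn[r] > best_score) {
      best_score = hsn[r];
      best = r;
    }
  }
  return best; // guest register index, or -1 if nregs<=1
}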
722
723 // We only want to allocate registers if we're going to use them again soon
724 int needed_again(int r, int i)
725 {
726   int j;
727   int b=-1;
728   int rn=10;
729   
730   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
731   {
732     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
733       return 0; // Don't need any registers if exiting the block
734   }
735   for(j=0;j<9;j++)
736   {
737     if(i+j>=slen) {
738       j=slen-i-1;
739       break;
740     }
741     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
742     {
743       // Don't go past an unconditional jump
744       j++;
745       break;
746     }
747     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d)) // break (SPECIAL funct 0x0d)
748     {
749       break;
750     }
751   }
752   for(;j>=1;j--)
753   {
754     if(rs1[i+j]==r) rn=j;
755     if(rs2[i+j]==r) rn=j;
756     if((unneeded_reg[i+j]>>r)&1) rn=10;
757     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
758     {
759       b=j;
760     }
761   }
762   /*
763   if(b>=0)
764   {
765     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
766     {
767       // Follow first branch
768       int o=rn;
769       int t=(ba[i+b]-start)>>2;
770       j=7-b;if(t+j>=slen) j=slen-t-1;
771       for(;j>=0;j--)
772       {
773         if(!((unneeded_reg[t+j]>>r)&1)) {
774           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
775           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
776         }
777         else rn=o;
778       }
779     }
780   }*/
781   if(rn<10) return 1;
782   return 0;
783 }
784
785 // Try to match register allocations at the end of a loop with those
786 // at the beginning
787 int loop_reg(int i, int r, int hr)
788 {
789   int j,k;
790   for(j=0;j<9;j++)
791   {
792     if(i+j>=slen) {
793       j=slen-i-1;
794       break;
795     }
796     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
797     {
798       // Don't go past an unconditional jump
799       j++;
800       break;
801     }
802   }
803   k=0;
804   if(i>0){
805     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
806       k--;
807   }
808   for(;k<j;k++)
809   {
810     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
811     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
812     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
813     {
814       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
815       {
816         int t=(ba[i+k]-start)>>2;
817         int reg=get_reg(regs[t].regmap_entry,r);
818         if(reg>=0) return reg;
819         //reg=get_reg(regs[t+1].regmap_entry,r);
820         //if(reg>=0) return reg;
821       }
822     }
823   }
824   return hr;
825 }
826
827
828 // Allocate every register, preserving source/target regs
829 void alloc_all(struct regstat *cur,int i)
830 {
831   int hr;
832   
833   for(hr=0;hr<HOST_REGS;hr++) {
834     if(hr!=EXCLUDE_REG) {
835       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
836          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
837       {
838         cur->regmap[hr]=-1;
839         cur->dirty&=~(1<<hr);
840       }
841       // Don't need zeros
842       if((cur->regmap[hr]&63)==0)
843       {
844         cur->regmap[hr]=-1;
845         cur->dirty&=~(1<<hr);
846       }
847     }
848   }
849 }
850
851 #ifndef FORCE32
852 void div64(int64_t dividend,int64_t divisor)
853 {
854   lo=dividend/divisor;
855   hi=dividend%divisor;
856   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
857   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
858 }
859 void divu64(uint64_t dividend,uint64_t divisor)
860 {
861   lo=dividend/divisor;
862   hi=dividend%divisor;
863   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
864   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
865 }
866
867 void mult64(uint64_t m1,uint64_t m2)
868 {
869    unsigned long long int op1, op2, op3, op4;
870    unsigned long long int result1, result2, result3, result4;
871    unsigned long long int temp1, temp2, temp3, temp4;
872    int sign = 0;
873    
874    if ((int64_t)m1 < 0) // operands carry DMULT's signed values; without the cast the unsigned compare is never true
875      {
876     op2 = -m1;
877     sign = 1 - sign;
878      }
879    else op2 = m1;
880    if ((int64_t)m2 < 0)
881      {
882     op4 = -m2;
883     sign = 1 - sign;
884      }
885    else op4 = m2;
886    
887    op1 = op2 & 0xFFFFFFFF;
888    op2 = (op2 >> 32) & 0xFFFFFFFF;
889    op3 = op4 & 0xFFFFFFFF;
890    op4 = (op4 >> 32) & 0xFFFFFFFF;
891    
892    temp1 = op1 * op3;
893    temp2 = (temp1 >> 32) + op1 * op4;
894    temp3 = op2 * op3;
895    temp4 = (temp3 >> 32) + op2 * op4;
896    
897    result1 = temp1 & 0xFFFFFFFF;
898    result2 = temp2 + (temp3 & 0xFFFFFFFF);
899    result3 = (result2 >> 32) + temp4;
900    result4 = (result3 >> 32);
901    
902    lo = result1 | (result2 << 32);
903    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
904    if (sign)
905      {
906     hi = ~hi;
907     if (!lo) hi++;
908     else lo = ~lo + 1;
909      }
910 }
911
912 void multu64(uint64_t m1,uint64_t m2)
913 {
914    unsigned long long int op1, op2, op3, op4;
915    unsigned long long int result1, result2, result3, result4;
916    unsigned long long int temp1, temp2, temp3, temp4;
917    
918    op1 = m1 & 0xFFFFFFFF;
919    op2 = (m1 >> 32) & 0xFFFFFFFF;
920    op3 = m2 & 0xFFFFFFFF;
921    op4 = (m2 >> 32) & 0xFFFFFFFF;
922    
923    temp1 = op1 * op3;
924    temp2 = (temp1 >> 32) + op1 * op4;
925    temp3 = op2 * op3;
926    temp4 = (temp3 >> 32) + op2 * op4;
927    
928    result1 = temp1 & 0xFFFFFFFF;
929    result2 = temp2 + (temp3 & 0xFFFFFFFF);
930    result3 = (result2 >> 32) + temp4;
931    result4 = (result3 >> 32);
932    
933    lo = result1 | (result2 << 32);
934    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
935    
936   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
937   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
938 }
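/* Illustrative check, not part of the original source, assuming a compiler that
   provides unsigned __int128 (GCC/Clang) and that hi/lo are the 64-bit HI/LO
   registers written by multu64() above: the partial-product sum must agree
   with a direct 128-bit multiply. */
#ifdef __SIZEOF_INT128__
__attribute__((unused)) static void multu64_check(uint64_t m1, uint64_t m2)
{
  unsigned __int128 p = (unsigned __int128)m1 * m2;
  multu64(m1, m2);
  assert((uint64_t)lo == (uint64_t)p);
  assert((uint64_t)hi == (uint64_t)(p >> 64));
}
#endif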
939
940 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
941 {
942   if(bits) {
943     original<<=64-bits;
944     original>>=64-bits;
945     loaded<<=bits;
946     original|=loaded;
947   }
948   else original=loaded;
949   return original;
950 }
951 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
952 {
953   if(bits^56) {
954     original>>=64-(bits^56);
955     original<<=64-(bits^56);
956     loaded>>=bits^56;
957     original|=loaded;
958   }
959   else original=loaded;
960   return original;
961 }
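/* Illustrative example, not part of the original source: ldl_merge() keeps the
   low 'bits' bits of the old register value and takes the rest from the loaded
   doubleword shifted up by 'bits'. */
__attribute__((unused)) static void ldl_merge_example(void)
{
  uint64_t r = ldl_merge(0x1111222233334444ULL, 0x00000000aaaabbbbULL, 16);
  assert(r == 0x0000aaaabbbb4444ULL); // low 16 bits kept, rest from the load
}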
962 #endif
963
964 #ifdef __i386__
965 #include "assem_x86.c"
966 #endif
967 #ifdef __x86_64__
968 #include "assem_x64.c"
969 #endif
970 #ifdef __arm__
971 #include "assem_arm.c"
972 #endif
973
974 // Add virtual address mapping to linked list
975 void ll_add(struct ll_entry **head,int vaddr,void *addr)
976 {
977   struct ll_entry *new_entry;
978   new_entry=malloc(sizeof(struct ll_entry));
979   assert(new_entry!=NULL);
980   new_entry->vaddr=vaddr;
981   new_entry->reg32=0;
982   new_entry->addr=addr;
983   new_entry->next=*head;
984   *head=new_entry;
985 }
986
987 // Add virtual address mapping for 32-bit compiled block
988 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
989 {
990   ll_add(head,vaddr,addr);
991 #ifndef FORCE32
992   (*head)->reg32=reg32;
993 #endif
994 }
995
996 // Check if an address is already compiled
997 // but don't return addresses which are about to expire from the cache
998 void *check_addr(u_int vaddr)
999 {
1000   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1001   if(ht_bin[0]==vaddr) {
1002     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1003       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1004   }
1005   if(ht_bin[2]==vaddr) {
1006     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1007       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1008   }
1009   u_int page=get_page(vaddr);
1010   struct ll_entry *head;
1011   head=jump_in[page];
1012   while(head!=NULL) {
1013     if(head->vaddr==vaddr&&head->reg32==0) {
1014       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1015         // Update existing entry with current address
1016         if(ht_bin[0]==vaddr) {
1017           ht_bin[1]=(int)head->addr;
1018           return head->addr;
1019         }
1020         if(ht_bin[2]==vaddr) {
1021           ht_bin[3]=(int)head->addr;
1022           return head->addr;
1023         }
1024         // Insert into hash table with low priority.
1025         // Don't evict existing entries, as they are probably
1026         // addresses that are being accessed frequently.
1027         if(ht_bin[0]==-1) {
1028           ht_bin[1]=(int)head->addr;
1029           ht_bin[0]=vaddr;
1030         }else if(ht_bin[2]==-1) {
1031           ht_bin[3]=(int)head->addr;
1032           ht_bin[2]=vaddr;
1033         }
1034         return head->addr;
1035       }
1036     }
1037     head=head->next;
1038   }
1039   return 0;
1040 }
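/* Illustrative sketch, not part of the original source: a reading of the expiry
   test used above and in get_addr()/clean_blocks().  The translation cache is
   treated as a ring of 1<<TARGET_SIZE_2 bytes with 'out' as the write pointer;
   scaling the wraparound distance so that the whole cache spans 2^32, a block
   counts as safe only if it lies more than 3/8 of the cache (0x60000000), plus
   one maximum block size of slack, ahead of 'out'. */
__attribute__((unused)) static int block_safe_from_expiry(void *block_addr)
{
  u_int dist = ((u_int)block_addr - (u_int)out) << (32 - TARGET_SIZE_2);
  return dist > 0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32 - TARGET_SIZE_2));
}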
1041
1042 void remove_hash(int vaddr)
1043 {
1044   //printf("remove hash: %x\n",vaddr);
1045   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1046   if(ht_bin[2]==vaddr) {
1047     ht_bin[2]=ht_bin[3]=-1;
1048   }
1049   if(ht_bin[0]==vaddr) {
1050     ht_bin[0]=ht_bin[2];
1051     ht_bin[1]=ht_bin[3];
1052     ht_bin[2]=ht_bin[3]=-1;
1053   }
1054 }
1055
1056 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1057 {
1058   struct ll_entry *next;
1059   while(*head) {
1060     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1061        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1062     {
1063       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1064       remove_hash((*head)->vaddr);
1065       next=(*head)->next;
1066       free(*head);
1067       *head=next;
1068     }
1069     else
1070     {
1071       head=&((*head)->next);
1072     }
1073   }
1074 }
1075
1076 // Remove all entries from linked list
1077 void ll_clear(struct ll_entry **head)
1078 {
1079   struct ll_entry *cur;
1080   struct ll_entry *next;
1081   if((cur=*head)) {
1082     *head=0;
1083     while(cur) {
1084       next=cur->next;
1085       free(cur);
1086       cur=next;
1087     }
1088   }
1089 }
1090
1091 // Dereference each pointer and unlink it if it points into the target range
1092 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1093 {
1094   while(head) {
1095     int ptr=get_pointer(head->addr);
1096     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1097     if(((ptr>>shift)==(addr>>shift)) ||
1098        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1099     {
1100       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1101       u_int host_addr=(u_int)kill_pointer(head->addr);
1102       #ifdef __arm__
1103         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1104       #endif
1105     }
1106     head=head->next;
1107   }
1108 }
1109
1110 // This is called when we write to a compiled block (see do_invstub)
1111 void invalidate_page(u_int page)
1112 {
1113   struct ll_entry *head;
1114   struct ll_entry *next;
1115   head=jump_in[page];
1116   jump_in[page]=0;
1117   while(head!=NULL) {
1118     inv_debug("INVALIDATE: %x\n",head->vaddr);
1119     remove_hash(head->vaddr);
1120     next=head->next;
1121     free(head);
1122     head=next;
1123   }
1124   head=jump_out[page];
1125   jump_out[page]=0;
1126   while(head!=NULL) {
1127     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1128     u_int host_addr=(u_int)kill_pointer(head->addr);
1129     #ifdef __arm__
1130       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1131     #endif
1132     next=head->next;
1133     free(head);
1134     head=next;
1135   }
1136 }
1137
1138 static void invalidate_block_range(u_int block, u_int first, u_int last)
1139 {
1140   u_int page=get_page(block<<12);
1141   //printf("first=%d last=%d\n",first,last);
1142   invalidate_page(page);
1143   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1144   assert(last<page+5);
1145   // Invalidate the adjacent pages if a block crosses a 4K boundary
1146   while(first<page) {
1147     invalidate_page(first);
1148     first++;
1149   }
1150   for(first=page+1;first<last;first++) {
1151     invalidate_page(first);
1152   }
1153   #ifdef __arm__
1154     do_clear_cache();
1155   #endif
1156   
1157   // Don't trap writes
1158   invalid_code[block]=1;
1159 #ifndef DISABLE_TLB
1160   // If there is a valid TLB entry for this page, remove write protect
1161   if(tlb_LUT_w[block]) {
1162     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1163     // CHECK: Is this right?
1164     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1165     u_int real_block=tlb_LUT_w[block]>>12;
1166     invalid_code[real_block]=1;
1167     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1168   }
1169   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1170 #endif
1171
1172   #ifdef USE_MINI_HT
1173   memset(mini_ht,-1,sizeof(mini_ht));
1174   #endif
1175 }
1176
1177 void invalidate_block(u_int block)
1178 {
1179   u_int page=get_page(block<<12);
1180   u_int vpage=get_vpage(block<<12);
1181   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1182   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1183   u_int first,last;
1184   first=last=page;
1185   struct ll_entry *head;
1186   head=jump_dirty[vpage];
1187   //printf("page=%d vpage=%d\n",page,vpage);
1188   while(head!=NULL) {
1189     u_int start,end;
1190     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1191       get_bounds((int)head->addr,&start,&end);
1192       //printf("start: %x end: %x\n",start,end);
1193       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1194         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1195           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1196           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1197         }
1198       }
1199 #ifndef DISABLE_TLB
1200       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1201         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1202           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1203           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1204         }
1205       }
1206 #endif
1207     }
1208     head=head->next;
1209   }
1210   invalidate_block_range(block,first,last);
1211 }
1212
1213 void invalidate_addr(u_int addr)
1214 {
1215 #ifdef PCSX
1216   //static int rhits;
1217   // this check is done by the caller
1218   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1219   u_int page=get_page(addr);
1220   if(page<2048) { // RAM
1221     struct ll_entry *head;
1222     u_int addr_min=~0, addr_max=0;
1223     int mask=RAM_SIZE-1;
1224     int pg1;
1225     inv_code_start=addr&~0xfff;
1226     inv_code_end=addr|0xfff;
1227     pg1=page;
1228     if (pg1>0) {
1229       // must check previous page too because of spans..
1230       pg1--;
1231       inv_code_start-=0x1000;
1232     }
1233     for(;pg1<=page;pg1++) {
1234       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1235         u_int start,end;
1236         get_bounds((int)head->addr,&start,&end);
1237         if((start&mask)<=(addr&mask)&&(addr&mask)<(end&mask)) {
1238           if(start<addr_min) addr_min=start;
1239           if(end>addr_max) addr_max=end;
1240         }
1241         else if(addr<start) {
1242           if(start<inv_code_end)
1243             inv_code_end=start-1;
1244         }
1245         else {
1246           if(end>inv_code_start)
1247             inv_code_start=end;
1248         }
1249       }
1250     }
1251     if (addr_min!=~0) {
1252       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1253       inv_code_start=inv_code_end=~0;
1254       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1255       return;
1256     }
1257     else {
1258       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);//rhits);
1259     }
1260     //rhits=0;
1261     if(page!=0) // FIXME: don't know what's up with page 0 (Klonoa)
1262       return;
1263   }
1264 #endif
1265   invalidate_block(addr>>12);
1266 }
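/* Hypothetical caller sketch, not part of the original source: per the comment
   at the top of invalidate_addr(), the memory write handler is expected to
   filter out addresses already known to hold no compiled code before calling
   in. */
#ifdef PCSX
__attribute__((unused)) static void on_guest_write_sketch(u_int addr)
{
  if (inv_code_start <= addr && addr <= inv_code_end)
    return; // range already known to contain no compiled code
  invalidate_addr(addr);
}
#endif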
1267
1268 // This is called when loading a save state.
1269 // Anything could have changed, so invalidate everything.
1270 void invalidate_all_pages()
1271 {
1272   u_int page,n;
1273   for(page=0;page<4096;page++)
1274     invalidate_page(page);
1275   for(page=0;page<1048576;page++)
1276     if(!invalid_code[page]) {
1277       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1278       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1279     }
1280   #ifdef __arm__
1281   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1282   #endif
1283   #ifdef USE_MINI_HT
1284   memset(mini_ht,-1,sizeof(mini_ht));
1285   #endif
1286   #ifndef DISABLE_TLB
1287   // TLB
1288   for(page=0;page<0x100000;page++) {
1289     if(tlb_LUT_r[page]) {
1290       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1291       if(!tlb_LUT_w[page]||!invalid_code[page])
1292         memory_map[page]|=0x40000000; // Write protect
1293     }
1294     else memory_map[page]=-1;
1295     if(page==0x80000) page=0xC0000;
1296   }
1297   tlb_hacks();
1298   #endif
1299 }
1300
1301 // Add an entry to jump_out after making a link
1302 void add_link(u_int vaddr,void *src)
1303 {
1304   u_int page=get_page(vaddr);
1305   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1306   int *ptr=(int *)(src+4);
1307   assert((*ptr&0x0fff0000)==0x059f0000); // expect an ARM ldr rN,[pc,#imm] at the link site
1308   ll_add(jump_out+page,vaddr,src);
1309   //int ptr=get_pointer(src);
1310   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1311 }
1312
1313 // If a code block was found to be unmodified (bit was set in
1314 // restore_candidate) and it remains unmodified (bit is clear
1315 // in invalid_code) then move the entries for that 4K page from
1316 // the dirty list to the clean list.
1317 void clean_blocks(u_int page)
1318 {
1319   struct ll_entry *head;
1320   inv_debug("INV: clean_blocks page=%d\n",page);
1321   head=jump_dirty[page];
1322   while(head!=NULL) {
1323     if(!invalid_code[head->vaddr>>12]) {
1324       // Don't restore blocks which are about to expire from the cache
1325       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1326         u_int start,end;
1327         if(verify_dirty((int)head->addr)) {
1328           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1329           u_int i;
1330           u_int inv=0;
1331           get_bounds((int)head->addr,&start,&end);
1332           if(start-(u_int)rdram<RAM_SIZE) {
1333             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1334               inv|=invalid_code[i];
1335             }
1336           }
1337           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1338             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1339             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1340             if(addr<start||addr>=end) inv=1;
1341           }
1342           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1343             inv=1;
1344           }
1345           if(!inv) {
1346             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1347             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1348               u_int ppage=page;
1349 #ifndef DISABLE_TLB
1350               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1351 #endif
1352               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1353               //printf("page=%x, addr=%x\n",page,head->vaddr);
1354               //assert(head->vaddr>>12==(page|0x80000));
1355               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1356               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1357               if(!head->reg32) {
1358                 if(ht_bin[0]==head->vaddr) {
1359                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1360                 }
1361                 if(ht_bin[2]==head->vaddr) {
1362                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1363                 }
1364               }
1365             }
1366           }
1367         }
1368       }
1369     }
1370     head=head->next;
1371   }
1372 }
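/* Illustrative sketch, not part of the original source: restore_candidate is a
   bitmap with one bit per page (512 bytes * 8 = 4096 pages), set by get_addr()
   and invalidate_all_pages() above via the page>>3 / page&7 indexing.  Per the
   comment before clean_blocks(), a set bit marks a page worth moving back to
   the clean list once invalid_code confirms it was not modified. */
__attribute__((unused)) static void mark_restore_candidate(u_int page)
{
  restore_candidate[page>>3] |= 1 << (page&7);
}
__attribute__((unused)) static int is_restore_candidate(u_int page)
{
  return (restore_candidate[page>>3] >> (page&7)) & 1;
}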
1373
1374
1375 void mov_alloc(struct regstat *current,int i)
1376 {
1377   // Note: Don't need to actually alloc the source registers
1378   if((~current->is32>>rs1[i])&1) {
1379     //alloc_reg64(current,i,rs1[i]);
1380     alloc_reg64(current,i,rt1[i]);
1381     current->is32&=~(1LL<<rt1[i]);
1382   } else {
1383     //alloc_reg(current,i,rs1[i]);
1384     alloc_reg(current,i,rt1[i]);
1385     current->is32|=(1LL<<rt1[i]);
1386   }
1387   clear_const(current,rs1[i]);
1388   clear_const(current,rt1[i]);
1389   dirty_reg(current,rt1[i]);
1390 }
1391
1392 void shiftimm_alloc(struct regstat *current,int i)
1393 {
1394   clear_const(current,rs1[i]);
1395   clear_const(current,rt1[i]);
1396   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1397   {
1398     if(rt1[i]) {
1399       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1400       else lt1[i]=rs1[i];
1401       alloc_reg(current,i,rt1[i]);
1402       current->is32|=1LL<<rt1[i];
1403       dirty_reg(current,rt1[i]);
1404     }
1405   }
1406   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1407   {
1408     if(rt1[i]) {
1409       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1410       alloc_reg64(current,i,rt1[i]);
1411       current->is32&=~(1LL<<rt1[i]);
1412       dirty_reg(current,rt1[i]);
1413     }
1414   }
1415   if(opcode2[i]==0x3c) // DSLL32
1416   {
1417     if(rt1[i]) {
1418       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1419       alloc_reg64(current,i,rt1[i]);
1420       current->is32&=~(1LL<<rt1[i]);
1421       dirty_reg(current,rt1[i]);
1422     }
1423   }
1424   if(opcode2[i]==0x3e) // DSRL32
1425   {
1426     if(rt1[i]) {
1427       alloc_reg64(current,i,rs1[i]);
1428       if(imm[i]==32) {
1429         alloc_reg64(current,i,rt1[i]);
1430         current->is32&=~(1LL<<rt1[i]);
1431       } else {
1432         alloc_reg(current,i,rt1[i]);
1433         current->is32|=1LL<<rt1[i];
1434       }
1435       dirty_reg(current,rt1[i]);
1436     }
1437   }
1438   if(opcode2[i]==0x3f) // DSRA32
1439   {
1440     if(rt1[i]) {
1441       alloc_reg64(current,i,rs1[i]);
1442       alloc_reg(current,i,rt1[i]);
1443       current->is32|=1LL<<rt1[i];
1444       dirty_reg(current,rt1[i]);
1445     }
1446   }
1447 }
1448
1449 void shift_alloc(struct regstat *current,int i)
1450 {
1451   if(rt1[i]) {
1452     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1453     {
1454       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1455       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1456       alloc_reg(current,i,rt1[i]);
1457       if(rt1[i]==rs2[i]) {
1458         alloc_reg_temp(current,i,-1);
1459         minimum_free_regs[i]=1;
1460       }
1461       current->is32|=1LL<<rt1[i];
1462     } else { // DSLLV/DSRLV/DSRAV
1463       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1464       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1465       alloc_reg64(current,i,rt1[i]);
1466       current->is32&=~(1LL<<rt1[i]);
1467       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1468       {
1469         alloc_reg_temp(current,i,-1);
1470         minimum_free_regs[i]=1;
1471       }
1472     }
1473     clear_const(current,rs1[i]);
1474     clear_const(current,rs2[i]);
1475     clear_const(current,rt1[i]);
1476     dirty_reg(current,rt1[i]);
1477   }
1478 }
1479
1480 void alu_alloc(struct regstat *current,int i)
1481 {
1482   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1483     if(rt1[i]) {
1484       if(rs1[i]&&rs2[i]) {
1485         alloc_reg(current,i,rs1[i]);
1486         alloc_reg(current,i,rs2[i]);
1487       }
1488       else {
1489         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1490         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1491       }
1492       alloc_reg(current,i,rt1[i]);
1493     }
1494     current->is32|=1LL<<rt1[i];
1495   }
1496   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1497     if(rt1[i]) {
1498       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1499       {
1500         alloc_reg64(current,i,rs1[i]);
1501         alloc_reg64(current,i,rs2[i]);
1502         alloc_reg(current,i,rt1[i]);
1503       } else {
1504         alloc_reg(current,i,rs1[i]);
1505         alloc_reg(current,i,rs2[i]);
1506         alloc_reg(current,i,rt1[i]);
1507       }
1508     }
1509     current->is32|=1LL<<rt1[i];
1510   }
1511   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1512     if(rt1[i]) {
1513       if(rs1[i]&&rs2[i]) {
1514         alloc_reg(current,i,rs1[i]);
1515         alloc_reg(current,i,rs2[i]);
1516       }
1517       else
1518       {
1519         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1520         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1521       }
1522       alloc_reg(current,i,rt1[i]);
1523       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1524       {
1525         if(!((current->uu>>rt1[i])&1)) {
1526           alloc_reg64(current,i,rt1[i]);
1527         }
1528         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1529           if(rs1[i]&&rs2[i]) {
1530             alloc_reg64(current,i,rs1[i]);
1531             alloc_reg64(current,i,rs2[i]);
1532           }
1533           else
1534           {
1535             // Is it really worth it to keep 64-bit values in registers?
1536             #ifdef NATIVE_64BIT
1537             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1538             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1539             #endif
1540           }
1541         }
1542         current->is32&=~(1LL<<rt1[i]);
1543       } else {
1544         current->is32|=1LL<<rt1[i];
1545       }
1546     }
1547   }
1548   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1549     if(rt1[i]) {
1550       if(rs1[i]&&rs2[i]) {
1551         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1552           alloc_reg64(current,i,rs1[i]);
1553           alloc_reg64(current,i,rs2[i]);
1554           alloc_reg64(current,i,rt1[i]);
1555         } else {
1556           alloc_reg(current,i,rs1[i]);
1557           alloc_reg(current,i,rs2[i]);
1558           alloc_reg(current,i,rt1[i]);
1559         }
1560       }
1561       else {
1562         alloc_reg(current,i,rt1[i]);
1563         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1564           // DADD used as move, or zeroing
1565           // If we have a 64-bit source, then make the target 64 bits too
1566           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1567             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1568             alloc_reg64(current,i,rt1[i]);
1569           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1570             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1571             alloc_reg64(current,i,rt1[i]);
1572           }
1573           if(opcode2[i]>=0x2e&&rs2[i]) {
1574             // DSUB used as negation - 64-bit result
1575             // If we have a 32-bit register, extend it to 64 bits
1576             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1577             alloc_reg64(current,i,rt1[i]);
1578           }
1579         }
1580       }
1581       if(rs1[i]&&rs2[i]) {
1582         current->is32&=~(1LL<<rt1[i]);
1583       } else if(rs1[i]) {
1584         current->is32&=~(1LL<<rt1[i]);
1585         if((current->is32>>rs1[i])&1)
1586           current->is32|=1LL<<rt1[i];
1587       } else if(rs2[i]) {
1588         current->is32&=~(1LL<<rt1[i]);
1589         if((current->is32>>rs2[i])&1)
1590           current->is32|=1LL<<rt1[i];
1591       } else {
1592         current->is32|=1LL<<rt1[i];
1593       }
1594     }
1595   }
1596   clear_const(current,rs1[i]);
1597   clear_const(current,rs2[i]);
1598   clear_const(current,rt1[i]);
1599   dirty_reg(current,rt1[i]);
1600 }
1601
1602 void imm16_alloc(struct regstat *current,int i)
1603 {
1604   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1605   else lt1[i]=rs1[i];
1606   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1607   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1608     current->is32&=~(1LL<<rt1[i]);
1609     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1610       // TODO: Could preserve the 32-bit flag if the immediate is zero
1611       alloc_reg64(current,i,rt1[i]);
1612       alloc_reg64(current,i,rs1[i]);
1613     }
1614     clear_const(current,rs1[i]);
1615     clear_const(current,rt1[i]);
1616   }
1617   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1618     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1619     current->is32|=1LL<<rt1[i];
1620     clear_const(current,rs1[i]);
1621     clear_const(current,rt1[i]);
1622   }
1623   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1624     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1625       if(rs1[i]!=rt1[i]) {
1626         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1627         alloc_reg64(current,i,rt1[i]);
1628         current->is32&=~(1LL<<rt1[i]);
1629       }
1630     }
1631     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1632     if(is_const(current,rs1[i])) {
1633       int v=get_const(current,rs1[i]);
1634       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1635       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1636       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1637     }
1638     else clear_const(current,rt1[i]);
1639   }
1640   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1641     if(is_const(current,rs1[i])) {
1642       int v=get_const(current,rs1[i]);
1643       set_const(current,rt1[i],v+imm[i]);
1644     }
1645     else clear_const(current,rt1[i]);
1646     current->is32|=1LL<<rt1[i];
1647   }
1648   else {
1649     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1650     current->is32|=1LL<<rt1[i];
1651   }
1652   dirty_reg(current,rt1[i]);
1653 }
1654
1655 void load_alloc(struct regstat *current,int i)
1656 {
1657   clear_const(current,rt1[i]);
1658   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1659   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1660   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1661   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1662     alloc_reg(current,i,rt1[i]);
1663     assert(get_reg(current->regmap,rt1[i])>=0);
1664     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1665     {
1666       current->is32&=~(1LL<<rt1[i]);
1667       alloc_reg64(current,i,rt1[i]);
1668     }
1669     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1670     {
1671       current->is32&=~(1LL<<rt1[i]);
1672       alloc_reg64(current,i,rt1[i]);
1673       alloc_all(current,i);
1674       alloc_reg64(current,i,FTEMP);
1675       minimum_free_regs[i]=HOST_REGS;
1676     }
1677     else current->is32|=1LL<<rt1[i];
1678     dirty_reg(current,rt1[i]);
1679     // If using TLB, need a register for pointer to the mapping table
1680     if(using_tlb) alloc_reg(current,i,TLREG);
1681     // LWL/LWR need a temporary register for the old value
1682     if(opcode[i]==0x22||opcode[i]==0x26)
1683     {
1684       alloc_reg(current,i,FTEMP);
1685       alloc_reg_temp(current,i,-1);
1686       minimum_free_regs[i]=1;
1687     }
1688   }
1689   else
1690   {
1691     // Load to r0 or unneeded register (dummy load)
1692     // but we still need a register to calculate the address
1693     if(opcode[i]==0x22||opcode[i]==0x26)
1694     {
1695       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1696     }
1697     // If using TLB, need a register for pointer to the mapping table
1698     if(using_tlb) alloc_reg(current,i,TLREG);
1699     alloc_reg_temp(current,i,-1);
1700     minimum_free_regs[i]=1;
1701     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1702     {
1703       alloc_all(current,i);
1704       alloc_reg64(current,i,FTEMP);
1705       minimum_free_regs[i]=HOST_REGS;
1706     }
1707   }
1708 }
1709
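// Register allocation for stores: address base, data register(s), FTEMP for
// the SWL/SWR/SDL/SDR cases, and a temporary for address generation.  On
// HOST_IMM8 targets INVCP is also reserved for the invalid_code check.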
1710 void store_alloc(struct regstat *current,int i)
1711 {
1712   clear_const(current,rs2[i]);
1713   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1714   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1715   alloc_reg(current,i,rs2[i]);
1716   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1717     alloc_reg64(current,i,rs2[i]);
1718     if(rs2[i]) alloc_reg(current,i,FTEMP);
1719   }
1720   // If using TLB, need a register for pointer to the mapping table
1721   if(using_tlb) alloc_reg(current,i,TLREG);
1722   #if defined(HOST_IMM8)
1723   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1724   else alloc_reg(current,i,INVCP);
1725   #endif
1726   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1727     alloc_reg(current,i,FTEMP);
1728   }
1729   // We need a temporary register for address generation
1730   alloc_reg_temp(current,i,-1);
1731   minimum_free_regs[i]=1;
1732 }
1733
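// Register allocation for COP1 loads/stores (LWC1/SWC1/LDC1/SDC1):
// COP1 status (CSREG), FTEMP for the data, and an address temporary.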
1734 void c1ls_alloc(struct regstat *current,int i)
1735 {
1736   //clear_const(current,rs1[i]); // FIXME
1737   clear_const(current,rt1[i]);
1738   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1739   alloc_reg(current,i,CSREG); // Status
1740   alloc_reg(current,i,FTEMP);
1741   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1742     alloc_reg64(current,i,FTEMP);
1743   }
1744   // If using TLB, need a register for pointer to the mapping table
1745   if(using_tlb) alloc_reg(current,i,TLREG);
1746   #if defined(HOST_IMM8)
1747   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1748   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1749     alloc_reg(current,i,INVCP);
1750   #endif
1751   // We need a temporary register for address generation
1752   alloc_reg_temp(current,i,-1);
1753 }
1754
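// Register allocation for GTE loads/stores (LWC2/SWC2): like c1ls_alloc,
// but with no status register and no 64-bit cases.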
1755 void c2ls_alloc(struct regstat *current,int i)
1756 {
1757   clear_const(current,rt1[i]);
1758   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1759   alloc_reg(current,i,FTEMP);
1760   // If using TLB, need a register for pointer to the mapping table
1761   if(using_tlb) alloc_reg(current,i,TLREG);
1762   #if defined(HOST_IMM8)
1763   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1764   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1765     alloc_reg(current,i,INVCP);
1766   #endif
1767   // We need a temporary register for address generation
1768   alloc_reg_temp(current,i,-1);
1769   minimum_free_regs[i]=1;
1770 }
1771
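// Generic register allocation for the MULT/DIV family; an architecture can
// supply its own version by defining multdiv_alloc.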
1772 #ifndef multdiv_alloc
1773 void multdiv_alloc(struct regstat *current,int i)
1774 {
1775   //  case 0x18: MULT
1776   //  case 0x19: MULTU
1777   //  case 0x1A: DIV
1778   //  case 0x1B: DIVU
1779   //  case 0x1C: DMULT
1780   //  case 0x1D: DMULTU
1781   //  case 0x1E: DDIV
1782   //  case 0x1F: DDIVU
1783   clear_const(current,rs1[i]);
1784   clear_const(current,rs2[i]);
1785   if(rs1[i]&&rs2[i])
1786   {
1787     if((opcode2[i]&4)==0) // 32-bit
1788     {
1789       current->u&=~(1LL<<HIREG);
1790       current->u&=~(1LL<<LOREG);
1791       alloc_reg(current,i,HIREG);
1792       alloc_reg(current,i,LOREG);
1793       alloc_reg(current,i,rs1[i]);
1794       alloc_reg(current,i,rs2[i]);
1795       current->is32|=1LL<<HIREG;
1796       current->is32|=1LL<<LOREG;
1797       dirty_reg(current,HIREG);
1798       dirty_reg(current,LOREG);
1799     }
1800     else // 64-bit
1801     {
1802       current->u&=~(1LL<<HIREG);
1803       current->u&=~(1LL<<LOREG);
1804       current->uu&=~(1LL<<HIREG);
1805       current->uu&=~(1LL<<LOREG);
1806       alloc_reg64(current,i,HIREG);
1807       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1808       alloc_reg64(current,i,rs1[i]);
1809       alloc_reg64(current,i,rs2[i]);
1810       alloc_all(current,i);
1811       current->is32&=~(1LL<<HIREG);
1812       current->is32&=~(1LL<<LOREG);
1813       dirty_reg(current,HIREG);
1814       dirty_reg(current,LOREG);
1815       minimum_free_regs[i]=HOST_REGS;
1816     }
1817   }
1818   else
1819   {
1820     // Multiply by zero is zero.
1821     // MIPS does not have a divide by zero exception.
1822     // The result is undefined, so we return zero.
1823     alloc_reg(current,i,HIREG);
1824     alloc_reg(current,i,LOREG);
1825     current->is32|=1LL<<HIREG;
1826     current->is32|=1LL<<LOREG;
1827     dirty_reg(current,HIREG);
1828     dirty_reg(current,LOREG);
1829   }
1830 }
1831 #endif
1832
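// Register allocation for COP0 (MFC0/MTC0 and TLBR/TLBWI/TLBWR/TLBP/ERET).
// These can touch arbitrary state, so all host registers are flushed
// with alloc_all().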
1833 void cop0_alloc(struct regstat *current,int i)
1834 {
1835   if(opcode2[i]==0) // MFC0
1836   {
1837     if(rt1[i]) {
1838       clear_const(current,rt1[i]);
1839       alloc_all(current,i);
1840       alloc_reg(current,i,rt1[i]);
1841       current->is32|=1LL<<rt1[i];
1842       dirty_reg(current,rt1[i]);
1843     }
1844   }
1845   else if(opcode2[i]==4) // MTC0
1846   {
1847     if(rs1[i]){
1848       clear_const(current,rs1[i]);
1849       alloc_reg(current,i,rs1[i]);
1850       alloc_all(current,i);
1851     }
1852     else {
1853       alloc_all(current,i); // FIXME: Keep r0
1854       current->u&=~1LL;
1855       alloc_reg(current,i,0);
1856     }
1857   }
1858   else
1859   {
1860     // TLBR/TLBWI/TLBWR/TLBP/ERET
1861     assert(opcode2[i]==0x10);
1862     alloc_all(current,i);
1863   }
1864   minimum_free_regs[i]=HOST_REGS;
1865 }
1866
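// Register allocation for coprocessor register moves
// (MFC1/DMFC1/CFC1 and MTC1/DMTC1/CTC1); the COP2 moves use this too.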
1867 void cop1_alloc(struct regstat *current,int i)
1868 {
1869   alloc_reg(current,i,CSREG); // Load status
1870   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1871   {
1872     if(rt1[i]){
1873       clear_const(current,rt1[i]);
1874       if(opcode2[i]==1) {
1875         alloc_reg64(current,i,rt1[i]); // DMFC1
1876         current->is32&=~(1LL<<rt1[i]);
1877       }else{
1878         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1879         current->is32|=1LL<<rt1[i];
1880       }
1881       dirty_reg(current,rt1[i]);
1882     }
1883     alloc_reg_temp(current,i,-1);
1884   }
1885   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1886   {
1887     if(rs1[i]){
1888       clear_const(current,rs1[i]);
1889       if(opcode2[i]==5)
1890         alloc_reg64(current,i,rs1[i]); // DMTC1
1891       else
1892         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1893       alloc_reg_temp(current,i,-1);
1894     }
1895     else {
1896       current->u&=~1LL;
1897       alloc_reg(current,i,0);
1898       alloc_reg_temp(current,i,-1);
1899     }
1900   }
1901   minimum_free_regs[i]=1;
1902 }
1903 void fconv_alloc(struct regstat *current,int i)
1904 {
1905   alloc_reg(current,i,CSREG); // Load status
1906   alloc_reg_temp(current,i,-1);
1907   minimum_free_regs[i]=1;
1908 }
1909 void float_alloc(struct regstat *current,int i)
1910 {
1911   alloc_reg(current,i,CSREG); // Load status
1912   alloc_reg_temp(current,i,-1);
1913   minimum_free_regs[i]=1;
1914 }
1915 void c2op_alloc(struct regstat *current,int i)
1916 {
1917   alloc_reg_temp(current,i,-1);
1918 }
1919 void fcomp_alloc(struct regstat *current,int i)
1920 {
1921   alloc_reg(current,i,CSREG); // Load status
1922   alloc_reg(current,i,FSREG); // Load flags
1923   dirty_reg(current,FSREG); // Flag will be modified
1924   alloc_reg_temp(current,i,-1);
1925   minimum_free_regs[i]=1;
1926 }
1927
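// SYSCALL raises an exception, so the cycle count (CCREG) is updated and
// all host registers are flushed.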
1928 void syscall_alloc(struct regstat *current,int i)
1929 {
1930   alloc_cc(current,i);
1931   dirty_reg(current,CCREG);
1932   alloc_all(current,i);
1933   minimum_free_regs[i]=HOST_REGS;
1934   current->isconst=0;
1935 }
1936
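// Allocate registers for an instruction in a branch delay slot, dispatching
// on its type.  A jump in the delay slot is not supported; speculative
// precompilation is disabled instead (stop_after_jal).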
1937 void delayslot_alloc(struct regstat *current,int i)
1938 {
1939   switch(itype[i]) {
1940     case UJUMP:
1941     case CJUMP:
1942     case SJUMP:
1943     case RJUMP:
1944     case FJUMP:
1945     case SYSCALL:
1946     case HLECALL:
1947     case SPAN:
1948       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1949       printf("Disabled speculative precompilation\n");
1950       stop_after_jal=1;
1951       break;
1952     case IMM16:
1953       imm16_alloc(current,i);
1954       break;
1955     case LOAD:
1956     case LOADLR:
1957       load_alloc(current,i);
1958       break;
1959     case STORE:
1960     case STORELR:
1961       store_alloc(current,i);
1962       break;
1963     case ALU:
1964       alu_alloc(current,i);
1965       break;
1966     case SHIFT:
1967       shift_alloc(current,i);
1968       break;
1969     case MULTDIV:
1970       multdiv_alloc(current,i);
1971       break;
1972     case SHIFTIMM:
1973       shiftimm_alloc(current,i);
1974       break;
1975     case MOV:
1976       mov_alloc(current,i);
1977       break;
1978     case COP0:
1979       cop0_alloc(current,i);
1980       break;
1981     case COP1:
1982     case COP2:
1983       cop1_alloc(current,i);
1984       break;
1985     case C1LS:
1986       c1ls_alloc(current,i);
1987       break;
1988     case C2LS:
1989       c2ls_alloc(current,i);
1990       break;
1991     case FCONV:
1992       fconv_alloc(current,i);
1993       break;
1994     case FLOAT:
1995       float_alloc(current,i);
1996       break;
1997     case FCOMP:
1998       fcomp_alloc(current,i);
1999       break;
2000     case C2OP:
2001       c2op_alloc(current,i);
2002       break;
2003   }
2004 }
2005
2006 // Special case where a branch and delay slot span two pages in virtual memory
2007 static void pagespan_alloc(struct regstat *current,int i)
2008 {
2009   current->isconst=0;
2010   current->wasconst=0;
2011   regs[i].wasconst=0;
2012   minimum_free_regs[i]=HOST_REGS;
2013   alloc_all(current,i);
2014   alloc_cc(current,i);
2015   dirty_reg(current,CCREG);
2016   if(opcode[i]==3) // JAL
2017   {
2018     alloc_reg(current,i,31);
2019     dirty_reg(current,31);
2020   }
2021   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2022   {
2023     alloc_reg(current,i,rs1[i]);
2024     if (rt1[i]!=0) {
2025       alloc_reg(current,i,rt1[i]);
2026       dirty_reg(current,rt1[i]);
2027     }
2028   }
2029   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2030   {
2031     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2032     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2033     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2034     {
2035       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2036       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2037     }
2038   }
2039   else
2040   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2041   {
2042     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2043     if(!((current->is32>>rs1[i])&1))
2044     {
2045       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2046     }
2047   }
2048   else
2049   if(opcode[i]==0x11) // BC1
2050   {
2051     alloc_reg(current,i,FSREG);
2052     alloc_reg(current,i,CSREG);
2053   }
2054   //else ...
2055 }
2056
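// Record a stub to be generated later: stub type, address of the branch to
// patch, return address, and up to five type-specific arguments.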
2057 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2058 {
2059   stubs[stubcount][0]=type;
2060   stubs[stubcount][1]=addr;
2061   stubs[stubcount][2]=retaddr;
2062   stubs[stubcount][3]=a;
2063   stubs[stubcount][4]=b;
2064   stubs[stubcount][5]=c;
2065   stubs[stubcount][6]=d;
2066   stubs[stubcount][7]=e;
2067   stubcount++;
2068 }
2069
2070 // Write out a single register
2071 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2072 {
2073   int hr;
2074   for(hr=0;hr<HOST_REGS;hr++) {
2075     if(hr!=EXCLUDE_REG) {
2076       if((regmap[hr]&63)==r) {
2077         if((dirty>>hr)&1) {
2078           if(regmap[hr]<64) {
2079             emit_storereg(r,hr);
2080 #ifndef FORCE32
2081             if((is32>>regmap[hr])&1) {
2082               emit_sarimm(hr,31,hr);
2083               emit_storereg(r|64,hr);
2084             }
2085 #endif
2086           }else{
2087             emit_storereg(r|64,hr);
2088           }
2089         }
2090       }
2091     }
2092   }
2093 }
2094
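// Debug/trace helpers: checksums over RAM and the register file plus a
// register dump, used by memdebug() below.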
2095 int mchecksum()
2096 {
2097   //if(!tracedebug) return 0;
2098   int i;
2099   int sum=0;
2100   for(i=0;i<2097152;i++) {
2101     unsigned int temp=sum;
2102     sum<<=1;
2103     sum|=(~temp)>>31;
2104     sum^=((u_int *)rdram)[i];
2105   }
2106   return sum;
2107 }
2108 int rchecksum()
2109 {
2110   int i;
2111   int sum=0;
2112   for(i=0;i<64;i++)
2113     sum^=((u_int *)reg)[i];
2114   return sum;
2115 }
2116 void rlist()
2117 {
2118   int i;
2119   printf("TRACE: ");
2120   for(i=0;i<32;i++)
2121     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2122   printf("\n");
2123 #ifndef DISABLE_COP1
2124   printf("TRACE: ");
2125   for(i=0;i<32;i++)
2126     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2127   printf("\n");
2128 #endif
2129 }
2130
2131 void enabletrace()
2132 {
2133   tracedebug=1;
2134 }
2135
2136 void memdebug(int i)
2137 {
2138   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2139   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2140   //rlist();
2141   //if(tracedebug) {
2142   //if(Count>=-2084597794) {
2143   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2144   //if(0) {
2145     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2146     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2147     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2148     rlist();
2149     #ifdef __i386__
2150     printf("TRACE: %x\n",(&i)[-1]);
2151     #endif
2152     #ifdef __arm__
2153     int j;
2154     printf("TRACE: %x \n",(&j)[10]);
2155     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2156     #endif
2157     //fflush(stdout);
2158   }
2159   //printf("TRACE: %x\n",(&i)[-1]);
2160 }
2161
2162 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2163 {
2164   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2165 }
2166
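// Emit code for register-register ALU operations: ADD/ADDU/SUB/SUBU, the
// 64-bit DADD/DSUB forms, SLT/SLTU, and AND/OR/XOR/NOR.  Operands that are
// r0 are special-cased so they need not occupy a host register.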
2167 void alu_assemble(int i,struct regstat *i_regs)
2168 {
2169   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2170     if(rt1[i]) {
2171       signed char s1,s2,t;
2172       t=get_reg(i_regs->regmap,rt1[i]);
2173       if(t>=0) {
2174         s1=get_reg(i_regs->regmap,rs1[i]);
2175         s2=get_reg(i_regs->regmap,rs2[i]);
2176         if(rs1[i]&&rs2[i]) {
2177           assert(s1>=0);
2178           assert(s2>=0);
2179           if(opcode2[i]&2) emit_sub(s1,s2,t);
2180           else emit_add(s1,s2,t);
2181         }
2182         else if(rs1[i]) {
2183           if(s1>=0) emit_mov(s1,t);
2184           else emit_loadreg(rs1[i],t);
2185         }
2186         else if(rs2[i]) {
2187           if(s2>=0) {
2188             if(opcode2[i]&2) emit_neg(s2,t);
2189             else emit_mov(s2,t);
2190           }
2191           else {
2192             emit_loadreg(rs2[i],t);
2193             if(opcode2[i]&2) emit_neg(t,t);
2194           }
2195         }
2196         else emit_zeroreg(t);
2197       }
2198     }
2199   }
2200   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2201     if(rt1[i]) {
2202       signed char s1l,s2l,s1h,s2h,tl,th;
2203       tl=get_reg(i_regs->regmap,rt1[i]);
2204       th=get_reg(i_regs->regmap,rt1[i]|64);
2205       if(tl>=0) {
2206         s1l=get_reg(i_regs->regmap,rs1[i]);
2207         s2l=get_reg(i_regs->regmap,rs2[i]);
2208         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2209         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2210         if(rs1[i]&&rs2[i]) {
2211           assert(s1l>=0);
2212           assert(s2l>=0);
2213           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2214           else emit_adds(s1l,s2l,tl);
2215           if(th>=0) {
2216             #ifdef INVERTED_CARRY
2217             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2218             #else
2219             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2220             #endif
2221             else emit_add(s1h,s2h,th);
2222           }
2223         }
2224         else if(rs1[i]) {
2225           if(s1l>=0) emit_mov(s1l,tl);
2226           else emit_loadreg(rs1[i],tl);
2227           if(th>=0) {
2228             if(s1h>=0) emit_mov(s1h,th);
2229             else emit_loadreg(rs1[i]|64,th);
2230           }
2231         }
2232         else if(rs2[i]) {
2233           if(s2l>=0) {
2234             if(opcode2[i]&2) emit_negs(s2l,tl);
2235             else emit_mov(s2l,tl);
2236           }
2237           else {
2238             emit_loadreg(rs2[i],tl);
2239             if(opcode2[i]&2) emit_negs(tl,tl);
2240           }
2241           if(th>=0) {
2242             #ifdef INVERTED_CARRY
2243             if(s2h>=0) emit_mov(s2h,th);
2244             else emit_loadreg(rs2[i]|64,th);
2245             if(opcode2[i]&2) {
2246               emit_adcimm(-1,th); // x86 has inverted carry flag
2247               emit_not(th,th);
2248             }
2249             #else
2250             if(opcode2[i]&2) {
2251               if(s2h>=0) emit_rscimm(s2h,0,th);
2252               else {
2253                 emit_loadreg(rs2[i]|64,th);
2254                 emit_rscimm(th,0,th);
2255               }
2256             }else{
2257               if(s2h>=0) emit_mov(s2h,th);
2258               else emit_loadreg(rs2[i]|64,th);
2259             }
2260             #endif
2261           }
2262         }
2263         else {
2264           emit_zeroreg(tl);
2265           if(th>=0) emit_zeroreg(th);
2266         }
2267       }
2268     }
2269   }
2270   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2271     if(rt1[i]) {
2272       signed char s1l,s1h,s2l,s2h,t;
2273       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2274       {
2275         t=get_reg(i_regs->regmap,rt1[i]);
2276         //assert(t>=0);
2277         if(t>=0) {
2278           s1l=get_reg(i_regs->regmap,rs1[i]);
2279           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2280           s2l=get_reg(i_regs->regmap,rs2[i]);
2281           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2282           if(rs2[i]==0) // rx<r0
2283           {
2284             assert(s1h>=0);
2285             if(opcode2[i]==0x2a) // SLT
2286               emit_shrimm(s1h,31,t);
2287             else // SLTU (unsigned can not be less than zero)
2288               emit_zeroreg(t);
2289           }
2290           else if(rs1[i]==0) // r0<rx
2291           {
2292             assert(s2h>=0);
2293             if(opcode2[i]==0x2a) // SLT
2294               emit_set_gz64_32(s2h,s2l,t);
2295             else // SLTU (set if not zero)
2296               emit_set_nz64_32(s2h,s2l,t);
2297           }
2298           else {
2299             assert(s1l>=0);assert(s1h>=0);
2300             assert(s2l>=0);assert(s2h>=0);
2301             if(opcode2[i]==0x2a) // SLT
2302               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2303             else // SLTU
2304               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2305           }
2306         }
2307       } else {
2308         t=get_reg(i_regs->regmap,rt1[i]);
2309         //assert(t>=0);
2310         if(t>=0) {
2311           s1l=get_reg(i_regs->regmap,rs1[i]);
2312           s2l=get_reg(i_regs->regmap,rs2[i]);
2313           if(rs2[i]==0) // rx<r0
2314           {
2315             assert(s1l>=0);
2316             if(opcode2[i]==0x2a) // SLT
2317               emit_shrimm(s1l,31,t);
2318             else // SLTU (unsigned can not be less than zero)
2319               emit_zeroreg(t);
2320           }
2321           else if(rs1[i]==0) // r0<rx
2322           {
2323             assert(s2l>=0);
2324             if(opcode2[i]==0x2a) // SLT
2325               emit_set_gz32(s2l,t);
2326             else // SLTU (set if not zero)
2327               emit_set_nz32(s2l,t);
2328           }
2329           else{
2330             assert(s1l>=0);assert(s2l>=0);
2331             if(opcode2[i]==0x2a) // SLT
2332               emit_set_if_less32(s1l,s2l,t);
2333             else // SLTU
2334               emit_set_if_carry32(s1l,s2l,t);
2335           }
2336         }
2337       }
2338     }
2339   }
2340   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2341     if(rt1[i]) {
2342       signed char s1l,s1h,s2l,s2h,th,tl;
2343       tl=get_reg(i_regs->regmap,rt1[i]);
2344       th=get_reg(i_regs->regmap,rt1[i]|64);
2345       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2346       {
2347         assert(tl>=0);
2348         if(tl>=0) {
2349           s1l=get_reg(i_regs->regmap,rs1[i]);
2350           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2351           s2l=get_reg(i_regs->regmap,rs2[i]);
2352           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2353           if(rs1[i]&&rs2[i]) {
2354             assert(s1l>=0);assert(s1h>=0);
2355             assert(s2l>=0);assert(s2h>=0);
2356             if(opcode2[i]==0x24) { // AND
2357               emit_and(s1l,s2l,tl);
2358               emit_and(s1h,s2h,th);
2359             } else
2360             if(opcode2[i]==0x25) { // OR
2361               emit_or(s1l,s2l,tl);
2362               emit_or(s1h,s2h,th);
2363             } else
2364             if(opcode2[i]==0x26) { // XOR
2365               emit_xor(s1l,s2l,tl);
2366               emit_xor(s1h,s2h,th);
2367             } else
2368             if(opcode2[i]==0x27) { // NOR
2369               emit_or(s1l,s2l,tl);
2370               emit_or(s1h,s2h,th);
2371               emit_not(tl,tl);
2372               emit_not(th,th);
2373             }
2374           }
2375           else
2376           {
2377             if(opcode2[i]==0x24) { // AND
2378               emit_zeroreg(tl);
2379               emit_zeroreg(th);
2380             } else
2381             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2382               if(rs1[i]){
2383                 if(s1l>=0) emit_mov(s1l,tl);
2384                 else emit_loadreg(rs1[i],tl);
2385                 if(s1h>=0) emit_mov(s1h,th);
2386                 else emit_loadreg(rs1[i]|64,th);
2387               }
2388               else
2389               if(rs2[i]){
2390                 if(s2l>=0) emit_mov(s2l,tl);
2391                 else emit_loadreg(rs2[i],tl);
2392                 if(s2h>=0) emit_mov(s2h,th);
2393                 else emit_loadreg(rs2[i]|64,th);
2394               }
2395               else{
2396                 emit_zeroreg(tl);
2397                 emit_zeroreg(th);
2398               }
2399             } else
2400             if(opcode2[i]==0x27) { // NOR
2401               if(rs1[i]){
2402                 if(s1l>=0) emit_not(s1l,tl);
2403                 else{
2404                   emit_loadreg(rs1[i],tl);
2405                   emit_not(tl,tl);
2406                 }
2407                 if(s1h>=0) emit_not(s1h,th);
2408                 else{
2409                   emit_loadreg(rs1[i]|64,th);
2410                   emit_not(th,th);
2411                 }
2412               }
2413               else
2414               if(rs2[i]){
2415                 if(s2l>=0) emit_not(s2l,tl);
2416                 else{
2417                   emit_loadreg(rs2[i],tl);
2418                   emit_not(tl,tl);
2419                 }
2420                 if(s2h>=0) emit_not(s2h,th);
2421                 else{
2422                   emit_loadreg(rs2[i]|64,th);
2423                   emit_not(th,th);
2424                 }
2425               }
2426               else {
2427                 emit_movimm(-1,tl);
2428                 emit_movimm(-1,th);
2429               }
2430             }
2431           }
2432         }
2433       }
2434       else
2435       {
2436         // 32 bit
2437         if(tl>=0) {
2438           s1l=get_reg(i_regs->regmap,rs1[i]);
2439           s2l=get_reg(i_regs->regmap,rs2[i]);
2440           if(rs1[i]&&rs2[i]) {
2441             assert(s1l>=0);
2442             assert(s2l>=0);
2443             if(opcode2[i]==0x24) { // AND
2444               emit_and(s1l,s2l,tl);
2445             } else
2446             if(opcode2[i]==0x25) { // OR
2447               emit_or(s1l,s2l,tl);
2448             } else
2449             if(opcode2[i]==0x26) { // XOR
2450               emit_xor(s1l,s2l,tl);
2451             } else
2452             if(opcode2[i]==0x27) { // NOR
2453               emit_or(s1l,s2l,tl);
2454               emit_not(tl,tl);
2455             }
2456           }
2457           else
2458           {
2459             if(opcode2[i]==0x24) { // AND
2460               emit_zeroreg(tl);
2461             } else
2462             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2463               if(rs1[i]){
2464                 if(s1l>=0) emit_mov(s1l,tl);
2465                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2466               }
2467               else
2468               if(rs2[i]){
2469                 if(s2l>=0) emit_mov(s2l,tl);
2470                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2471               }
2472               else emit_zeroreg(tl);
2473             } else
2474             if(opcode2[i]==0x27) { // NOR
2475               if(rs1[i]){
2476                 if(s1l>=0) emit_not(s1l,tl);
2477                 else {
2478                   emit_loadreg(rs1[i],tl);
2479                   emit_not(tl,tl);
2480                 }
2481               }
2482               else
2483               if(rs2[i]){
2484                 if(s2l>=0) emit_not(s2l,tl);
2485                 else {
2486                   emit_loadreg(rs2[i],tl);
2487                   emit_not(tl,tl);
2488                 }
2489               }
2490               else emit_movimm(-1,tl);
2491             }
2492           }
2493         }
2494       }
2495     }
2496   }
2497 }
2498
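// Emit code for immediate-operand instructions (LUI, ADDI/ADDIU, DADDI/DADDIU,
// SLTI/SLTIU, ANDI/ORI/XORI).  If the source register holds a known constant,
// the result is folded at compile time and emitted as a single movimm.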
2499 void imm16_assemble(int i,struct regstat *i_regs)
2500 {
2501   if (opcode[i]==0x0f) { // LUI
2502     if(rt1[i]) {
2503       signed char t;
2504       t=get_reg(i_regs->regmap,rt1[i]);
2505       //assert(t>=0);
2506       if(t>=0) {
2507         if(!((i_regs->isconst>>t)&1))
2508           emit_movimm(imm[i]<<16,t);
2509       }
2510     }
2511   }
2512   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2513     if(rt1[i]) {
2514       signed char s,t;
2515       t=get_reg(i_regs->regmap,rt1[i]);
2516       s=get_reg(i_regs->regmap,rs1[i]);
2517       if(rs1[i]) {
2518         //assert(t>=0);
2519         //assert(s>=0);
2520         if(t>=0) {
2521           if(!((i_regs->isconst>>t)&1)) {
2522             if(s<0) {
2523               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2524               emit_addimm(t,imm[i],t);
2525             }else{
2526               if(!((i_regs->wasconst>>s)&1))
2527                 emit_addimm(s,imm[i],t);
2528               else
2529                 emit_movimm(constmap[i][s]+imm[i],t);
2530             }
2531           }
2532         }
2533       } else {
2534         if(t>=0) {
2535           if(!((i_regs->isconst>>t)&1))
2536             emit_movimm(imm[i],t);
2537         }
2538       }
2539     }
2540   }
2541   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2542     if(rt1[i]) {
2543       signed char sh,sl,th,tl;
2544       th=get_reg(i_regs->regmap,rt1[i]|64);
2545       tl=get_reg(i_regs->regmap,rt1[i]);
2546       sh=get_reg(i_regs->regmap,rs1[i]|64);
2547       sl=get_reg(i_regs->regmap,rs1[i]);
2548       if(tl>=0) {
2549         if(rs1[i]) {
2550           assert(sh>=0);
2551           assert(sl>=0);
2552           if(th>=0) {
2553             emit_addimm64_32(sh,sl,imm[i],th,tl);
2554           }
2555           else {
2556             emit_addimm(sl,imm[i],tl);
2557           }
2558         } else {
2559           emit_movimm(imm[i],tl);
2560           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2561         }
2562       }
2563     }
2564   }
2565   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2566     if(rt1[i]) {
2567       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2568       signed char sh,sl,t;
2569       t=get_reg(i_regs->regmap,rt1[i]);
2570       sh=get_reg(i_regs->regmap,rs1[i]|64);
2571       sl=get_reg(i_regs->regmap,rs1[i]);
2572       //assert(t>=0);
2573       if(t>=0) {
2574         if(rs1[i]>0) {
2575           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2576           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2577             if(opcode[i]==0x0a) { // SLTI
2578               if(sl<0) {
2579                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2580                 emit_slti32(t,imm[i],t);
2581               }else{
2582                 emit_slti32(sl,imm[i],t);
2583               }
2584             }
2585             else { // SLTIU
2586               if(sl<0) {
2587                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2588                 emit_sltiu32(t,imm[i],t);
2589               }else{
2590                 emit_sltiu32(sl,imm[i],t);
2591               }
2592             }
2593           }else{ // 64-bit
2594             assert(sl>=0);
2595             if(opcode[i]==0x0a) // SLTI
2596               emit_slti64_32(sh,sl,imm[i],t);
2597             else // SLTIU
2598               emit_sltiu64_32(sh,sl,imm[i],t);
2599           }
2600         }else{
2601           // SLTI(U) with r0 is just stupid,
2602           // nonetheless examples can be found
2603           if(opcode[i]==0x0a) { // SLTI
2604             if(0<imm[i]) emit_movimm(1,t);
2605             else emit_zeroreg(t);
2606           } else // SLTIU
2607           {
2608             if(imm[i]) emit_movimm(1,t);
2609             else emit_zeroreg(t);
2610           }
2611         }
2612       }
2613     }
2614   }
2615   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2616     if(rt1[i]) {
2617       signed char sh,sl,th,tl;
2618       th=get_reg(i_regs->regmap,rt1[i]|64);
2619       tl=get_reg(i_regs->regmap,rt1[i]);
2620       sh=get_reg(i_regs->regmap,rs1[i]|64);
2621       sl=get_reg(i_regs->regmap,rs1[i]);
2622       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2623         if(opcode[i]==0x0c) //ANDI
2624         {
2625           if(rs1[i]) {
2626             if(sl<0) {
2627               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2628               emit_andimm(tl,imm[i],tl);
2629             }else{
2630               if(!((i_regs->wasconst>>sl)&1))
2631                 emit_andimm(sl,imm[i],tl);
2632               else
2633                 emit_movimm(constmap[i][sl]&imm[i],tl);
2634             }
2635           }
2636           else
2637             emit_zeroreg(tl);
2638           if(th>=0) emit_zeroreg(th);
2639         }
2640         else
2641         {
2642           if(rs1[i]) {
2643             if(sl<0) {
2644               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2645             }
2646             if(th>=0) {
2647               if(sh<0) {
2648                 emit_loadreg(rs1[i]|64,th);
2649               }else{
2650                 emit_mov(sh,th);
2651               }
2652             }
2653             if(opcode[i]==0x0d) //ORI
2654             if(sl<0) {
2655               emit_orimm(tl,imm[i],tl);
2656             }else{
2657               if(!((i_regs->wasconst>>sl)&1))
2658                 emit_orimm(sl,imm[i],tl);
2659               else
2660                 emit_movimm(constmap[i][sl]|imm[i],tl);
2661             }
2662             if(opcode[i]==0x0e) //XORI
2663             if(sl<0) {
2664               emit_xorimm(tl,imm[i],tl);
2665             }else{
2666               if(!((i_regs->wasconst>>sl)&1))
2667                 emit_xorimm(sl,imm[i],tl);
2668               else
2669                 emit_movimm(constmap[i][sl]^imm[i],tl);
2670             }
2671           }
2672           else {
2673             emit_movimm(imm[i],tl);
2674             if(th>=0) emit_zeroreg(th);
2675           }
2676         }
2677       }
2678     }
2679   }
2680 }
2681
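// Emit code for shift-by-immediate instructions: SLL/SRL/SRA and the 64-bit
// DSLL/DSRL/DSRA and DSLL32/DSRL32/DSRA32 forms.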
2682 void shiftimm_assemble(int i,struct regstat *i_regs)
2683 {
2684   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2685   {
2686     if(rt1[i]) {
2687       signed char s,t;
2688       t=get_reg(i_regs->regmap,rt1[i]);
2689       s=get_reg(i_regs->regmap,rs1[i]);
2690       //assert(t>=0);
2691       if(t>=0){
2692         if(rs1[i]==0)
2693         {
2694           emit_zeroreg(t);
2695         }
2696         else
2697         {
2698           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2699           if(imm[i]) {
2700             if(opcode2[i]==0) // SLL
2701             {
2702               emit_shlimm(s<0?t:s,imm[i],t);
2703             }
2704             if(opcode2[i]==2) // SRL
2705             {
2706               emit_shrimm(s<0?t:s,imm[i],t);
2707             }
2708             if(opcode2[i]==3) // SRA
2709             {
2710               emit_sarimm(s<0?t:s,imm[i],t);
2711             }
2712           }else{
2713             // Shift by zero
2714             if(s>=0 && s!=t) emit_mov(s,t);
2715           }
2716         }
2717       }
2718       //emit_storereg(rt1[i],t); //DEBUG
2719     }
2720   }
2721   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2722   {
2723     if(rt1[i]) {
2724       signed char sh,sl,th,tl;
2725       th=get_reg(i_regs->regmap,rt1[i]|64);
2726       tl=get_reg(i_regs->regmap,rt1[i]);
2727       sh=get_reg(i_regs->regmap,rs1[i]|64);
2728       sl=get_reg(i_regs->regmap,rs1[i]);
2729       if(tl>=0) {
2730         if(rs1[i]==0)
2731         {
2732           emit_zeroreg(tl);
2733           if(th>=0) emit_zeroreg(th);
2734         }
2735         else
2736         {
2737           assert(sl>=0);
2738           assert(sh>=0);
2739           if(imm[i]) {
2740             if(opcode2[i]==0x38) // DSLL
2741             {
2742               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2743               emit_shlimm(sl,imm[i],tl);
2744             }
2745             if(opcode2[i]==0x3a) // DSRL
2746             {
2747               emit_shrdimm(sl,sh,imm[i],tl);
2748               if(th>=0) emit_shrimm(sh,imm[i],th);
2749             }
2750             if(opcode2[i]==0x3b) // DSRA
2751             {
2752               emit_shrdimm(sl,sh,imm[i],tl);
2753               if(th>=0) emit_sarimm(sh,imm[i],th);
2754             }
2755           }else{
2756             // Shift by zero
2757             if(sl!=tl) emit_mov(sl,tl);
2758             if(th>=0&&sh!=th) emit_mov(sh,th);
2759           }
2760         }
2761       }
2762     }
2763   }
2764   if(opcode2[i]==0x3c) // DSLL32
2765   {
2766     if(rt1[i]) {
2767       signed char sl,tl,th;
2768       tl=get_reg(i_regs->regmap,rt1[i]);
2769       th=get_reg(i_regs->regmap,rt1[i]|64);
2770       sl=get_reg(i_regs->regmap,rs1[i]);
2771       if(th>=0||tl>=0){
2772         assert(tl>=0);
2773         assert(th>=0);
2774         assert(sl>=0);
2775         emit_mov(sl,th);
2776         emit_zeroreg(tl);
2777         if(imm[i]>32)
2778         {
2779           emit_shlimm(th,imm[i]&31,th);
2780         }
2781       }
2782     }
2783   }
2784   if(opcode2[i]==0x3e) // DSRL32
2785   {
2786     if(rt1[i]) {
2787       signed char sh,tl,th;
2788       tl=get_reg(i_regs->regmap,rt1[i]);
2789       th=get_reg(i_regs->regmap,rt1[i]|64);
2790       sh=get_reg(i_regs->regmap,rs1[i]|64);
2791       if(tl>=0){
2792         assert(sh>=0);
2793         emit_mov(sh,tl);
2794         if(th>=0) emit_zeroreg(th);
2795         if(imm[i]>32)
2796         {
2797           emit_shrimm(tl,imm[i]&31,tl);
2798         }
2799       }
2800     }
2801   }
2802   if(opcode2[i]==0x3f) // DSRA32
2803   {
2804     if(rt1[i]) {
2805       signed char sh,tl;
2806       tl=get_reg(i_regs->regmap,rt1[i]);
2807       sh=get_reg(i_regs->regmap,rs1[i]|64);
2808       if(tl>=0){
2809         assert(sh>=0);
2810         emit_mov(sh,tl);
2811         if(imm[i]>32)
2812         {
2813           emit_sarimm(tl,imm[i]&31,tl);
2814         }
2815       }
2816     }
2817   }
2818 }
2819
2820 #ifndef shift_assemble
2821 void shift_assemble(int i,struct regstat *i_regs)
2822 {
2823   printf("Need shift_assemble for this architecture.\n");
2824   exit(1);
2825 }
2826 #endif
2827
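// Emit code for loads.  The fast path reads RAM directly after a range check
// (or TLB lookup when using_tlb); out-of-range accesses go through a stub
// registered with add_stub(), or through inline_readstub() when the address
// is a known constant outside RAM.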
2828 void load_assemble(int i,struct regstat *i_regs)
2829 {
2830   int s,th,tl,addr,map=-1;
2831   int offset;
2832   int jaddr=0;
2833   int memtarget=0,c=0;
2834   int fastload_reg_override=0;
2835   u_int hr,reglist=0;
2836   th=get_reg(i_regs->regmap,rt1[i]|64);
2837   tl=get_reg(i_regs->regmap,rt1[i]);
2838   s=get_reg(i_regs->regmap,rs1[i]);
2839   offset=imm[i];
2840   for(hr=0;hr<HOST_REGS;hr++) {
2841     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2842   }
2843   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2844   if(s>=0) {
2845     c=(i_regs->wasconst>>s)&1;
2846     if (c) {
2847       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2848       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2849     }
2850   }
2851   //printf("load_assemble: c=%d\n",c);
2852   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2853   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2854 #ifdef PCSX
2855   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2856     ||rt1[i]==0) {
2857       // could be FIFO, must perform the read
2858       // or a dummy read to r0, which must still be performed
2859       assem_debug("(forced read)\n");
2860       tl=get_reg(i_regs->regmap,-1);
2861       assert(tl>=0);
2862   }
2863 #endif
2864   if(offset||s<0||c) addr=tl;
2865   else addr=s;
2866   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2867  if(tl>=0) {
2868   //printf("load_assemble: c=%d\n",c);
2869   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2870   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2871   reglist&=~(1<<tl);
2872   if(th>=0) reglist&=~(1<<th);
2873   if(!using_tlb) {
2874     if(!c) {
2875       #ifdef RAM_OFFSET
2876       map=get_reg(i_regs->regmap,ROREG);
2877       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2878       #endif
2879 //#define R29_HACK 1
2880       #ifdef R29_HACK
2881       // Strmnnrmn's speed hack
2882       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2883       #endif
2884       {
2885         #ifdef PCSX
2886         if(sp_in_mirror&&rs1[i]==29) {
2887           emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
2888           emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
2889           fastload_reg_override=HOST_TEMPREG;
2890         }
2891         else
2892         #endif
2893         emit_cmpimm(addr,RAM_SIZE);
2894         jaddr=(int)out;
2895         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2896         // Hint to branch predictor that the branch is unlikely to be taken
2897         if(rs1[i]>=28)
2898           emit_jno_unlikely(0);
2899         else
2900         #endif
2901         emit_jno(0);
2902       }
2903     }
2904   }else{ // using tlb
2905     int x=0;
2906     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2907     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2908     map=get_reg(i_regs->regmap,TLREG);
2909     assert(map>=0);
2910     reglist&=~(1<<map);
2911     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2912     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2913   }
2914   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2915   if (opcode[i]==0x20) { // LB
2916     if(!c||memtarget) {
2917       if(!dummy) {
2918         #ifdef HOST_IMM_ADDR32
2919         if(c)
2920           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2921         else
2922         #endif
2923         {
2924           //emit_xorimm(addr,3,tl);
2925           //gen_tlb_addr_r(tl,map);
2926           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2927           int x=0,a=tl;
2928 #ifdef BIG_ENDIAN_MIPS
2929           if(!c) emit_xorimm(addr,3,tl);
2930           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2931 #else
2932           if(!c) a=addr;
2933 #endif
2934           if(fastload_reg_override) a=fastload_reg_override;
2935
2936           emit_movsbl_indexed_tlb(x,a,map,tl);
2937         }
2938       }
2939       if(jaddr)
2940         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2941     }
2942     else
2943       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2944   }
2945   if (opcode[i]==0x21) { // LH
2946     if(!c||memtarget) {
2947       if(!dummy) {
2948         #ifdef HOST_IMM_ADDR32
2949         if(c)
2950           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2951         else
2952         #endif
2953         {
2954           int x=0,a=tl;
2955 #ifdef BIG_ENDIAN_MIPS
2956           if(!c) emit_xorimm(addr,2,tl);
2957           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2958 #else
2959           if(!c) a=addr;
2960 #endif
2961           if(fastload_reg_override) a=fastload_reg_override;
2962           //#ifdef
2963           //emit_movswl_indexed_tlb(x,tl,map,tl);
2964           //else
2965           if(map>=0) {
2966             gen_tlb_addr_r(a,map);
2967             emit_movswl_indexed(x,a,tl);
2968           }else{
2969             #ifdef RAM_OFFSET
2970             emit_movswl_indexed(x,a,tl);
2971             #else
2972             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2973             #endif
2974           }
2975         }
2976       }
2977       if(jaddr)
2978         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2979     }
2980     else
2981       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2982   }
2983   if (opcode[i]==0x23) { // LW
2984     if(!c||memtarget) {
2985       if(!dummy) {
2986         int a=addr;
2987         if(fastload_reg_override) a=fastload_reg_override;
2988         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2989         #ifdef HOST_IMM_ADDR32
2990         if(c)
2991           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2992         else
2993         #endif
2994         emit_readword_indexed_tlb(0,a,map,tl);
2995       }
2996       if(jaddr)
2997         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2998     }
2999     else
3000       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3001   }
3002   if (opcode[i]==0x24) { // LBU
3003     if(!c||memtarget) {
3004       if(!dummy) {
3005         #ifdef HOST_IMM_ADDR32
3006         if(c)
3007           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
3008         else
3009         #endif
3010         {
3011           //emit_xorimm(addr,3,tl);
3012           //gen_tlb_addr_r(tl,map);
3013           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
3014           int x=0,a=tl;
3015 #ifdef BIG_ENDIAN_MIPS
3016           if(!c) emit_xorimm(addr,3,tl);
3017           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3018 #else
3019           if(!c) a=addr;
3020 #endif
3021           if(fastload_reg_override) a=fastload_reg_override;
3022
3023           emit_movzbl_indexed_tlb(x,a,map,tl);
3024         }
3025       }
3026       if(jaddr)
3027         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3028     }
3029     else
3030       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3031   }
3032   if (opcode[i]==0x25) { // LHU
3033     if(!c||memtarget) {
3034       if(!dummy) {
3035         #ifdef HOST_IMM_ADDR32
3036         if(c)
3037           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3038         else
3039         #endif
3040         {
3041           int x=0,a=tl;
3042 #ifdef BIG_ENDIAN_MIPS
3043           if(!c) emit_xorimm(addr,2,tl);
3044           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3045 #else
3046           if(!c) a=addr;
3047 #endif
3048           if(fastload_reg_override) a=fastload_reg_override;
3049           //#ifdef
3050           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3051           //#else
3052           if(map>=0) {
3053             gen_tlb_addr_r(a,map);
3054             emit_movzwl_indexed(x,a,tl);
3055           }else{
3056             #ifdef RAM_OFFSET
3057             emit_movzwl_indexed(x,a,tl);
3058             #else
3059             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3060             #endif
3061           }
3062         }
3063       }
3064       if(jaddr)
3065         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3066     }
3067     else
3068       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3069   }
3070   if (opcode[i]==0x27) { // LWU
3071     assert(th>=0);
3072     if(!c||memtarget) {
3073       if(!dummy) {
3074         int a=addr;
3075         if(fastload_reg_override) a=fastload_reg_override;
3076         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3077         #ifdef HOST_IMM_ADDR32
3078         if(c)
3079           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3080         else
3081         #endif
3082         emit_readword_indexed_tlb(0,a,map,tl);
3083       }
3084       if(jaddr)
3085         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3086     }
3087     else {
3088       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3089     }
3090     emit_zeroreg(th);
3091   }
3092   if (opcode[i]==0x37) { // LD
3093     if(!c||memtarget) {
3094       if(!dummy) {
3095         int a=addr;
3096         if(fastload_reg_override) a=fastload_reg_override;
3097         //gen_tlb_addr_r(tl,map);
3098         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3099         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3100         #ifdef HOST_IMM_ADDR32
3101         if(c)
3102           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3103         else
3104         #endif
3105         emit_readdword_indexed_tlb(0,a,map,th,tl);
3106       }
3107       if(jaddr)
3108         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3109     }
3110     else
3111       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3112   }
3113  }
3114   //emit_storereg(rt1[i],tl); // DEBUG
3115   //if(opcode[i]==0x23)
3116   //if(opcode[i]==0x24)
3117   //if(opcode[i]==0x23||opcode[i]==0x24)
3118   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3119   {
3120     //emit_pusha();
3121     save_regs(0x100f);
3122         emit_readword((int)&last_count,ECX);
3123         #ifdef __i386__
3124         if(get_reg(i_regs->regmap,CCREG)<0)
3125           emit_loadreg(CCREG,HOST_CCREG);
3126         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3127         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3128         emit_writeword(HOST_CCREG,(int)&Count);
3129         #endif
3130         #ifdef __arm__
3131         if(get_reg(i_regs->regmap,CCREG)<0)
3132           emit_loadreg(CCREG,0);
3133         else
3134           emit_mov(HOST_CCREG,0);
3135         emit_add(0,ECX,0);
3136         emit_addimm(0,2*ccadj[i],0);
3137         emit_writeword(0,(int)&Count);
3138         #endif
3139     emit_call((int)memdebug);
3140     //emit_popa();
3141     restore_regs(0x100f);
3142   }/**/
3143 }
3144
3145 #ifndef loadlr_assemble
3146 void loadlr_assemble(int i,struct regstat *i_regs)
3147 {
3148   printf("Need loadlr_assemble for this architecture.\n");
3149   exit(1);
3150 }
3151 #endif
3152
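// Emit code for aligned stores (SB/SH/SW/SD).  Besides the RAM range check
// or TLB lookup, the fast path also tests invalid_code[] so that writes into
// already-compiled blocks invalidate them.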
3153 void store_assemble(int i,struct regstat *i_regs)
3154 {
3155   int s,th,tl,map=-1;
3156   int addr,temp;
3157   int offset;
3158   int jaddr=0,jaddr2,type;
3159   int memtarget=0,c=0;
3160   int agr=AGEN1+(i&1);
3161   int faststore_reg_override=0;
3162   u_int hr,reglist=0;
3163   th=get_reg(i_regs->regmap,rs2[i]|64);
3164   tl=get_reg(i_regs->regmap,rs2[i]);
3165   s=get_reg(i_regs->regmap,rs1[i]);
3166   temp=get_reg(i_regs->regmap,agr);
3167   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3168   offset=imm[i];
3169   if(s>=0) {
3170     c=(i_regs->wasconst>>s)&1;
3171     if(c) {
3172       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3173       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3174     }
3175   }
3176   assert(tl>=0);
3177   assert(temp>=0);
3178   for(hr=0;hr<HOST_REGS;hr++) {
3179     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3180   }
3181   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3182   if(offset||s<0||c) addr=temp;
3183   else addr=s;
3184   if(!using_tlb) {
3185     if(!c) {
3186       #ifdef PCSX
3187       if(sp_in_mirror&&rs1[i]==29) {
3188         emit_andimm(addr,~0x00e00000,HOST_TEMPREG);
3189         emit_cmpimm(HOST_TEMPREG,RAM_SIZE);
3190         faststore_reg_override=HOST_TEMPREG;
3191       }
3192       else
3193       #endif
3194       #ifdef R29_HACK
3195       // Strmnnrmn's speed hack
3196       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3197       #endif
3198       emit_cmpimm(addr,RAM_SIZE);
3199       #ifdef DESTRUCTIVE_SHIFT
3200       if(s==addr) emit_mov(s,temp);
3201       #endif
3202       #ifdef R29_HACK
3203       memtarget=1;
3204       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3205       #endif
3206       {
3207         jaddr=(int)out;
3208         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3209         // Hint to branch predictor that the branch is unlikely to be taken
3210         if(rs1[i]>=28)
3211           emit_jno_unlikely(0);
3212         else
3213         #endif
3214         emit_jno(0);
3215       }
3216     }
3217   }else{ // using tlb
3218     int x=0;
3219     if (opcode[i]==0x28) x=3; // SB
3220     if (opcode[i]==0x29) x=2; // SH
3221     map=get_reg(i_regs->regmap,TLREG);
3222     assert(map>=0);
3223     reglist&=~(1<<map);
3224     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3225     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3226   }
3227
3228   if (opcode[i]==0x28) { // SB
3229     if(!c||memtarget) {
3230       int x=0,a=temp;
3231 #ifdef BIG_ENDIAN_MIPS
3232       if(!c) emit_xorimm(addr,3,temp);
3233       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3234 #else
3235       if(!c) a=addr;
3236 #endif
3237       if(faststore_reg_override) a=faststore_reg_override;
3238       //gen_tlb_addr_w(temp,map);
3239       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3240       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3241     }
3242     type=STOREB_STUB;
3243   }
3244   if (opcode[i]==0x29) { // SH
3245     if(!c||memtarget) {
3246       int x=0,a=temp;
3247 #ifdef BIG_ENDIAN_MIPS
3248       if(!c) emit_xorimm(addr,2,temp);
3249       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3250 #else
3251       if(!c) a=addr;
3252 #endif
3253       if(faststore_reg_override) a=faststore_reg_override;
3254       //#ifdef
3255       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3256       //#else
3257       if(map>=0) {
3258         gen_tlb_addr_w(a,map);
3259         emit_writehword_indexed(tl,x,a);
3260       }else
3261         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3262     }
3263     type=STOREH_STUB;
3264   }
3265   if (opcode[i]==0x2B) { // SW
3266     if(!c||memtarget) {
3267       int a=addr;
3268       if(faststore_reg_override) a=faststore_reg_override;
3269       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3270       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3271     }
3272     type=STOREW_STUB;
3273   }
3274   if (opcode[i]==0x3F) { // SD
3275     if(!c||memtarget) {
3276       int a=addr;
3277       if(faststore_reg_override) a=faststore_reg_override;
3278       if(rs2[i]) {
3279         assert(th>=0);
3280         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3281         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3282         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3283       }else{
3284         // Store zero
3285         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3286         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3287         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3288       }
3289     }
3290     type=STORED_STUB;
3291   }
3292 #ifdef PCSX
3293   if(jaddr) {
3294     // PCSX store handlers don't check invcode again
3295     reglist|=1<<addr;
3296     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3297     jaddr=0;
3298   }
3299 #endif
3300   if(!using_tlb) {
3301     if(!c||memtarget) {
3302       #ifdef DESTRUCTIVE_SHIFT
3303       // The x86 shift operation is 'destructive'; it overwrites the
3304       // source register, so we need to make a copy first and use that.
3305       addr=temp;
3306       #endif
3307       #if defined(HOST_IMM8)
3308       int ir=get_reg(i_regs->regmap,INVCP);
3309       assert(ir>=0);
3310       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3311       #else
3312       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3313       #endif
3314       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3315       emit_callne(invalidate_addr_reg[addr]);
3316       #else
3317       jaddr2=(int)out;
3318       emit_jne(0);
3319       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3320       #endif
3321     }
3322   }
3323   if(jaddr) {
3324     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3325   } else if(c&&!memtarget) {
3326     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3327   }
3328   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3329   //if(opcode[i]==0x2B || opcode[i]==0x28)
3330   //if(opcode[i]==0x2B || opcode[i]==0x29)
3331   //if(opcode[i]==0x2B)
3332   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3333   {
3334     #ifdef __i386__
3335     emit_pusha();
3336     #endif
3337     #ifdef __arm__
3338     save_regs(0x100f);
3339     #endif
3340         emit_readword((int)&last_count,ECX);
3341         #ifdef __i386__
3342         if(get_reg(i_regs->regmap,CCREG)<0)
3343           emit_loadreg(CCREG,HOST_CCREG);
3344         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3345         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3346         emit_writeword(HOST_CCREG,(int)&Count);
3347         #endif
3348         #ifdef __arm__
3349         if(get_reg(i_regs->regmap,CCREG)<0)
3350           emit_loadreg(CCREG,0);
3351         else
3352           emit_mov(HOST_CCREG,0);
3353         emit_add(0,ECX,0);
3354         emit_addimm(0,2*ccadj[i],0);
3355         emit_writeword(0,(int)&Count);
3356         #endif
3357     emit_call((int)memdebug);
3358     #ifdef __i386__
3359     emit_popa();
3360     #endif
3361     #ifdef __arm__
3362     restore_regs(0x100f);
3363     #endif
3364   }/**/
3365 }
3366
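// Emit code for the unaligned stores SWL/SWR/SDL/SDR, branching on the low
// two address bits to one of four alignment cases.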
3367 void storelr_assemble(int i,struct regstat *i_regs)
3368 {
3369   int s,th,tl;
3370   int temp;
3371   int temp2;
3372   int offset;
3373   int jaddr=0,jaddr2;
3374   int case1,case2,case3;
3375   int done0,done1,done2;
3376   int memtarget=0,c=0;
3377   int agr=AGEN1+(i&1);
3378   u_int hr,reglist=0;
3379   th=get_reg(i_regs->regmap,rs2[i]|64);
3380   tl=get_reg(i_regs->regmap,rs2[i]);
3381   s=get_reg(i_regs->regmap,rs1[i]);
3382   temp=get_reg(i_regs->regmap,agr);
3383   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3384   offset=imm[i];
3385   if(s>=0) {
3386     c=(i_regs->isconst>>s)&1;
3387     if(c) {
3388       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3389       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3390     }
3391   }
3392   assert(tl>=0);
3393   for(hr=0;hr<HOST_REGS;hr++) {
3394     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3395   }
3396   assert(temp>=0);
3397   if(!using_tlb) {
3398     if(!c) {
3399       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3400       if(!offset&&s!=temp) emit_mov(s,temp);
3401       jaddr=(int)out;
3402       emit_jno(0);
3403     }
3404     else
3405     {
3406       if(!memtarget||!rs1[i]) {
3407         jaddr=(int)out;
3408         emit_jmp(0);
3409       }
3410     }
3411     #ifdef RAM_OFFSET
3412     int map=get_reg(i_regs->regmap,ROREG);
3413     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3414     gen_tlb_addr_w(temp,map);
3415     #else
3416     if((u_int)rdram!=0x80000000) 
3417       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3418     #endif
3419   }else{ // using tlb
3420     int map=get_reg(i_regs->regmap,TLREG);
3421     assert(map>=0);
3422     reglist&=~(1<<map);
3423     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3424     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3425     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3426     if(!jaddr&&!memtarget) {
3427       jaddr=(int)out;
3428       emit_jmp(0);
3429     }
3430     gen_tlb_addr_w(temp,map);
3431   }
3432
3433   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3434     temp2=get_reg(i_regs->regmap,FTEMP);
3435     if(!rs2[i]) temp2=th=tl;
3436   }
3437
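  // For a little-endian target (BIG_ENDIAN_MIPS not defined) the two
  // low address bits are inverted so the big-endian byte-selection
  // cases below still pick the right bytes.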
3438 #ifndef BIG_ENDIAN_MIPS
3439     emit_xorimm(temp,3,temp);
3440 #endif
3441   emit_testimm(temp,2);
3442   case2=(int)out;
3443   emit_jne(0);
3444   emit_testimm(temp,1);
3445   case1=(int)out;
3446   emit_jne(0);
3447   // 0
3448   if (opcode[i]==0x2A) { // SWL
3449     emit_writeword_indexed(tl,0,temp);
3450   }
3451   if (opcode[i]==0x2E) { // SWR
3452     emit_writebyte_indexed(tl,3,temp);
3453   }
3454   if (opcode[i]==0x2C) { // SDL
3455     emit_writeword_indexed(th,0,temp);
3456     if(rs2[i]) emit_mov(tl,temp2);
3457   }
3458   if (opcode[i]==0x2D) { // SDR
3459     emit_writebyte_indexed(tl,3,temp);
3460     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3461   }
3462   done0=(int)out;
3463   emit_jmp(0);
3464   // 1
3465   set_jump_target(case1,(int)out);
3466   if (opcode[i]==0x2A) { // SWL
3467     // Write 3 msb into three least significant bytes
3468     if(rs2[i]) emit_rorimm(tl,8,tl);
3469     emit_writehword_indexed(tl,-1,temp);
3470     if(rs2[i]) emit_rorimm(tl,16,tl);
3471     emit_writebyte_indexed(tl,1,temp);
3472     if(rs2[i]) emit_rorimm(tl,8,tl);
3473   }
3474   if (opcode[i]==0x2E) { // SWR
3475     // Write two lsb into two most significant bytes
3476     emit_writehword_indexed(tl,1,temp);
3477   }
3478   if (opcode[i]==0x2C) { // SDL
3479     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3480     // Write 3 msb into three least significant bytes
3481     if(rs2[i]) emit_rorimm(th,8,th);
3482     emit_writehword_indexed(th,-1,temp);
3483     if(rs2[i]) emit_rorimm(th,16,th);
3484     emit_writebyte_indexed(th,1,temp);
3485     if(rs2[i]) emit_rorimm(th,8,th);
3486   }
3487   if (opcode[i]==0x2D) { // SDR
3488     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3489     // Write two lsb into two most significant bytes
3490     emit_writehword_indexed(tl,1,temp);
3491   }
3492   done1=(int)out;
3493   emit_jmp(0);
3494   // 2
3495   set_jump_target(case2,(int)out);
3496   emit_testimm(temp,1);
3497   case3=(int)out;
3498   emit_jne(0);
3499   if (opcode[i]==0x2A) { // SWL
3500     // Write two msb into two least significant bytes
3501     if(rs2[i]) emit_rorimm(tl,16,tl);
3502     emit_writehword_indexed(tl,-2,temp);
3503     if(rs2[i]) emit_rorimm(tl,16,tl);
3504   }
3505   if (opcode[i]==0x2E) { // SWR
3506     // Write 3 lsb into three most significant bytes
3507     emit_writebyte_indexed(tl,-1,temp);
3508     if(rs2[i]) emit_rorimm(tl,8,tl);
3509     emit_writehword_indexed(tl,0,temp);
3510     if(rs2[i]) emit_rorimm(tl,24,tl);
3511   }
3512   if (opcode[i]==0x2C) { // SDL
3513     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3514     // Write two msb into two least significant bytes
3515     if(rs2[i]) emit_rorimm(th,16,th);
3516     emit_writehword_indexed(th,-2,temp);
3517     if(rs2[i]) emit_rorimm(th,16,th);
3518   }
3519   if (opcode[i]==0x2D) { // SDR
3520     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3521     // Write 3 lsb into three most significant bytes
3522     emit_writebyte_indexed(tl,-1,temp);
3523     if(rs2[i]) emit_rorimm(tl,8,tl);
3524     emit_writehword_indexed(tl,0,temp);
3525     if(rs2[i]) emit_rorimm(tl,24,tl);
3526   }
3527   done2=(int)out;
3528   emit_jmp(0);
3529   // 3
3530   set_jump_target(case3,(int)out);
3531   if (opcode[i]==0x2A) { // SWL
3532     // Write msb into least significant byte
3533     if(rs2[i]) emit_rorimm(tl,24,tl);
3534     emit_writebyte_indexed(tl,-3,temp);
3535     if(rs2[i]) emit_rorimm(tl,8,tl);
3536   }
3537   if (opcode[i]==0x2E) { // SWR
3538     // Write entire word
3539     emit_writeword_indexed(tl,-3,temp);
3540   }
3541   if (opcode[i]==0x2C) { // SDL
3542     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3543     // Write msb into least significant byte
3544     if(rs2[i]) emit_rorimm(th,24,th);
3545     emit_writebyte_indexed(th,-3,temp);
3546     if(rs2[i]) emit_rorimm(th,8,th);
3547   }
3548   if (opcode[i]==0x2D) { // SDR
3549     if(rs2[i]) emit_mov(th,temp2);
3550     // Write entire word
3551     emit_writeword_indexed(tl,-3,temp);
3552   }
3553   set_jump_target(done0,(int)out);
3554   set_jump_target(done1,(int)out);
3555   set_jump_target(done2,(int)out);
3556   if (opcode[i]==0x2C) { // SDL
3557     emit_testimm(temp,4);
3558     done0=(int)out;
3559     emit_jne(0);
3560     emit_andimm(temp,~3,temp);
3561     emit_writeword_indexed(temp2,4,temp);
3562     set_jump_target(done0,(int)out);
3563   }
3564   if (opcode[i]==0x2D) { // SDR
3565     emit_testimm(temp,4);
3566     done0=(int)out;
3567     emit_jeq(0);
3568     emit_andimm(temp,~3,temp);
3569     emit_writeword_indexed(temp2,-4,temp);
3570     set_jump_target(done0,(int)out);
3571   }
3572   if(!c||!memtarget)
3573     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3574   if(!using_tlb) {
3575     #ifdef RAM_OFFSET
3576     int map=get_reg(i_regs->regmap,ROREG);
3577     if(map<0) map=HOST_TEMPREG;
3578     gen_orig_addr_w(temp,map);
3579     #else
3580     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3581     #endif
3582     #if defined(HOST_IMM8)
3583     int ir=get_reg(i_regs->regmap,INVCP);
3584     assert(ir>=0);
3585     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3586     #else
3587     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3588     #endif
3589     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3590     emit_callne(invalidate_addr_reg[temp]);
3591     #else
3592     jaddr2=(int)out;
3593     emit_jne(0);
3594     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3595     #endif
3596   }
3597   /*
3598     emit_pusha();
3599     //save_regs(0x100f);
3600         emit_readword((int)&last_count,ECX);
3601         if(get_reg(i_regs->regmap,CCREG)<0)
3602           emit_loadreg(CCREG,HOST_CCREG);
3603         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3604         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3605         emit_writeword(HOST_CCREG,(int)&Count);
3606     emit_call((int)memdebug);
3607     emit_popa();
3608     //restore_regs(0x100f);
3609   /**/
3610 }
3611
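// c1ls_assemble: COP1 loads and stores (LWC1/LDC1/SWC1/SDC1).  The
// address of the FPU register is fetched from reg_cop1_simple /
// reg_cop1_double, the value moves through the FTEMP host register,
// and an FP_STUB is added if the coprocessor is not yet marked usable.
// When DISABLE_COP1 is defined this falls through to cop1_unusable.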
3612 void c1ls_assemble(int i,struct regstat *i_regs)
3613 {
3614 #ifndef DISABLE_COP1
3615   int s,th,tl;
3616   int temp,ar;
3617   int map=-1;
3618   int offset;
3619   int c=0;
3620   int jaddr,jaddr2=0,jaddr3,type;
3621   int agr=AGEN1+(i&1);
3622   u_int hr,reglist=0;
3623   th=get_reg(i_regs->regmap,FTEMP|64);
3624   tl=get_reg(i_regs->regmap,FTEMP);
3625   s=get_reg(i_regs->regmap,rs1[i]);
3626   temp=get_reg(i_regs->regmap,agr);
3627   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3628   offset=imm[i];
3629   assert(tl>=0);
3630   assert(rs1[i]>0);
3631   assert(temp>=0);
3632   for(hr=0;hr<HOST_REGS;hr++) {
3633     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3634   }
3635   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3636   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3637   {
3638     // Loads use a temporary register which we need to save
3639     reglist|=1<<temp;
3640   }
3641   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3642     ar=temp;
3643   else // LWC1/LDC1
3644     ar=tl;
3645   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3646   //else c=(i_regs->wasconst>>s)&1;
3647   if(s>=0) c=(i_regs->wasconst>>s)&1;
3648   // Check cop1 unusable
3649   if(!cop1_usable) {
3650     signed char rs=get_reg(i_regs->regmap,CSREG);
3651     assert(rs>=0);
3652     emit_testimm(rs,0x20000000);
3653     jaddr=(int)out;
3654     emit_jeq(0);
3655     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3656     cop1_usable=1;
3657   }
3658   if (opcode[i]==0x39) { // SWC1 (get float address)
3659     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3660   }
3661   if (opcode[i]==0x3D) { // SDC1 (get double address)
3662     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3663   }
3664   // Generate address + offset
3665   if(!using_tlb) {
3666     if(!c)
3667       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3668   }
3669   else
3670   {
3671     map=get_reg(i_regs->regmap,TLREG);
3672     assert(map>=0);
3673     reglist&=~(1<<map);
3674     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3675       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3676     }
3677     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3678       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3679     }
3680   }
3681   if (opcode[i]==0x39) { // SWC1 (read float)
3682     emit_readword_indexed(0,tl,tl);
3683   }
3684   if (opcode[i]==0x3D) { // SDC1 (read double)
3685     emit_readword_indexed(4,tl,th);
3686     emit_readword_indexed(0,tl,tl);
3687   }
3688   if (opcode[i]==0x31) { // LWC1 (get target address)
3689     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3690   }
3691   if (opcode[i]==0x35) { // LDC1 (get target address)
3692     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3693   }
3694   if(!using_tlb) {
3695     if(!c) {
3696       jaddr2=(int)out;
3697       emit_jno(0);
3698     }
3699     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3700       jaddr2=(int)out;
3701       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3702     }
3703     #ifdef DESTRUCTIVE_SHIFT
3704     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3705       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3706     }
3707     #endif
3708   }else{
3709     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3710       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3711     }
3712     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3713       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3714     }
3715   }
3716   if (opcode[i]==0x31) { // LWC1
3717     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3718     //gen_tlb_addr_r(ar,map);
3719     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3720     #ifdef HOST_IMM_ADDR32
3721     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3722     else
3723     #endif
3724     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3725     type=LOADW_STUB;
3726   }
3727   if (opcode[i]==0x35) { // LDC1
3728     assert(th>=0);
3729     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3730     //gen_tlb_addr_r(ar,map);
3731     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3732     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3733     #ifdef HOST_IMM_ADDR32
3734     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3735     else
3736     #endif
3737     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3738     type=LOADD_STUB;
3739   }
3740   if (opcode[i]==0x39) { // SWC1
3741     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3742     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3743     type=STOREW_STUB;
3744   }
3745   if (opcode[i]==0x3D) { // SDC1
3746     assert(th>=0);
3747     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3748     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3749     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3750     type=STORED_STUB;
3751   }
3752   if(!using_tlb) {
3753     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3754       #ifndef DESTRUCTIVE_SHIFT
3755       temp=offset||c||s<0?ar:s;
3756       #endif
3757       #if defined(HOST_IMM8)
3758       int ir=get_reg(i_regs->regmap,INVCP);
3759       assert(ir>=0);
3760       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3761       #else
3762       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3763       #endif
3764       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3765       emit_callne(invalidate_addr_reg[temp]);
3766       #else
3767       jaddr3=(int)out;
3768       emit_jne(0);
3769       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3770       #endif
3771     }
3772   }
3773   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3774   if (opcode[i]==0x31) { // LWC1 (write float)
3775     emit_writeword_indexed(tl,0,temp);
3776   }
3777   if (opcode[i]==0x35) { // LDC1 (write double)
3778     emit_writeword_indexed(th,4,temp);
3779     emit_writeword_indexed(tl,0,temp);
3780   }
3781   //if(opcode[i]==0x39)
3782   /*if(opcode[i]==0x39||opcode[i]==0x31)
3783   {
3784     emit_pusha();
3785         emit_readword((int)&last_count,ECX);
3786         if(get_reg(i_regs->regmap,CCREG)<0)
3787           emit_loadreg(CCREG,HOST_CCREG);
3788         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3789         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3790         emit_writeword(HOST_CCREG,(int)&Count);
3791     emit_call((int)memdebug);
3792     emit_popa();
3793   }/**/
3794 #else
3795   cop1_unusable(i, i_regs);
3796 #endif
3797 }
3798
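// c2ls_assemble: COP2 (GTE) loads and stores, LWC2/SWC2.  The GTE data
// register is transferred to/from the FTEMP host register with
// cop2_get_dreg/cop2_put_dreg; stores also run the invalid_code check
// so writes over translated code are caught.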
3799 void c2ls_assemble(int i,struct regstat *i_regs)
3800 {
3801   int s,tl;
3802   int ar;
3803   int offset;
3804   int memtarget=0,c=0;
3805   int jaddr2=0,jaddr3,type;
3806   int agr=AGEN1+(i&1);
3807   u_int hr,reglist=0;
3808   u_int copr=(source[i]>>16)&0x1f;
3809   s=get_reg(i_regs->regmap,rs1[i]);
3810   tl=get_reg(i_regs->regmap,FTEMP);
3811   offset=imm[i];
3812   assert(rs1[i]>0);
3813   assert(tl>=0);
3814   assert(!using_tlb);
3815
3816   for(hr=0;hr<HOST_REGS;hr++) {
3817     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3818   }
3819   if(i_regs->regmap[HOST_CCREG]==CCREG)
3820     reglist&=~(1<<HOST_CCREG);
3821
3822   // get the address
3823   if (opcode[i]==0x3a) { // SWC2
3824     ar=get_reg(i_regs->regmap,agr);
3825     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3826     reglist|=1<<ar;
3827   } else { // LWC2
3828     ar=tl;
3829   }
3830   if(s>=0) c=(i_regs->wasconst>>s)&1;
3831   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3832   if (!offset&&!c&&s>=0) ar=s;
3833   assert(ar>=0);
3834
3835   if (opcode[i]==0x3a) { // SWC2
3836     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3837     type=STOREW_STUB;
3838   }
3839   else
3840     type=LOADW_STUB;
3841
3842   if(c&&!memtarget) {
3843     jaddr2=(int)out;
3844     emit_jmp(0); // inline_readstub/inline_writestub?
3845   }
3846   else {
3847     if(!c) {
3848       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3849       jaddr2=(int)out;
3850       emit_jno(0);
3851     }
3852     if (opcode[i]==0x32) { // LWC2
3853       #ifdef HOST_IMM_ADDR32
3854       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3855       else
3856       #endif
3857       emit_readword_indexed(0,ar,tl);
3858     }
3859     if (opcode[i]==0x3a) { // SWC2
3860       #ifdef DESTRUCTIVE_SHIFT
3861       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3862       #endif
3863       emit_writeword_indexed(tl,0,ar);
3864     }
3865   }
3866   if(jaddr2)
3867     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3868   if (opcode[i]==0x3a) { // SWC2
3869 #if defined(HOST_IMM8)
3870     int ir=get_reg(i_regs->regmap,INVCP);
3871     assert(ir>=0);
3872     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3873 #else
3874     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3875 #endif
3876     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3877     emit_callne(invalidate_addr_reg[ar]);
3878     #else
3879     jaddr3=(int)out;
3880     emit_jne(0);
3881     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3882     #endif
3883   }
3884   if (opcode[i]==0x32) { // LWC2
3885     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3886   }
3887 }
3888
3889 #ifndef multdiv_assemble
3890 void multdiv_assemble(int i,struct regstat *i_regs)
3891 {
3892   printf("Need multdiv_assemble for this architecture.\n");
3893   exit(1);
3894 }
3895 #endif
3896
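// mov_assemble: MFHI/MFLO/MTHI/MTLO, assembled as a plain register
// move since HI and LO are kept in the normal register map.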
3897 void mov_assemble(int i,struct regstat *i_regs)
3898 {
3899   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3900   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3901   if(rt1[i]) {
3902     signed char sh,sl,th,tl;
3903     th=get_reg(i_regs->regmap,rt1[i]|64);
3904     tl=get_reg(i_regs->regmap,rt1[i]);
3905     //assert(tl>=0);
3906     if(tl>=0) {
3907       sh=get_reg(i_regs->regmap,rs1[i]|64);
3908       sl=get_reg(i_regs->regmap,rs1[i]);
3909       if(sl>=0) emit_mov(sl,tl);
3910       else emit_loadreg(rs1[i],tl);
3911       if(th>=0) {
3912         if(sh>=0) emit_mov(sh,th);
3913         else emit_loadreg(rs1[i]|64,th);
3914       }
3915     }
3916   }
3917 }
3918
3919 #ifndef fconv_assemble
3920 void fconv_assemble(int i,struct regstat *i_regs)
3921 {
3922   printf("Need fconv_assemble for this architecture.\n");
3923   exit(1);
3924 }
3925 #endif
3926
3927 #if 0
3928 void float_assemble(int i,struct regstat *i_regs)
3929 {
3930   printf("Need float_assemble for this architecture.\n");
3931   exit(1);
3932 }
3933 #endif
3934
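// The next three routines leave the translated block: each loads the
// PC, adds the accumulated cycle count, and jumps to its handler
// (jump_syscall_hle, jump_hlecall or jump_intcall).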
3935 void syscall_assemble(int i,struct regstat *i_regs)
3936 {
3937   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3938   assert(ccreg==HOST_CCREG);
3939   assert(!is_delayslot);
3940   emit_movimm(start+i*4,EAX); // Get PC
3941   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3942   emit_jmp((int)jump_syscall_hle); // XXX
3943 }
3944
3945 void hlecall_assemble(int i,struct regstat *i_regs)
3946 {
3947   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3948   assert(ccreg==HOST_CCREG);
3949   assert(!is_delayslot);
3950   emit_movimm(start+i*4+4,0); // Get PC
3951   emit_movimm((int)psxHLEt[source[i]&7],1);
3952   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3953   emit_jmp((int)jump_hlecall);
3954 }
3955
3956 void intcall_assemble(int i,struct regstat *i_regs)
3957 {
3958   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3959   assert(ccreg==HOST_CCREG);
3960   assert(!is_delayslot);
3961   emit_movimm(start+i*4,0); // Get PC
3962   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3963   emit_jmp((int)jump_intcall);
3964 }
3965
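// ds_assemble: assemble the instruction occupying a branch delay slot.
// It dispatches on itype with is_delayslot set so the individual
// assemblers can adjust their stub and exception handling.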
3966 void ds_assemble(int i,struct regstat *i_regs)
3967 {
3968   is_delayslot=1;
3969   switch(itype[i]) {
3970     case ALU:
3971       alu_assemble(i,i_regs);break;
3972     case IMM16:
3973       imm16_assemble(i,i_regs);break;
3974     case SHIFT:
3975       shift_assemble(i,i_regs);break;
3976     case SHIFTIMM:
3977       shiftimm_assemble(i,i_regs);break;
3978     case LOAD:
3979       load_assemble(i,i_regs);break;
3980     case LOADLR:
3981       loadlr_assemble(i,i_regs);break;
3982     case STORE:
3983       store_assemble(i,i_regs);break;
3984     case STORELR:
3985       storelr_assemble(i,i_regs);break;
3986     case COP0:
3987       cop0_assemble(i,i_regs);break;
3988     case COP1:
3989       cop1_assemble(i,i_regs);break;
3990     case C1LS:
3991       c1ls_assemble(i,i_regs);break;
3992     case COP2:
3993       cop2_assemble(i,i_regs);break;
3994     case C2LS:
3995       c2ls_assemble(i,i_regs);break;
3996     case C2OP:
3997       c2op_assemble(i,i_regs);break;
3998     case FCONV:
3999       fconv_assemble(i,i_regs);break;
4000     case FLOAT:
4001       float_assemble(i,i_regs);break;
4002     case FCOMP:
4003       fcomp_assemble(i,i_regs);break;
4004     case MULTDIV:
4005       multdiv_assemble(i,i_regs);break;
4006     case MOV:
4007       mov_assemble(i,i_regs);break;
4008     case SYSCALL:
4009     case HLECALL:
4010     case INTCALL:
4011     case SPAN:
4012     case UJUMP:
4013     case RJUMP:
4014     case CJUMP:
4015     case SJUMP:
4016     case FJUMP:
4017       printf("Jump in the delay slot.  This is probably a bug.\n");
4018   }
4019   is_delayslot=0;
4020 }
4021
4022 // Is the branch target a valid internal jump?
4023 int internal_branch(uint64_t i_is32,int addr)
4024 {
4025   if(addr&1) return 0; // Indirect (register) jump
4026   if(addr>=start && addr<start+slen*4-4)
4027   {
4028     int t=(addr-start)>>2;
4029     // Delay slots are not valid branch targets
4030     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4031     // 64 -> 32 bit transition requires a recompile
4032     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4033     {
4034       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4035       else printf("optimizable: yes\n");
4036     }*/
4037     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4038 #ifndef FORCE32
4039     if(requires_32bit[t]&~i_is32) return 0;
4040     else
4041 #endif
4042       return 1;
4043   }
4044   return 0;
4045 }
4046
4047 #ifndef wb_invalidate
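// Write back (then discard) dirty cached registers whose mapping in
// pre[] does not survive into entry[], and move values that merely
// changed host registers.  u/uu mark registers whose values are not
// needed and can be dropped without a writeback.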
4048 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4049   uint64_t u,uint64_t uu)
4050 {
4051   int hr;
4052   for(hr=0;hr<HOST_REGS;hr++) {
4053     if(hr!=EXCLUDE_REG) {
4054       if(pre[hr]!=entry[hr]) {
4055         if(pre[hr]>=0) {
4056           if((dirty>>hr)&1) {
4057             if(get_reg(entry,pre[hr])<0) {
4058               if(pre[hr]<64) {
4059                 if(!((u>>pre[hr])&1)) {
4060                   emit_storereg(pre[hr],hr);
4061                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4062                     emit_sarimm(hr,31,hr);
4063                     emit_storereg(pre[hr]|64,hr);
4064                   }
4065                 }
4066               }else{
4067                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4068                   emit_storereg(pre[hr],hr);
4069                 }
4070               }
4071             }
4072           }
4073         }
4074       }
4075     }
4076   }
4077   // Move from one register to another (no writeback)
4078   for(hr=0;hr<HOST_REGS;hr++) {
4079     if(hr!=EXCLUDE_REG) {
4080       if(pre[hr]!=entry[hr]) {
4081         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4082           int nr;
4083           if((nr=get_reg(entry,pre[hr]))>=0) {
4084             emit_mov(hr,nr);
4085           }
4086         }
4087       }
4088     }
4089   }
4090 }
4091 #endif
4092
4093 // Load the specified registers
4094 // This only loads the registers given as arguments because
4095 // we don't want to load things that will be overwritten
4096 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4097 {
4098   int hr;
4099   // Load 32-bit regs
4100   for(hr=0;hr<HOST_REGS;hr++) {
4101     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4102       if(entry[hr]!=regmap[hr]) {
4103         if(regmap[hr]==rs1||regmap[hr]==rs2)
4104         {
4105           if(regmap[hr]==0) {
4106             emit_zeroreg(hr);
4107           }
4108           else
4109           {
4110             emit_loadreg(regmap[hr],hr);
4111           }
4112         }
4113       }
4114     }
4115   }
4116   // Load 64-bit regs
4117   for(hr=0;hr<HOST_REGS;hr++) {
4118     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4119       if(entry[hr]!=regmap[hr]) {
4120         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4121         {
4122           assert(regmap[hr]!=64);
4123           if((is32>>(regmap[hr]&63))&1) {
4124             int lr=get_reg(regmap,regmap[hr]-64);
4125             if(lr>=0)
4126               emit_sarimm(lr,31,hr);
4127             else
4128               emit_loadreg(regmap[hr],hr);
4129           }
4130           else
4131           {
4132             emit_loadreg(regmap[hr],hr);
4133           }
4134         }
4135       }
4136     }
4137   }
4138 }
4139
4140 // Load registers prior to the start of a loop
4141 // so that they are not loaded within the loop
4142 static void loop_preload(signed char pre[],signed char entry[])
4143 {
4144   int hr;
4145   for(hr=0;hr<HOST_REGS;hr++) {
4146     if(hr!=EXCLUDE_REG) {
4147       if(pre[hr]!=entry[hr]) {
4148         if(entry[hr]>=0) {
4149           if(get_reg(pre,entry[hr])<0) {
4150             assem_debug("loop preload:\n");
4151             //printf("loop preload: %d\n",hr);
4152             if(entry[hr]==0) {
4153               emit_zeroreg(hr);
4154             }
4155             else if(entry[hr]<TEMPREG)
4156             {
4157               emit_loadreg(entry[hr],hr);
4158             }
4159             else if(entry[hr]-64<TEMPREG)
4160             {
4161               emit_loadreg(entry[hr],hr);
4162             }
4163           }
4164         }
4165       }
4166     }
4167   }
4168 }
4169
4170 // Generate address for load/store instruction
4171 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
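// If the base register holds a known constant, the full effective
// address is folded into a single movimm here; e.g. for a hypothetical
// "lw r2,8(r1)" with r1 known to be 0x80001000 this emits the
// equivalent of emit_movimm(0x80001008,ra) instead of an add at run
// time.  LWL/LWR and LDL/LDR get the constant masked down to the
// aligned word or doubleword.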
4172 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4173 {
4174   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4175     int ra=-1;
4176     int agr=AGEN1+(i&1);
4177     int mgr=MGEN1+(i&1);
4178     if(itype[i]==LOAD) {
4179       ra=get_reg(i_regs->regmap,rt1[i]);
4180       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4181       assert(ra>=0);
4182     }
4183     if(itype[i]==LOADLR) {
4184       ra=get_reg(i_regs->regmap,FTEMP);
4185     }
4186     if(itype[i]==STORE||itype[i]==STORELR) {
4187       ra=get_reg(i_regs->regmap,agr);
4188       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4189     }
4190     if(itype[i]==C1LS||itype[i]==C2LS) {
4191       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4192         ra=get_reg(i_regs->regmap,FTEMP);
4193       else { // SWC1/SDC1/SWC2/SDC2
4194         ra=get_reg(i_regs->regmap,agr);
4195         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4196       }
4197     }
4198     int rs=get_reg(i_regs->regmap,rs1[i]);
4199     int rm=get_reg(i_regs->regmap,TLREG);
4200     if(ra>=0) {
4201       int offset=imm[i];
4202       int c=(i_regs->wasconst>>rs)&1;
4203       if(rs1[i]==0) {
4204         // Using r0 as a base address
4205         /*if(rm>=0) {
4206           if(!entry||entry[rm]!=mgr) {
4207             generate_map_const(offset,rm);
4208           } // else did it in the previous cycle
4209         }*/
4210         if(!entry||entry[ra]!=agr) {
4211           if (opcode[i]==0x22||opcode[i]==0x26) {
4212             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4213           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4214             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4215           }else{
4216             emit_movimm(offset,ra);
4217           }
4218         } // else did it in the previous cycle
4219       }
4220       else if(rs<0) {
4221         if(!entry||entry[ra]!=rs1[i])
4222           emit_loadreg(rs1[i],ra);
4223         //if(!entry||entry[ra]!=rs1[i])
4224         //  printf("poor load scheduling!\n");
4225       }
4226       else if(c) {
4227         if(rm>=0) {
4228           if(!entry||entry[rm]!=mgr) {
4229             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4230               // Stores to memory go thru the mapper to detect self-modifying
4231               // code, loads don't.
4232               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4233                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4234                 generate_map_const(constmap[i][rs]+offset,rm);
4235             }else{
4236               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4237                 generate_map_const(constmap[i][rs]+offset,rm);
4238             }
4239           }
4240         }
4241         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4242           if(!entry||entry[ra]!=agr) {
4243             if (opcode[i]==0x22||opcode[i]==0x26) {
4244               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4245             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4246               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4247             }else{
4248               #ifdef HOST_IMM_ADDR32
4249               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4250                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4251               #endif
4252               emit_movimm(constmap[i][rs]+offset,ra);
4253             }
4254           } // else did it in the previous cycle
4255         } // else load_consts already did it
4256       }
4257       if(offset&&!c&&rs1[i]) {
4258         if(rs>=0) {
4259           emit_addimm(rs,offset,ra);
4260         }else{
4261           emit_addimm(ra,offset,ra);
4262         }
4263       }
4264     }
4265   }
4266   // Preload constants for next instruction
4267   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4268     int agr,ra;
4269     #ifndef HOST_IMM_ADDR32
4270     // Mapper entry
4271     agr=MGEN1+((i+1)&1);
4272     ra=get_reg(i_regs->regmap,agr);
4273     if(ra>=0) {
4274       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4275       int offset=imm[i+1];
4276       int c=(regs[i+1].wasconst>>rs)&1;
4277       if(c) {
4278         if(itype[i+1]==STORE||itype[i+1]==STORELR
4279            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4280           // Stores to memory go thru the mapper to detect self-modifying
4281           // code, loads don't.
4282           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4283              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4284             generate_map_const(constmap[i+1][rs]+offset,ra);
4285         }else{
4286           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4287             generate_map_const(constmap[i+1][rs]+offset,ra);
4288         }
4289       }
4290       /*else if(rs1[i]==0) {
4291         generate_map_const(offset,ra);
4292       }*/
4293     }
4294     #endif
4295     // Actual address
4296     agr=AGEN1+((i+1)&1);
4297     ra=get_reg(i_regs->regmap,agr);
4298     if(ra>=0) {
4299       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4300       int offset=imm[i+1];
4301       int c=(regs[i+1].wasconst>>rs)&1;
4302       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4303         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4304           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4305         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4306           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4307         }else{
4308           #ifdef HOST_IMM_ADDR32
4309           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4310              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4311           #endif
4312           emit_movimm(constmap[i+1][rs]+offset,ra);
4313         }
4314       }
4315       else if(rs1[i+1]==0) {
4316         // Using r0 as a base address
4317         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4318           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4319         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4320           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4321         }else{
4322           emit_movimm(offset,ra);
4323         }
4324       }
4325     }
4326   }
4327 }
4328
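// get_final_value: follow a host register holding a constant forward
// through the block (same mapping, still constant, no branch target in
// between) and return the last value it will be needed with, so
// load_consts can emit one immediate load instead of one per
// intermediate value.  Returns 0 when the value does not actually need
// to be loaded.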
4329 int get_final_value(int hr, int i, int *value)
4330 {
4331   int reg=regs[i].regmap[hr];
4332   while(i<slen-1) {
4333     if(regs[i+1].regmap[hr]!=reg) break;
4334     if(!((regs[i+1].isconst>>hr)&1)) break;
4335     if(bt[i+1]) break;
4336     i++;
4337   }
4338   if(i<slen-1) {
4339     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4340       *value=constmap[i][hr];
4341       return 1;
4342     }
4343     if(!bt[i+1]) {
4344       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4345         // Load in delay slot, out-of-order execution
4346         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4347         {
4348           #ifdef HOST_IMM_ADDR32
4349           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4350           #endif
4351           // Precompute load address
4352           *value=constmap[i][hr]+imm[i+2];
4353           return 1;
4354         }
4355       }
4356       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4357       {
4358         #ifdef HOST_IMM_ADDR32
4359         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4360         #endif
4361         // Precompute load address
4362         *value=constmap[i][hr]+imm[i+1];
4363         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4364         return 1;
4365       }
4366     }
4367   }
4368   *value=constmap[i][hr];
4369   //printf("c=%x\n",(int)constmap[i][hr]);
4370   if(i==slen-1) return 1;
4371   if(reg<64) {
4372     return !((unneeded_reg[i+1]>>reg)&1);
4373   }else{
4374     return !((unneeded_reg_upper[i+1]>>reg)&1);
4375   }
4376 }
4377
4378 // Load registers with known constants
4379 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4380 {
4381   int hr;
4382   // Load 32-bit regs
4383   for(hr=0;hr<HOST_REGS;hr++) {
4384     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4385       //if(entry[hr]!=regmap[hr]) {
4386       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4387         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4388           int value;
4389           if(get_final_value(hr,i,&value)) {
4390             if(value==0) {
4391               emit_zeroreg(hr);
4392             }
4393             else {
4394               emit_movimm(value,hr);
4395             }
4396           }
4397         }
4398       }
4399     }
4400   }
4401   // Load 64-bit regs
4402   for(hr=0;hr<HOST_REGS;hr++) {
4403     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4404       //if(entry[hr]!=regmap[hr]) {
4405       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4406         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4407           if((is32>>(regmap[hr]&63))&1) {
4408             int lr=get_reg(regmap,regmap[hr]-64);
4409             assert(lr>=0);
4410             emit_sarimm(lr,31,hr);
4411           }
4412           else
4413           {
4414             int value;
4415             if(get_final_value(hr,i,&value)) {
4416               if(value==0) {
4417                 emit_zeroreg(hr);
4418               }
4419               else {
4420                 emit_movimm(value,hr);
4421               }
4422             }
4423           }
4424         }
4425       }
4426     }
4427   }
4428 }
4429 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4430 {
4431   int hr;
4432   // Load 32-bit regs
4433   for(hr=0;hr<HOST_REGS;hr++) {
4434     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4435       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4436         int value=constmap[i][hr];
4437         if(value==0) {
4438           emit_zeroreg(hr);
4439         }
4440         else {
4441           emit_movimm(value,hr);
4442         }
4443       }
4444     }
4445   }
4446   // Load 64-bit regs
4447   for(hr=0;hr<HOST_REGS;hr++) {
4448     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4449       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4450         if((is32>>(regmap[hr]&63))&1) {
4451           int lr=get_reg(regmap,regmap[hr]-64);
4452           assert(lr>=0);
4453           emit_sarimm(lr,31,hr);
4454         }
4455         else
4456         {
4457           int value=constmap[i][hr];
4458           if(value==0) {
4459             emit_zeroreg(hr);
4460           }
4461           else {
4462             emit_movimm(value,hr);
4463           }
4464         }
4465       }
4466     }
4467   }
4468 }
4469
4470 // Write out all dirty registers (except cycle count)
4471 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4472 {
4473   int hr;
4474   for(hr=0;hr<HOST_REGS;hr++) {
4475     if(hr!=EXCLUDE_REG) {
4476       if(i_regmap[hr]>0) {
4477         if(i_regmap[hr]!=CCREG) {
4478           if((i_dirty>>hr)&1) {
4479             if(i_regmap[hr]<64) {
4480               emit_storereg(i_regmap[hr],hr);
4481 #ifndef FORCE32
4482               if( ((i_is32>>i_regmap[hr])&1) ) {
4483                 #ifdef DESTRUCTIVE_WRITEBACK
4484                 emit_sarimm(hr,31,hr);
4485                 emit_storereg(i_regmap[hr]|64,hr);
4486                 #else
4487                 emit_sarimm(hr,31,HOST_TEMPREG);
4488                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4489                 #endif
4490               }
4491 #endif
4492             }else{
4493               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4494                 emit_storereg(i_regmap[hr],hr);
4495               }
4496             }
4497           }
4498         }
4499       }
4500     }
4501   }
4502 }
4503 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4504 // This writes the registers not written by store_regs_bt (those already correct and dirty at the target)
4505 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4506 {
4507   int hr;
4508   int t=(addr-start)>>2;
4509   for(hr=0;hr<HOST_REGS;hr++) {
4510     if(hr!=EXCLUDE_REG) {
4511       if(i_regmap[hr]>0) {
4512         if(i_regmap[hr]!=CCREG) {
4513           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4514             if((i_dirty>>hr)&1) {
4515               if(i_regmap[hr]<64) {
4516                 emit_storereg(i_regmap[hr],hr);
4517 #ifndef FORCE32
4518                 if( ((i_is32>>i_regmap[hr])&1) ) {
4519                   #ifdef DESTRUCTIVE_WRITEBACK
4520                   emit_sarimm(hr,31,hr);
4521                   emit_storereg(i_regmap[hr]|64,hr);
4522                   #else
4523                   emit_sarimm(hr,31,HOST_TEMPREG);
4524                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4525                   #endif
4526                 }
4527 #endif
4528               }else{
4529                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4530                   emit_storereg(i_regmap[hr],hr);
4531                 }
4532               }
4533             }
4534           }
4535         }
4536       }
4537     }
4538   }
4539 }
4540
4541 // Load all registers (except cycle count)
4542 void load_all_regs(signed char i_regmap[])
4543 {
4544   int hr;
4545   for(hr=0;hr<HOST_REGS;hr++) {
4546     if(hr!=EXCLUDE_REG) {
4547       if(i_regmap[hr]==0) {
4548         emit_zeroreg(hr);
4549       }
4550       else
4551       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4552       {
4553         emit_loadreg(i_regmap[hr],hr);
4554       }
4555     }
4556   }
4557 }
4558
4559 // Load all current registers also needed by next instruction
4560 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4561 {
4562   int hr;
4563   for(hr=0;hr<HOST_REGS;hr++) {
4564     if(hr!=EXCLUDE_REG) {
4565       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4566         if(i_regmap[hr]==0) {
4567           emit_zeroreg(hr);
4568         }
4569         else
4570         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4571         {
4572           emit_loadreg(i_regmap[hr],hr);
4573         }
4574       }
4575     }
4576   }
4577 }
4578
4579 // Load all regs, storing cycle count if necessary
4580 void load_regs_entry(int t)
4581 {
4582   int hr;
4583   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4584   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4585   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4586     emit_storereg(CCREG,HOST_CCREG);
4587   }
4588   // Load 32-bit regs
4589   for(hr=0;hr<HOST_REGS;hr++) {
4590     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4591       if(regs[t].regmap_entry[hr]==0) {
4592         emit_zeroreg(hr);
4593       }
4594       else if(regs[t].regmap_entry[hr]!=CCREG)
4595       {
4596         emit_loadreg(regs[t].regmap_entry[hr],hr);
4597       }
4598     }
4599   }
4600   // Load 64-bit regs
4601   for(hr=0;hr<HOST_REGS;hr++) {
4602     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4603       assert(regs[t].regmap_entry[hr]!=64);
4604       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4605         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4606         if(lr<0) {
4607           emit_loadreg(regs[t].regmap_entry[hr],hr);
4608         }
4609         else
4610         {
4611           emit_sarimm(lr,31,hr);
4612         }
4613       }
4614       else
4615       {
4616         emit_loadreg(regs[t].regmap_entry[hr],hr);
4617       }
4618     }
4619   }
4620 }
4621
4622 // Store dirty registers prior to branch
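// For an internal branch only the dirty registers that are not already
// in the expected place (and are still needed) at the target get
// written back; otherwise the branch leaves the block and everything
// dirty is flushed with wb_dirtys.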
4623 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4624 {
4625   if(internal_branch(i_is32,addr))
4626   {
4627     int t=(addr-start)>>2;
4628     int hr;
4629     for(hr=0;hr<HOST_REGS;hr++) {
4630       if(hr!=EXCLUDE_REG) {
4631         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4632           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4633             if((i_dirty>>hr)&1) {
4634               if(i_regmap[hr]<64) {
4635                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4636                   emit_storereg(i_regmap[hr],hr);
4637                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4638                     #ifdef DESTRUCTIVE_WRITEBACK
4639                     emit_sarimm(hr,31,hr);
4640                     emit_storereg(i_regmap[hr]|64,hr);
4641                     #else
4642                     emit_sarimm(hr,31,HOST_TEMPREG);
4643                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4644                     #endif
4645                   }
4646                 }
4647               }else{
4648                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4649                   emit_storereg(i_regmap[hr],hr);
4650                 }
4651               }
4652             }
4653           }
4654         }
4655       }
4656     }
4657   }
4658   else
4659   {
4660     // Branch out of this block, write out all dirty regs
4661     wb_dirtys(i_regmap,i_is32,i_dirty);
4662   }
4663 }
4664
4665 // Load all needed registers for branch target
4666 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4667 {
4668   //if(addr>=start && addr<(start+slen*4))
4669   if(internal_branch(i_is32,addr))
4670   {
4671     int t=(addr-start)>>2;
4672     int hr;
4673     // Store the cycle count before loading something else
4674     if(i_regmap[HOST_CCREG]!=CCREG) {
4675       assert(i_regmap[HOST_CCREG]==-1);
4676     }
4677     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4678       emit_storereg(CCREG,HOST_CCREG);
4679     }
4680     // Load 32-bit regs
4681     for(hr=0;hr<HOST_REGS;hr++) {
4682       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4683         #ifdef DESTRUCTIVE_WRITEBACK
4684         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4685         #else
4686         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4687         #endif
4688           if(regs[t].regmap_entry[hr]==0) {
4689             emit_zeroreg(hr);
4690           }
4691           else if(regs[t].regmap_entry[hr]!=CCREG)
4692           {
4693             emit_loadreg(regs[t].regmap_entry[hr],hr);
4694           }
4695         }
4696       }
4697     }
4698     // Load 64-bit regs
4699     for(hr=0;hr<HOST_REGS;hr++) {
4700       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4701         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4702           assert(regs[t].regmap_entry[hr]!=64);
4703           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4704             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4705             if(lr<0) {
4706               emit_loadreg(regs[t].regmap_entry[hr],hr);
4707             }
4708             else
4709             {
4710               emit_sarimm(lr,31,hr);
4711             }
4712           }
4713           else
4714           {
4715             emit_loadreg(regs[t].regmap_entry[hr],hr);
4716           }
4717         }
4718         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4719           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4720           assert(lr>=0);
4721           emit_sarimm(lr,31,hr);
4722         }
4723       }
4724     }
4725   }
4726 }
4727
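// match_bt: check whether the current register state is compatible with
// the recorded entry state of the branch target, so the branch can be
// linked straight into already-compiled code without a writeback/reload
// sequence in between.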
4728 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4729 {
4730   if(addr>=start && addr<start+slen*4-4)
4731   {
4732     int t=(addr-start)>>2;
4733     int hr;
4734     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4735     for(hr=0;hr<HOST_REGS;hr++)
4736     {
4737       if(hr!=EXCLUDE_REG)
4738       {
4739         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4740         {
4741           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4742           {
4743             return 0;
4744           }
4745           else 
4746           if((i_dirty>>hr)&1)
4747           {
4748             if(i_regmap[hr]<TEMPREG)
4749             {
4750               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4751                 return 0;
4752             }
4753             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4754             {
4755               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4756                 return 0;
4757             }
4758           }
4759         }
4760         else // Same register but is it 32-bit or dirty?
4761         if(i_regmap[hr]>=0)
4762         {
4763           if(!((regs[t].dirty>>hr)&1))
4764           {
4765             if((i_dirty>>hr)&1)
4766             {
4767               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4768               {
4769                 //printf("%x: dirty no match\n",addr);
4770                 return 0;
4771               }
4772             }
4773           }
4774           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4775           {
4776             //printf("%x: is32 no match\n",addr);
4777             return 0;
4778           }
4779         }
4780       }
4781     }
4782     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4783 #ifndef FORCE32
4784     if(requires_32bit[t]&~i_is32) return 0;
4785 #endif
4786     // Delay slots are not valid branch targets
4787     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4788     // Delay slots require additional processing, so do not match
4789     if(is_ds[t]) return 0;
4790   }
4791   else
4792   {
4793     int hr;
4794     for(hr=0;hr<HOST_REGS;hr++)
4795     {
4796       if(hr!=EXCLUDE_REG)
4797       {
4798         if(i_regmap[hr]>=0)
4799         {
4800           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4801           {
4802             if((i_dirty>>hr)&1)
4803             {
4804               return 0;
4805             }
4806           }
4807         }
4808       }
4809     }
4810   }
4811   return 1;
4812 }
4813
4814 // Used when a branch jumps into the delay slot of another branch
4815 void ds_assemble_entry(int i)
4816 {
4817   int t=(ba[i]-start)>>2;
4818   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4819   assem_debug("Assemble delay slot at %x\n",ba[i]);
4820   assem_debug("<->\n");
4821   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4822     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4823   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4824   address_generation(t,&regs[t],regs[t].regmap_entry);
4825   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4826     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4827   cop1_usable=0;
4828   is_delayslot=0;
4829   switch(itype[t]) {
4830     case ALU:
4831       alu_assemble(t,&regs[t]);break;
4832     case IMM16:
4833       imm16_assemble(t,&regs[t]);break;
4834     case SHIFT:
4835       shift_assemble(t,&regs[t]);break;
4836     case SHIFTIMM:
4837       shiftimm_assemble(t,&regs[t]);break;
4838     case LOAD:
4839       load_assemble(t,&regs[t]);break;
4840     case LOADLR:
4841       loadlr_assemble(t,&regs[t]);break;
4842     case STORE:
4843       store_assemble(t,&regs[t]);break;
4844     case STORELR:
4845       storelr_assemble(t,&regs[t]);break;
4846     case COP0:
4847       cop0_assemble(t,&regs[t]);break;
4848     case COP1:
4849       cop1_assemble(t,&regs[t]);break;
4850     case C1LS:
4851       c1ls_assemble(t,&regs[t]);break;
4852     case COP2:
4853       cop2_assemble(t,&regs[t]);break;
4854     case C2LS:
4855       c2ls_assemble(t,&regs[t]);break;
4856     case C2OP:
4857       c2op_assemble(t,&regs[t]);break;
4858     case FCONV:
4859       fconv_assemble(t,&regs[t]);break;
4860     case FLOAT:
4861       float_assemble(t,&regs[t]);break;
4862     case FCOMP:
4863       fcomp_assemble(t,&regs[t]);break;
4864     case MULTDIV:
4865       multdiv_assemble(t,&regs[t]);break;
4866     case MOV:
4867       mov_assemble(t,&regs[t]);break;
4868     case SYSCALL:
4869     case HLECALL:
4870     case INTCALL:
4871     case SPAN:
4872     case UJUMP:
4873     case RJUMP:
4874     case CJUMP:
4875     case SJUMP:
4876     case FJUMP:
4877       printf("Jump in the delay slot.  This is probably a bug.\n");
4878   }
4879   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4880   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4881   if(internal_branch(regs[t].is32,ba[i]+4))
4882     assem_debug("branch: internal\n");
4883   else
4884     assem_debug("branch: external\n");
4885   assert(internal_branch(regs[t].is32,ba[i]+4));
4886   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4887   emit_jmp(0);
4888 }
4889
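// do_cc: emit the cycle-count check for a branch.  *adj returns the
// cycle adjustment already credited at the target; a branch to itself
// with a NOP in the delay slot is treated as an idle loop, otherwise
// the pending cycles are added (or compared against) and a CC_STUB is
// registered for the out-of-cycles path.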
4890 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4891 {
4892   int count;
4893   int jaddr;
4894   int idle=0;
4895   if(itype[i]==RJUMP)
4896   {
4897     *adj=0;
4898   }
4899   //if(ba[i]>=start && ba[i]<(start+slen*4))
4900   if(internal_branch(branch_regs[i].is32,ba[i]))
4901   {
4902     int t=(ba[i]-start)>>2;
4903     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4904     else *adj=ccadj[t];
4905   }
4906   else
4907   {
4908     *adj=0;
4909   }
4910   count=ccadj[i];
4911   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4912     // Idle loop
4913     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4914     idle=(int)out;
4915     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4916     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4917     jaddr=(int)out;
4918     emit_jmp(0);
4919   }
4920   else if(*adj==0||invert) {
4921     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4922     jaddr=(int)out;
4923     emit_jns(0);
4924   }
4925   else
4926   {
4927     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4928     jaddr=(int)out;
4929     emit_jns(0);
4930   }
4931   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4932 }
4933
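// do_ccstub: out-of-line code for the out-of-cycles case.  It writes
// back the dirty registers and stores the return PC in pcaddr,
// computing that PC with conditional moves when the branch direction
// is not known at assembly time (stubs[n][5]==-1).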
4934 void do_ccstub(int n)
4935 {
4936   literal_pool(256);
4937   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4938   set_jump_target(stubs[n][1],(int)out);
4939   int i=stubs[n][4];
4940   if(stubs[n][6]==NULLDS) {
4941     // Delay slot instruction is nullified ("likely" branch)
4942     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4943   }
4944   else if(stubs[n][6]!=TAKEN) {
4945     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4946   }
4947   else {
4948     if(internal_branch(branch_regs[i].is32,ba[i]))
4949       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4950   }
4951   if(stubs[n][5]!=-1)
4952   {
4953     // Save PC as return address
4954     emit_movimm(stubs[n][5],EAX);
4955     emit_writeword(EAX,(int)&pcaddr);
4956   }
4957   else
4958   {
4959     // Return address depends on which way the branch goes
4960     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4961     {
4962       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4963       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4964       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4965       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4966       if(rs1[i]==0)
4967       {
4968         s1l=s2l;s1h=s2h;
4969         s2l=s2h=-1;
4970       }
4971       else if(rs2[i]==0)
4972       {
4973         s2l=s2h=-1;
4974       }
4975       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4976         s1h=s2h=-1;
4977       }
4978       assert(s1l>=0);
4979       #ifdef DESTRUCTIVE_WRITEBACK
4980       if(rs1[i]) {
4981         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4982           emit_loadreg(rs1[i],s1l);
4983       } 
4984       else {
4985         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4986           emit_loadreg(rs2[i],s1l);
4987       }
4988       if(s2l>=0)
4989         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4990           emit_loadreg(rs2[i],s2l);
4991       #endif
4992       int hr=0;
4993       int addr=-1,alt=-1,ntaddr=-1;
4994       while(hr<HOST_REGS)
4995       {
4996         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4997            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4998            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4999         {
5000           addr=hr++;break;
5001         }
5002         hr++;
5003       }
5004       while(hr<HOST_REGS)
5005       {
5006         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5007            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5008            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5009         {
5010           alt=hr++;break;
5011         }
5012         hr++;
5013       }
5014       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5015       {
5016         while(hr<HOST_REGS)
5017         {
5018           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5019              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5020              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5021           {
5022             ntaddr=hr;break;
5023           }
5024           hr++;
5025         }
5026         assert(hr<HOST_REGS);
5027       }
5028       if((opcode[i]&0x2f)==4) // BEQ
5029       {
5030         #ifdef HAVE_CMOV_IMM
5031         if(s1h<0) {
5032           if(s2l>=0) emit_cmp(s1l,s2l);
5033           else emit_test(s1l,s1l);
5034           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5035         }
5036         else
5037         #endif
5038         {
5039           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5040           if(s1h>=0) {
5041             if(s2h>=0) emit_cmp(s1h,s2h);
5042             else emit_test(s1h,s1h);
5043             emit_cmovne_reg(alt,addr);
5044           }
5045           if(s2l>=0) emit_cmp(s1l,s2l);
5046           else emit_test(s1l,s1l);
5047           emit_cmovne_reg(alt,addr);
5048         }
5049       }
5050       if((opcode[i]&0x2f)==5) // BNE
5051       {
5052         #ifdef HAVE_CMOV_IMM
5053         if(s1h<0) {
5054           if(s2l>=0) emit_cmp(s1l,s2l);
5055           else emit_test(s1l,s1l);
5056           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5057         }
5058         else
5059         #endif
5060         {
5061           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5062           if(s1h>=0) {
5063             if(s2h>=0) emit_cmp(s1h,s2h);
5064             else emit_test(s1h,s1h);
5065             emit_cmovne_reg(alt,addr);
5066           }
5067           if(s2l>=0) emit_cmp(s1l,s2l);
5068           else emit_test(s1l,s1l);
5069           emit_cmovne_reg(alt,addr);
5070         }
5071       }
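      // BLEZ/BGTZ below select the target without branching: both candidate
      // addresses are loaded, the low word is compared against 1 (so "less
      // than 1" means "<= 0"), and conditional moves pick the result.  When
      // the value may be 64 bits wide (s1h>=0), the sign/zero state of the
      // high word overrides the low-word comparison.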
5072       if((opcode[i]&0x2f)==6) // BLEZ
5073       {
5074         //emit_movimm(ba[i],alt);
5075         //emit_movimm(start+i*4+8,addr);
5076         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5077         emit_cmpimm(s1l,1);
5078         if(s1h>=0) emit_mov(addr,ntaddr);
5079         emit_cmovl_reg(alt,addr);
5080         if(s1h>=0) {
5081           emit_test(s1h,s1h);
5082           emit_cmovne_reg(ntaddr,addr);
5083           emit_cmovs_reg(alt,addr);
5084         }
5085       }
5086       if((opcode[i]&0x2f)==7) // BGTZ
5087       {
5088         //emit_movimm(ba[i],addr);
5089         //emit_movimm(start+i*4+8,ntaddr);
5090         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5091         emit_cmpimm(s1l,1);
5092         if(s1h>=0) emit_mov(addr,alt);
5093         emit_cmovl_reg(ntaddr,addr);
5094         if(s1h>=0) {
5095           emit_test(s1h,s1h);
5096           emit_cmovne_reg(alt,addr);
5097           emit_cmovs_reg(ntaddr,addr);
5098         }
5099       }
5100       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5101       {
5102         //emit_movimm(ba[i],alt);
5103         //emit_movimm(start+i*4+8,addr);
5104         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5105         if(s1h>=0) emit_test(s1h,s1h);
5106         else emit_test(s1l,s1l);
5107         emit_cmovs_reg(alt,addr);
5108       }
5109       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5110       {
5111         //emit_movimm(ba[i],addr);
5112         //emit_movimm(start+i*4+8,alt);
5113         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5114         if(s1h>=0) emit_test(s1h,s1h);
5115         else emit_test(s1l,s1l);
5116         emit_cmovs_reg(alt,addr);
5117       }
5118       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5119         if(source[i]&0x10000) // BC1T
5120         {
5121           //emit_movimm(ba[i],alt);
5122           //emit_movimm(start+i*4+8,addr);
5123           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5124           emit_testimm(s1l,0x800000);
5125           emit_cmovne_reg(alt,addr);
5126         }
5127         else // BC1F
5128         {
5129           //emit_movimm(ba[i],addr);
5130           //emit_movimm(start+i*4+8,alt);
5131           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5132           emit_testimm(s1l,0x800000);
5133           emit_cmovne_reg(alt,addr);
5134         }
5135       }
5136       emit_writeword(addr,(int)&pcaddr);
5137     }
5138     else
5139     if(itype[i]==RJUMP)
5140     {
5141       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5142       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5143         r=get_reg(branch_regs[i].regmap,RTEMP);
5144       }
5145       emit_writeword(r,(int)&pcaddr);
5146     }
5147     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5148   }
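  // stubs[n][3] is a pending cycle adjustment: add it so cc_interrupt sees
  // an up-to-date count, then take it back out so the counter is what the
  // code at the return address expects.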
5149   // Update cycle count
5150   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5151   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5152   emit_call((int)cc_interrupt);
5153   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5154   if(stubs[n][6]==TAKEN) {
5155     if(internal_branch(branch_regs[i].is32,ba[i]))
5156       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5157     else if(itype[i]==RJUMP) {
5158       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5159         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5160       else
5161         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5162     }
5163   }else if(stubs[n][6]==NOTTAKEN) {
5164     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5165     else load_all_regs(branch_regs[i].regmap);
5166   }else if(stubs[n][6]==NULLDS) {
5167     // Delay slot instruction is nullified ("likely" branch)
5168     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5169     else load_all_regs(regs[i].regmap);
5170   }else{
5171     load_all_regs(branch_regs[i].regmap);
5172   }
5173   emit_jmp(stubs[n][2]); // return address
5174   
5175   /* This works but uses a lot of memory...
5176   emit_readword((int)&last_count,ECX);
5177   emit_add(HOST_CCREG,ECX,EAX);
5178   emit_writeword(EAX,(int)&Count);
5179   emit_call((int)gen_interupt);
5180   emit_readword((int)&Count,HOST_CCREG);
5181   emit_readword((int)&next_interupt,EAX);
5182   emit_readword((int)&pending_exception,EBX);
5183   emit_writeword(EAX,(int)&last_count);
5184   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5185   emit_test(EBX,EBX);
5186   int jne_instr=(int)out;
5187   emit_jne(0);
5188   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5189   load_all_regs(branch_regs[i].regmap);
5190   emit_jmp(stubs[n][2]); // return address
5191   set_jump_target(jne_instr,(int)out);
5192   emit_readword((int)&pcaddr,EAX);
5193   // Call get_addr_ht instead of doing the hash table here.
5194   // This code is executed infrequently and takes up a lot of space
5195   // so smaller is better.
5196   emit_storereg(CCREG,HOST_CCREG);
5197   emit_pushreg(EAX);
5198   emit_call((int)get_addr_ht);
5199   emit_loadreg(CCREG,HOST_CCREG);
5200   emit_addimm(ESP,4,ESP);
5201   emit_jmpreg(EAX);*/
5202 }
5203
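// Record a direct jump emitted at 'addr' whose guest target is 'target';
// 'ext' marks an external (cross-block) link.  The entries collected here
// are patched once the rest of the block has been assembled.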
5204 void add_to_linker(int addr,int target,int ext)
5205 {
5206   link_addr[linkcount][0]=addr;
5207   link_addr[linkcount][1]=target;
5208   link_addr[linkcount][2]=ext;  
5209   linkcount++;
5210 }
5211
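// Write the return address (start+i*4+8, the instruction after the delay
// slot) into the host register allocated to $ra, for JAL.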
5212 static void ujump_assemble_write_ra(int i)
5213 {
5214   int rt;
5215   unsigned int return_address;
5216   rt=get_reg(branch_regs[i].regmap,31);
5217   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5218   //assert(rt>=0);
5219   return_address=start+i*4+8;
5220   if(rt>=0) {
5221     #ifdef USE_MINI_HT
5222     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5223       int temp=-1; // note: must be ds-safe
5224       #ifdef HOST_TEMPREG
5225       temp=HOST_TEMPREG;
5226       #endif
5227       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5228       else emit_movimm(return_address,rt);
5229     }
5230     else
5231     #endif
5232     {
5233       #ifdef REG_PREFETCH
5234       if(temp>=0) 
5235       {
5236         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5237       }
5238       #endif
5239       emit_movimm(return_address,rt); // PC into link register
5240       #ifdef IMM_PREFETCH
5241       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5242       #endif
5243     }
5244   }
5245 }
5246
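// J/JAL: assemble the delay slot first, write $ra (early if the delay slot
// reads it), write back dirty registers, charge cycles, then jump to the
// target, either assembling its delay-slot entry inline when it lies in
// this block or leaving a jump for the linker to patch.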
5247 void ujump_assemble(int i,struct regstat *i_regs)
5248 {
5249   signed char *i_regmap=i_regs->regmap;
5250   int ra_done=0;
5251   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5252   address_generation(i+1,i_regs,regs[i].regmap_entry);
5253   #ifdef REG_PREFETCH
5254   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5255   if(rt1[i]==31&&temp>=0) 
5256   {
5257     int return_address=start+i*4+8;
5258     if(get_reg(branch_regs[i].regmap,31)>0) 
5259     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5260   }
5261   #endif
5262   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5263     ujump_assemble_write_ra(i); // writeback ra for DS
5264     ra_done=1;
5265   }
5266   ds_assemble(i+1,i_regs);
5267   uint64_t bc_unneeded=branch_regs[i].u;
5268   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5269   bc_unneeded|=1|(1LL<<rt1[i]);
5270   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5271   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5272                 bc_unneeded,bc_unneeded_upper);
5273   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5274   if(!ra_done&&rt1[i]==31)
5275     ujump_assemble_write_ra(i);
5276   int cc,adj;
5277   cc=get_reg(branch_regs[i].regmap,CCREG);
5278   assert(cc==HOST_CCREG);
5279   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5280   #ifdef REG_PREFETCH
5281   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5282   #endif
5283   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5284   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5285   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5286   if(internal_branch(branch_regs[i].is32,ba[i]))
5287     assem_debug("branch: internal\n");
5288   else
5289     assem_debug("branch: external\n");
5290   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5291     ds_assemble_entry(i);
5292   }
5293   else {
5294     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5295     emit_jmp(0);
5296   }
5297 }
5298
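// Write the return address into the link register (rt1[i]) for JALR.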
5299 static void rjump_assemble_write_ra(int i)
5300 {
5301   int rt,return_address;
5302   assert(rt1[i+1]!=rt1[i]);
5303   assert(rt2[i+1]!=rt1[i]);
5304   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5305   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5306   assert(rt>=0);
5307   return_address=start+i*4+8;
5308   #ifdef REG_PREFETCH
5309   if(temp>=0) 
5310   {
5311     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5312   }
5313   #endif
5314   emit_movimm(return_address,rt); // PC into link register
5315   #ifdef IMM_PREFETCH
5316   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5317   #endif
5318 }
5319
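// JR/JALR: the target comes from a register, so instead of linking to a
// fixed address the generated code jumps through jump_vaddr_reg[] (or the
// mini hash table for JR $ra) to look the target up at run time.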
5320 void rjump_assemble(int i,struct regstat *i_regs)
5321 {
5322   signed char *i_regmap=i_regs->regmap;
5323   int temp;
5324   int rs,cc,adj;
5325   int ra_done=0;
5326   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5327   assert(rs>=0);
5328   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5329     // Delay slot abuse, make a copy of the branch address register
5330     temp=get_reg(branch_regs[i].regmap,RTEMP);
5331     assert(temp>=0);
5332     assert(regs[i].regmap[temp]==RTEMP);
5333     emit_mov(rs,temp);
5334     rs=temp;
5335   }
5336   address_generation(i+1,i_regs,regs[i].regmap_entry);
5337   #ifdef REG_PREFETCH
5338   if(rt1[i]==31) 
5339   {
5340     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5341       int return_address=start+i*4+8;
5342       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5343     }
5344   }
5345   #endif
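  // With USE_MINI_HT, JR $ra goes through a small hash table that caches
  // recently used return addresses, so the common function-return case can
  // skip the full address lookup.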
5346   #ifdef USE_MINI_HT
5347   if(rs1[i]==31) {
5348     int rh=get_reg(regs[i].regmap,RHASH);
5349     if(rh>=0) do_preload_rhash(rh);
5350   }
5351   #endif
5352   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5353     rjump_assemble_write_ra(i);
5354     ra_done=1;
5355   }
5356   ds_assemble(i+1,i_regs);
5357   uint64_t bc_unneeded=branch_regs[i].u;
5358   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5359   bc_unneeded|=1|(1LL<<rt1[i]);
5360   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5361   bc_unneeded&=~(1LL<<rs1[i]);
5362   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5363                 bc_unneeded,bc_unneeded_upper);
5364   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5365   if(!ra_done&&rt1[i]!=0)
5366     rjump_assemble_write_ra(i);
5367   cc=get_reg(branch_regs[i].regmap,CCREG);
5368   assert(cc==HOST_CCREG);
5369   #ifdef USE_MINI_HT
5370   int rh=get_reg(branch_regs[i].regmap,RHASH);
5371   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5372   if(rs1[i]==31) {
5373     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5374     do_preload_rhtbl(ht);
5375     do_rhash(rs,rh);
5376   }
5377   #endif
5378   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5379   #ifdef DESTRUCTIVE_WRITEBACK
5380   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5381     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5382       emit_loadreg(rs1[i],rs);
5383     }
5384   }
5385   #endif
5386   #ifdef REG_PREFETCH
5387   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5388   #endif
5389   #ifdef USE_MINI_HT
5390   if(rs1[i]==31) {
5391     do_miniht_load(ht,rh);
5392   }
5393   #endif
5394   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5395   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5396   //assert(adj==0);
5397   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5398   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5399   emit_jns(0);
5400   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5401   #ifdef USE_MINI_HT
5402   if(rs1[i]==31) {
5403     do_miniht_jump(rs,rh,ht);
5404   }
5405   else
5406   #endif
5407   {
5408     //if(rs!=EAX) emit_mov(rs,EAX);
5409     //emit_jmp((int)jump_vaddr_eax);
5410     emit_jmp(jump_vaddr_reg[rs]);
5411   }
5412   /* Check hash table
5413   temp=!rs;
5414   emit_mov(rs,temp);
5415   emit_shrimm(rs,16,rs);
5416   emit_xor(temp,rs,rs);
5417   emit_movzwl_reg(rs,rs);
5418   emit_shlimm(rs,4,rs);
5419   emit_cmpmem_indexed((int)hash_table,rs,temp);
5420   emit_jne((int)out+14);
5421   emit_readword_indexed((int)hash_table+4,rs,rs);
5422   emit_jmpreg(rs);
5423   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5424   emit_addimm_no_flags(8,rs);
5425   emit_jeq((int)out-17);
5426   // No hit on hash table, call compiler
5427   emit_pushreg(temp);
5428 //DEBUG >
5429 #ifdef DEBUG_CYCLE_COUNT
5430   emit_readword((int)&last_count,ECX);
5431   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5432   emit_readword((int)&next_interupt,ECX);
5433   emit_writeword(HOST_CCREG,(int)&Count);
5434   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5435   emit_writeword(ECX,(int)&last_count);
5436 #endif
5437 //DEBUG <
5438   emit_storereg(CCREG,HOST_CCREG);
5439   emit_call((int)get_addr);
5440   emit_loadreg(CCREG,HOST_CCREG);
5441   emit_addimm(ESP,4,ESP);
5442   emit_jmpreg(EAX);*/
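  // Cortex-A8 branch predictor workaround: if the output pointer is not
  // 8-byte aligned, pad with a harmless mov (effectively a nop); this only
  // affects performance, not correctness.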
5443   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5444   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5445   #endif
5446 }
5447
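// BEQ/BNE/BLEZ/BGTZ and their "likely" variants.  With ooo[i] set the delay
// slot is assembled before the compare; otherwise the compare comes first
// and the delay slot is assembled on the taken path and, unless the branch
// is "likely", on the not-taken path as well.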
5448 void cjump_assemble(int i,struct regstat *i_regs)
5449 {
5450   signed char *i_regmap=i_regs->regmap;
5451   int cc;
5452   int match;
5453   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5454   assem_debug("match=%d\n",match);
5455   int s1h,s1l,s2h,s2l;
5456   int prev_cop1_usable=cop1_usable;
5457   int unconditional=0,nop=0;
5458   int only32=0;
5459   int invert=0;
5460   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5461   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5462   if(!match) invert=1;
5463   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5464   if(i>(ba[i]-start)>>2) invert=1;
5465   #endif
5466   
5467   if(ooo[i]) {
5468     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5469     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5470     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5471     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5472   }
5473   else {
5474     s1l=get_reg(i_regmap,rs1[i]);
5475     s1h=get_reg(i_regmap,rs1[i]|64);
5476     s2l=get_reg(i_regmap,rs2[i]);
5477     s2h=get_reg(i_regmap,rs2[i]|64);
5478   }
5479   if(rs1[i]==0&&rs2[i]==0)
5480   {
5481     if(opcode[i]&1) nop=1;
5482     else unconditional=1;
5483     //assert(opcode[i]!=5);
5484     //assert(opcode[i]!=7);
5485     //assert(opcode[i]!=0x15);
5486     //assert(opcode[i]!=0x17);
5487   }
5488   else if(rs1[i]==0)
5489   {
5490     s1l=s2l;s1h=s2h;
5491     s2l=s2h=-1;
5492     only32=(regs[i].was32>>rs2[i])&1;
5493   }
5494   else if(rs2[i]==0)
5495   {
5496     s2l=s2h=-1;
5497     only32=(regs[i].was32>>rs1[i])&1;
5498   }
5499   else {
5500     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5501   }
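  // only32: both sources are known to hold 32-bit values, so the compare
  // of the upper halves (s1h/s2h) can be skipped.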
5502
5503   if(ooo[i]) {
5504     // Out of order execution (delay slot first)
5505     //printf("OOOE\n");
5506     address_generation(i+1,i_regs,regs[i].regmap_entry);
5507     ds_assemble(i+1,i_regs);
5508     int adj;
5509     uint64_t bc_unneeded=branch_regs[i].u;
5510     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5511     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5512     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5513     bc_unneeded|=1;
5514     bc_unneeded_upper|=1;
5515     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5516                   bc_unneeded,bc_unneeded_upper);
5517     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5518     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5519     cc=get_reg(branch_regs[i].regmap,CCREG);
5520     assert(cc==HOST_CCREG);
5521     if(unconditional) 
5522       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5523     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5524     //assem_debug("cycle count (adj)\n");
5525     if(unconditional) {
5526       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5527       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5528         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5529         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5530         if(internal)
5531           assem_debug("branch: internal\n");
5532         else
5533           assem_debug("branch: external\n");
5534         if(internal&&is_ds[(ba[i]-start)>>2]) {
5535           ds_assemble_entry(i);
5536         }
5537         else {
5538           add_to_linker((int)out,ba[i],internal);
5539           emit_jmp(0);
5540         }
5541         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5542         if(((u_int)out)&7) emit_addnop(0);
5543         #endif
5544       }
5545     }
5546     else if(nop) {
5547       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5548       int jaddr=(int)out;
5549       emit_jns(0);
5550       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5551     }
5552     else {
5553       int taken=0,nottaken=0,nottaken1=0;
5554       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5555       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5556       if(!only32)
5557       {
5558         assert(s1h>=0);
5559         if(opcode[i]==4) // BEQ
5560         {
5561           if(s2h>=0) emit_cmp(s1h,s2h);
5562           else emit_test(s1h,s1h);
5563           nottaken1=(int)out;
5564           emit_jne(1);
5565         }
5566         if(opcode[i]==5) // BNE
5567         {
5568           if(s2h>=0) emit_cmp(s1h,s2h);
5569           else emit_test(s1h,s1h);
5570           if(invert) taken=(int)out;
5571           else add_to_linker((int)out,ba[i],internal);
5572           emit_jne(0);
5573         }
5574         if(opcode[i]==6) // BLEZ
5575         {
5576           emit_test(s1h,s1h);
5577           if(invert) taken=(int)out;
5578           else add_to_linker((int)out,ba[i],internal);
5579           emit_js(0);
5580           nottaken1=(int)out;
5581           emit_jne(1);
5582         }
5583         if(opcode[i]==7) // BGTZ
5584         {
5585           emit_test(s1h,s1h);
5586           nottaken1=(int)out;
5587           emit_js(1);
5588           if(invert) taken=(int)out;
5589           else add_to_linker((int)out,ba[i],internal);
5590           emit_jne(0);
5591         }
5592       } // if(!only32)
5593           
5594       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5595       assert(s1l>=0);
5596       if(opcode[i]==4) // BEQ
5597       {
5598         if(s2l>=0) emit_cmp(s1l,s2l);
5599         else emit_test(s1l,s1l);
5600         if(invert){
5601           nottaken=(int)out;
5602           emit_jne(1);
5603         }else{
5604           add_to_linker((int)out,ba[i],internal);
5605           emit_jeq(0);
5606         }
5607       }
5608       if(opcode[i]==5) // BNE
5609       {
5610         if(s2l>=0) emit_cmp(s1l,s2l);
5611         else emit_test(s1l,s1l);
5612         if(invert){
5613           nottaken=(int)out;
5614           emit_jeq(1);
5615         }else{
5616           add_to_linker((int)out,ba[i],internal);
5617           emit_jne(0);
5618         }
5619       }
5620       if(opcode[i]==6) // BLEZ
5621       {
5622         emit_cmpimm(s1l,1);
5623         if(invert){
5624           nottaken=(int)out;
5625           emit_jge(1);
5626         }else{
5627           add_to_linker((int)out,ba[i],internal);
5628           emit_jl(0);
5629         }
5630       }
5631       if(opcode[i]==7) // BGTZ
5632       {
5633         emit_cmpimm(s1l,1);
5634         if(invert){
5635           nottaken=(int)out;
5636           emit_jl(1);
5637         }else{
5638           add_to_linker((int)out,ba[i],internal);
5639           emit_jge(0);
5640         }
5641       }
5642       if(invert) {
5643         if(taken) set_jump_target(taken,(int)out);
5644         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5645         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5646           if(adj) {
5647             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5648             add_to_linker((int)out,ba[i],internal);
5649           }else{
5650             emit_addnop(13);
5651             add_to_linker((int)out,ba[i],internal*2);
5652           }
5653           emit_jmp(0);
5654         }else
5655         #endif
5656         {
5657           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5658           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5659           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5660           if(internal)
5661             assem_debug("branch: internal\n");
5662           else
5663             assem_debug("branch: external\n");
5664           if(internal&&is_ds[(ba[i]-start)>>2]) {
5665             ds_assemble_entry(i);
5666           }
5667           else {
5668             add_to_linker((int)out,ba[i],internal);
5669             emit_jmp(0);
5670           }
5671         }
5672         set_jump_target(nottaken,(int)out);
5673       }
5674
5675       if(nottaken1) set_jump_target(nottaken1,(int)out);
5676       if(adj) {
5677         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5678       }
5679     } // (!unconditional)
5680   } // if(ooo)
5681   else
5682   {
5683     // In-order execution (branch first)
5684     //if(likely[i]) printf("IOL\n");
5685     //else
5686     //printf("IOE\n");
5687     int taken=0,nottaken=0,nottaken1=0;
5688     if(!unconditional&&!nop) {
5689       if(!only32)
5690       {
5691         assert(s1h>=0);
5692         if((opcode[i]&0x2f)==4) // BEQ
5693         {
5694           if(s2h>=0) emit_cmp(s1h,s2h);
5695           else emit_test(s1h,s1h);
5696           nottaken1=(int)out;
5697           emit_jne(2);
5698         }
5699         if((opcode[i]&0x2f)==5) // BNE
5700         {
5701           if(s2h>=0) emit_cmp(s1h,s2h);
5702           else emit_test(s1h,s1h);
5703           taken=(int)out;
5704           emit_jne(1);
5705         }
5706         if((opcode[i]&0x2f)==6) // BLEZ
5707         {
5708           emit_test(s1h,s1h);
5709           taken=(int)out;
5710           emit_js(1);
5711           nottaken1=(int)out;
5712           emit_jne(2);
5713         }
5714         if((opcode[i]&0x2f)==7) // BGTZ
5715         {
5716           emit_test(s1h,s1h);
5717           nottaken1=(int)out;
5718           emit_js(2);
5719           taken=(int)out;
5720           emit_jne(1);
5721         }
5722       } // if(!only32)
5723           
5724       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5725       assert(s1l>=0);
5726       if((opcode[i]&0x2f)==4) // BEQ
5727       {
5728         if(s2l>=0) emit_cmp(s1l,s2l);
5729         else emit_test(s1l,s1l);
5730         nottaken=(int)out;
5731         emit_jne(2);
5732       }
5733       if((opcode[i]&0x2f)==5) // BNE
5734       {
5735         if(s2l>=0) emit_cmp(s1l,s2l);
5736         else emit_test(s1l,s1l);
5737         nottaken=(int)out;
5738         emit_jeq(2);
5739       }
5740       if((opcode[i]&0x2f)==6) // BLEZ
5741       {
5742         emit_cmpimm(s1l,1);
5743         nottaken=(int)out;
5744         emit_jge(2);
5745       }
5746       if((opcode[i]&0x2f)==7) // BGTZ
5747       {
5748         emit_cmpimm(s1l,1);
5749         nottaken=(int)out;
5750         emit_jl(2);
5751       }
5752     } // if(!unconditional)
5753     int adj;
5754     uint64_t ds_unneeded=branch_regs[i].u;
5755     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5756     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5757     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5758     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5759     ds_unneeded|=1;
5760     ds_unneeded_upper|=1;
5761     // branch taken
5762     if(!nop) {
5763       if(taken) set_jump_target(taken,(int)out);
5764       assem_debug("1:\n");
5765       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5766                     ds_unneeded,ds_unneeded_upper);
5767       // load regs
5768       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5769       address_generation(i+1,&branch_regs[i],0);
5770       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5771       ds_assemble(i+1,&branch_regs[i]);
5772       cc=get_reg(branch_regs[i].regmap,CCREG);
5773       if(cc==-1) {
5774         emit_loadreg(CCREG,cc=HOST_CCREG);
5775         // CHECK: Is the following instruction (fall thru) allocated ok?
5776       }
5777       assert(cc==HOST_CCREG);
5778       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5779       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5780       assem_debug("cycle count (adj)\n");
5781       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5782       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5783       if(internal)
5784         assem_debug("branch: internal\n");
5785       else
5786         assem_debug("branch: external\n");
5787       if(internal&&is_ds[(ba[i]-start)>>2]) {
5788         ds_assemble_entry(i);
5789       }
5790       else {
5791         add_to_linker((int)out,ba[i],internal);
5792         emit_jmp(0);
5793       }
5794     }
5795     // branch not taken
5796     cop1_usable=prev_cop1_usable;
5797     if(!unconditional) {
5798       if(nottaken1) set_jump_target(nottaken1,(int)out);
5799       set_jump_target(nottaken,(int)out);
5800       assem_debug("2:\n");
5801       if(!likely[i]) {
5802         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5803                       ds_unneeded,ds_unneeded_upper);
5804         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5805         address_generation(i+1,&branch_regs[i],0);
5806         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5807         ds_assemble(i+1,&branch_regs[i]);
5808       }
5809       cc=get_reg(branch_regs[i].regmap,CCREG);
5810       if(cc==-1&&!likely[i]) {
5811         // Cycle count isn't in a register, temporarily load it then write it out
5812         emit_loadreg(CCREG,HOST_CCREG);
5813         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5814         int jaddr=(int)out;
5815         emit_jns(0);
5816         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5817         emit_storereg(CCREG,HOST_CCREG);
5818       }
5819       else{
5820         cc=get_reg(i_regmap,CCREG);
5821         assert(cc==HOST_CCREG);
5822         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5823         int jaddr=(int)out;
5824         emit_jns(0);
5825         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5826       }
5827     }
5828   }
5829 }
5830
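// REGIMM branches (BLTZ/BGEZ/BLTZAL/BGEZAL and their "likely" forms): test
// the sign of rs1.  The AL forms write the return address to $ra even when
// the branch is not taken.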
5831 void sjump_assemble(int i,struct regstat *i_regs)
5832 {
5833   signed char *i_regmap=i_regs->regmap;
5834   int cc;
5835   int match;
5836   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5837   assem_debug("smatch=%d\n",match);
5838   int s1h,s1l;
5839   int prev_cop1_usable=cop1_usable;
5840   int unconditional=0,nevertaken=0;
5841   int only32=0;
5842   int invert=0;
5843   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5844   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5845   if(!match) invert=1;
5846   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5847   if(i>(ba[i]-start)>>2) invert=1;
5848   #endif
5849
5850   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5851   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5852
5853   if(ooo[i]) {
5854     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5855     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5856   }
5857   else {
5858     s1l=get_reg(i_regmap,rs1[i]);
5859     s1h=get_reg(i_regmap,rs1[i]|64);
5860   }
5861   if(rs1[i]==0)
5862   {
5863     if(opcode2[i]&1) unconditional=1;
5864     else nevertaken=1;
5865     // These are never taken (r0 is never less than zero)
5866     //assert(opcode2[i]!=0);
5867     //assert(opcode2[i]!=2);
5868     //assert(opcode2[i]!=0x10);
5869     //assert(opcode2[i]!=0x12);
5870   }
5871   else {
5872     only32=(regs[i].was32>>rs1[i])&1;
5873   }
5874
5875   if(ooo[i]) {
5876     // Out of order execution (delay slot first)
5877     //printf("OOOE\n");
5878     address_generation(i+1,i_regs,regs[i].regmap_entry);
5879     ds_assemble(i+1,i_regs);
5880     int adj;
5881     uint64_t bc_unneeded=branch_regs[i].u;
5882     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5883     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5884     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5885     bc_unneeded|=1;
5886     bc_unneeded_upper|=1;
5887     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5888                   bc_unneeded,bc_unneeded_upper);
5889     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5890     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5891     if(rt1[i]==31) {
5892       int rt,return_address;
5893       rt=get_reg(branch_regs[i].regmap,31);
5894       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5895       if(rt>=0) {
5896         // Save the PC even if the branch is not taken
5897         return_address=start+i*4+8;
5898         emit_movimm(return_address,rt); // PC into link register
5899         #ifdef IMM_PREFETCH
5900         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5901         #endif
5902       }
5903     }
5904     cc=get_reg(branch_regs[i].regmap,CCREG);
5905     assert(cc==HOST_CCREG);
5906     if(unconditional) 
5907       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5908     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5909     assem_debug("cycle count (adj)\n");
5910     if(unconditional) {
5911       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5912       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5913         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5914         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5915         if(internal)
5916           assem_debug("branch: internal\n");
5917         else
5918           assem_debug("branch: external\n");
5919         if(internal&&is_ds[(ba[i]-start)>>2]) {
5920           ds_assemble_entry(i);
5921         }
5922         else {
5923           add_to_linker((int)out,ba[i],internal);
5924           emit_jmp(0);
5925         }
5926         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5927         if(((u_int)out)&7) emit_addnop(0);
5928         #endif
5929       }
5930     }
5931     else if(nevertaken) {
5932       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5933       int jaddr=(int)out;
5934       emit_jns(0);
5935       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5936     }
5937     else {
5938       int nottaken=0;
5939       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5940       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5941       if(!only32)
5942       {
5943         assert(s1h>=0);
5944         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5945         {
5946           emit_test(s1h,s1h);
5947           if(invert){
5948             nottaken=(int)out;
5949             emit_jns(1);
5950           }else{
5951             add_to_linker((int)out,ba[i],internal);
5952             emit_js(0);
5953           }
5954         }
5955         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5956         {
5957           emit_test(s1h,s1h);
5958           if(invert){
5959             nottaken=(int)out;
5960             emit_js(1);
5961           }else{
5962             add_to_linker((int)out,ba[i],internal);
5963             emit_jns(0);
5964           }
5965         }
5966       } // if(!only32)
5967       else
5968       {
5969         assert(s1l>=0);
5970         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5971         {
5972           emit_test(s1l,s1l);
5973           if(invert){
5974             nottaken=(int)out;
5975             emit_jns(1);
5976           }else{
5977             add_to_linker((int)out,ba[i],internal);
5978             emit_js(0);
5979           }
5980         }
5981         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5982         {
5983           emit_test(s1l,s1l);
5984           if(invert){
5985             nottaken=(int)out;
5986             emit_js(1);
5987           }else{
5988             add_to_linker((int)out,ba[i],internal);
5989             emit_jns(0);
5990           }
5991         }
5992       } // if(!only32)
5993           
5994       if(invert) {
5995         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5996         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5997           if(adj) {
5998             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5999             add_to_linker((int)out,ba[i],internal);
6000           }else{
6001             emit_addnop(13);
6002             add_to_linker((int)out,ba[i],internal*2);
6003           }
6004           emit_jmp(0);
6005         }else
6006         #endif
6007         {
6008           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6009           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6010           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6011           if(internal)
6012             assem_debug("branch: internal\n");
6013           else
6014             assem_debug("branch: external\n");
6015           if(internal&&is_ds[(ba[i]-start)>>2]) {
6016             ds_assemble_entry(i);
6017           }
6018           else {
6019             add_to_linker((int)out,ba[i],internal);
6020             emit_jmp(0);
6021           }
6022         }
6023         set_jump_target(nottaken,(int)out);
6024       }
6025
6026       if(adj) {
6027         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6028       }
6029     } // (!unconditional)
6030   } // if(ooo)
6031   else
6032   {
6033     // In-order execution (branch first)
6034     //printf("IOE\n");
6035     int nottaken=0;
6036     if(rt1[i]==31) {
6037       int rt,return_address;
6038       rt=get_reg(branch_regs[i].regmap,31);
6039       if(rt>=0) {
6040         // Save the PC even if the branch is not taken
6041         return_address=start+i*4+8;
6042         emit_movimm(return_address,rt); // PC into link register
6043         #ifdef IMM_PREFETCH
6044         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6045         #endif
6046       }
6047     }
6048     if(!unconditional) {
6049       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6050       if(!only32)
6051       {
6052         assert(s1h>=0);
6053         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6054         {
6055           emit_test(s1h,s1h);
6056           nottaken=(int)out;
6057           emit_jns(1);
6058         }
6059         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6060         {
6061           emit_test(s1h,s1h);
6062           nottaken=(int)out;
6063           emit_js(1);
6064         }
6065       } // if(!only32)
6066       else
6067       {
6068         assert(s1l>=0);
6069         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6070         {
6071           emit_test(s1l,s1l);
6072           nottaken=(int)out;
6073           emit_jns(1);
6074         }
6075         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6076         {
6077           emit_test(s1l,s1l);
6078           nottaken=(int)out;
6079           emit_js(1);
6080         }
6081       }
6082     } // if(!unconditional)
6083     int adj;
6084     uint64_t ds_unneeded=branch_regs[i].u;
6085     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6086     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6087     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6088     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6089     ds_unneeded|=1;
6090     ds_unneeded_upper|=1;
6091     // branch taken
6092     if(!nevertaken) {
6093       //assem_debug("1:\n");
6094       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6095                     ds_unneeded,ds_unneeded_upper);
6096       // load regs
6097       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6098       address_generation(i+1,&branch_regs[i],0);
6099       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6100       ds_assemble(i+1,&branch_regs[i]);
6101       cc=get_reg(branch_regs[i].regmap,CCREG);
6102       if(cc==-1) {
6103         emit_loadreg(CCREG,cc=HOST_CCREG);
6104         // CHECK: Is the following instruction (fall thru) allocated ok?
6105       }
6106       assert(cc==HOST_CCREG);
6107       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6108       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6109       assem_debug("cycle count (adj)\n");
6110       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6111       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6112       if(internal)
6113         assem_debug("branch: internal\n");
6114       else
6115         assem_debug("branch: external\n");
6116       if(internal&&is_ds[(ba[i]-start)>>2]) {
6117         ds_assemble_entry(i);
6118       }
6119       else {
6120         add_to_linker((int)out,ba[i],internal);
6121         emit_jmp(0);
6122       }
6123     }
6124     // branch not taken
6125     cop1_usable=prev_cop1_usable;
6126     if(!unconditional) {
6127       set_jump_target(nottaken,(int)out);
6128       assem_debug("1:\n");
6129       if(!likely[i]) {
6130         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6131                       ds_unneeded,ds_unneeded_upper);
6132         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6133         address_generation(i+1,&branch_regs[i],0);
6134         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6135         ds_assemble(i+1,&branch_regs[i]);
6136       }
6137       cc=get_reg(branch_regs[i].regmap,CCREG);
6138       if(cc==-1&&!likely[i]) {
6139         // Cycle count isn't in a register, temporarily load it then write it out
6140         emit_loadreg(CCREG,HOST_CCREG);
6141         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6142         int jaddr=(int)out;
6143         emit_jns(0);
6144         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6145         emit_storereg(CCREG,HOST_CCREG);
6146       }
6147       else{
6148         cc=get_reg(i_regmap,CCREG);
6149         assert(cc==HOST_CCREG);
6150         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6151         int jaddr=(int)out;
6152         emit_jns(0);
6153         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6154       }
6155     }
6156   }
6157 }
6158
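// BC1T/BC1F: branch on the coprocessor 1 condition flag (bit 23 of the
// status value kept in FSREG).  If COP1 may still be unusable, a FP_STUB
// is emitted first to handle the coprocessor-unusable case.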
6159 void fjump_assemble(int i,struct regstat *i_regs)
6160 {
6161   signed char *i_regmap=i_regs->regmap;
6162   int cc;
6163   int match;
6164   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6165   assem_debug("fmatch=%d\n",match);
6166   int fs,cs;
6167   int eaddr;
6168   int invert=0;
6169   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6170   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6171   if(!match) invert=1;
6172   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6173   if(i>(ba[i]-start)>>2) invert=1;
6174   #endif
6175
6176   if(ooo[i]) {
6177     fs=get_reg(branch_regs[i].regmap,FSREG);
6178     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6179   }
6180   else {
6181     fs=get_reg(i_regmap,FSREG);
6182   }
6183
6184   // Check cop1 unusable
6185   if(!cop1_usable) {
6186     cs=get_reg(i_regmap,CSREG);
6187     assert(cs>=0);
6188     emit_testimm(cs,0x20000000);
6189     eaddr=(int)out;
6190     emit_jeq(0);
6191     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6192     cop1_usable=1;
6193   }
6194
6195   if(ooo[i]) {
6196     // Out of order execution (delay slot first)
6197     //printf("OOOE\n");
6198     ds_assemble(i+1,i_regs);
6199     int adj;
6200     uint64_t bc_unneeded=branch_regs[i].u;
6201     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6202     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6203     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6204     bc_unneeded|=1;
6205     bc_unneeded_upper|=1;
6206     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6207                   bc_unneeded,bc_unneeded_upper);
6208     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6209     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6210     cc=get_reg(branch_regs[i].regmap,CCREG);
6211     assert(cc==HOST_CCREG);
6212     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6213     assem_debug("cycle count (adj)\n");
6214     if(1) {
6215       int nottaken=0;
6216       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6217       if(1) {
6218         assert(fs>=0);
6219         emit_testimm(fs,0x800000);
6220         if(source[i]&0x10000) // BC1T
6221         {
6222           if(invert){
6223             nottaken=(int)out;
6224             emit_jeq(1);
6225           }else{
6226             add_to_linker((int)out,ba[i],internal);
6227             emit_jne(0);
6228           }
6229         }
6230         else // BC1F
6231           if(invert){
6232             nottaken=(int)out;
6233             emit_jne(1);
6234           }else{
6235             add_to_linker((int)out,ba[i],internal);
6236             emit_jeq(0);
6237           }
6240       } // if(!only32)
6241           
6242       if(invert) {
6243         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6244         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6245         else if(match) emit_addnop(13);
6246         #endif
6247         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6248         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6249         if(internal)
6250           assem_debug("branch: internal\n");
6251         else
6252           assem_debug("branch: external\n");
6253         if(internal&&is_ds[(ba[i]-start)>>2]) {
6254           ds_assemble_entry(i);
6255         }
6256         else {
6257           add_to_linker((int)out,ba[i],internal);
6258           emit_jmp(0);
6259         }
6260         set_jump_target(nottaken,(int)out);
6261       }
6262
6263       if(adj) {
6264         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6265       }
6266     } // (!unconditional)
6267   } // if(ooo)
6268   else
6269   {
6270     // In-order execution (branch first)
6271     //printf("IOE\n");
6272     int nottaken=0;
6273     if(1) {
6274       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6275       if(1) {
6276         assert(fs>=0);
6277         emit_testimm(fs,0x800000);
6278         if(source[i]&0x10000) // BC1T
6279         {
6280           nottaken=(int)out;
6281           emit_jeq(1);
6282         }
6283         else // BC1F
6284         {
6285           nottaken=(int)out;
6286           emit_jne(1);
6287         }
6288       }
6289     } // if(!unconditional)
6290     int adj;
6291     uint64_t ds_unneeded=branch_regs[i].u;
6292     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6293     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6294     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6295     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6296     ds_unneeded|=1;
6297     ds_unneeded_upper|=1;
6298     // branch taken
6299     //assem_debug("1:\n");
6300     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6301                   ds_unneeded,ds_unneeded_upper);
6302     // load regs
6303     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6304     address_generation(i+1,&branch_regs[i],0);
6305     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6306     ds_assemble(i+1,&branch_regs[i]);
6307     cc=get_reg(branch_regs[i].regmap,CCREG);
6308     if(cc==-1) {
6309       emit_loadreg(CCREG,cc=HOST_CCREG);
6310       // CHECK: Is the following instruction (fall thru) allocated ok?
6311     }
6312     assert(cc==HOST_CCREG);
6313     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6314     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6315     assem_debug("cycle count (adj)\n");
6316     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6317     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6318     if(internal)
6319       assem_debug("branch: internal\n");
6320     else
6321       assem_debug("branch: external\n");
6322     if(internal&&is_ds[(ba[i]-start)>>2]) {
6323       ds_assemble_entry(i);
6324     }
6325     else {
6326       add_to_linker((int)out,ba[i],internal);
6327       emit_jmp(0);
6328     }
6329
6330     // branch not taken
6331     if(1) { // <- FIXME (don't need this)
6332       set_jump_target(nottaken,(int)out);
6333       assem_debug("1:\n");
6334       if(!likely[i]) {
6335         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6336                       ds_unneeded,ds_unneeded_upper);
6337         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6338         address_generation(i+1,&branch_regs[i],0);
6339         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6340         ds_assemble(i+1,&branch_regs[i]);
6341       }
6342       cc=get_reg(branch_regs[i].regmap,CCREG);
6343       if(cc==-1&&!likely[i]) {
6344         // Cycle count isn't in a register, temporarily load it then write it out
6345         emit_loadreg(CCREG,HOST_CCREG);
6346         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6347         int jaddr=(int)out;
6348         emit_jns(0);
6349         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6350         emit_storereg(CCREG,HOST_CCREG);
6351       }
6352       else{
6353         cc=get_reg(i_regmap,CCREG);
6354         assert(cc==HOST_CCREG);
6355         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6356         int jaddr=(int)out;
6357         emit_jns(0);
6358         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6359       }
6360     }
6361   }
6362 }
6363
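// Handle a branch whose delay slot falls on the next virtual page: the
// chosen target is parked in HOST_BTREG, and control transfers to a
// separately compiled copy of the delay slot (pagespan_ds below), which
// then continues to the stored branch target.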
6364 static void pagespan_assemble(int i,struct regstat *i_regs)
6365 {
6366   int s1l=get_reg(i_regs->regmap,rs1[i]);
6367   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6368   int s2l=get_reg(i_regs->regmap,rs2[i]);
6369   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6370   void *nt_branch=NULL;
6371   int taken=0;
6372   int nottaken=0;
6373   int unconditional=0;
6374   if(rs1[i]==0)
6375   {
6376     s1l=s2l;s1h=s2h;
6377     s2l=s2h=-1;
6378   }
6379   else if(rs2[i]==0)
6380   {
6381     s2l=s2h=-1;
6382   }
6383   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6384     s1h=s2h=-1;
6385   }
6386   int hr=0;
6387   int addr,alt,ntaddr;
6388   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6389   else {
6390     while(hr<HOST_REGS)
6391     {
6392       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6393          (i_regs->regmap[hr]&63)!=rs1[i] &&
6394          (i_regs->regmap[hr]&63)!=rs2[i] )
6395       {
6396         addr=hr++;break;
6397       }
6398       hr++;
6399     }
6400   }
6401   while(hr<HOST_REGS)
6402   {
6403     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6404        (i_regs->regmap[hr]&63)!=rs1[i] &&
6405        (i_regs->regmap[hr]&63)!=rs2[i] )
6406     {
6407       alt=hr++;break;
6408     }
6409     hr++;
6410   }
6411   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6412   {
6413     while(hr<HOST_REGS)
6414     {
6415       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6416          (i_regs->regmap[hr]&63)!=rs1[i] &&
6417          (i_regs->regmap[hr]&63)!=rs2[i] )
6418       {
6419         ntaddr=hr;break;
6420       }
6421       hr++;
6422     }
6423   }
6424   assert(hr<HOST_REGS);
6425   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6426     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6427   }
6428   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6429   if(opcode[i]==2) // J
6430   {
6431     unconditional=1;
6432   }
6433   if(opcode[i]==3) // JAL
6434   {
6435     // TODO: mini_ht
6436     int rt=get_reg(i_regs->regmap,31);
6437     emit_movimm(start+i*4+8,rt);
6438     unconditional=1;
6439   }
6440   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6441   {
6442     emit_mov(s1l,addr);
6443     if(opcode2[i]==9) // JALR
6444     {
6445       int rt=get_reg(i_regs->regmap,rt1[i]);
6446       emit_movimm(start+i*4+8,rt);
6447     }
6448   }
6449   if((opcode[i]&0x3f)==4) // BEQ
6450   {
6451     if(rs1[i]==rs2[i])
6452     {
6453       unconditional=1;
6454     }
6455     else
6456     #ifdef HAVE_CMOV_IMM
6457     if(s1h<0) {
6458       if(s2l>=0) emit_cmp(s1l,s2l);
6459       else emit_test(s1l,s1l);
6460       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6461     }
6462     else
6463     #endif
6464     {
6465       assert(s1l>=0);
6466       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6467       if(s1h>=0) {
6468         if(s2h>=0) emit_cmp(s1h,s2h);
6469         else emit_test(s1h,s1h);
6470         emit_cmovne_reg(alt,addr);
6471       }
6472       if(s2l>=0) emit_cmp(s1l,s2l);
6473       else emit_test(s1l,s1l);
6474       emit_cmovne_reg(alt,addr);
6475     }
6476   }
6477   if((opcode[i]&0x3f)==5) // BNE
6478   {
6479     #ifdef HAVE_CMOV_IMM
6480     if(s1h<0) {
6481       if(s2l>=0) emit_cmp(s1l,s2l);
6482       else emit_test(s1l,s1l);
6483       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6484     }
6485     else
6486     #endif
6487     {
6488       assert(s1l>=0);
6489       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6490       if(s1h>=0) {
6491         if(s2h>=0) emit_cmp(s1h,s2h);
6492         else emit_test(s1h,s1h);
6493         emit_cmovne_reg(alt,addr);
6494       }
6495       if(s2l>=0) emit_cmp(s1l,s2l);
6496       else emit_test(s1l,s1l);
6497       emit_cmovne_reg(alt,addr);
6498     }
6499   }
6500   if((opcode[i]&0x3f)==0x14) // BEQL
6501   {
6502     if(s1h>=0) {
6503       if(s2h>=0) emit_cmp(s1h,s2h);
6504       else emit_test(s1h,s1h);
6505       nottaken=(int)out;
6506       emit_jne(0);
6507     }
6508     if(s2l>=0) emit_cmp(s1l,s2l);
6509     else emit_test(s1l,s1l);
6510     if(nottaken) set_jump_target(nottaken,(int)out);
6511     nottaken=(int)out;
6512     emit_jne(0);
6513   }
6514   if((opcode[i]&0x3f)==0x15) // BNEL
6515   {
6516     if(s1h>=0) {
6517       if(s2h>=0) emit_cmp(s1h,s2h);
6518       else emit_test(s1h,s1h);
6519       taken=(int)out;
6520       emit_jne(0);
6521     }
6522     if(s2l>=0) emit_cmp(s1l,s2l);
6523     else emit_test(s1l,s1l);
6524     nottaken=(int)out;
6525     emit_jeq(0);
6526     if(taken) set_jump_target(taken,(int)out);
6527   }
6528   if((opcode[i]&0x3f)==6) // BLEZ
6529   {
6530     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6531     emit_cmpimm(s1l,1);
6532     if(s1h>=0) emit_mov(addr,ntaddr);
6533     emit_cmovl_reg(alt,addr);
6534     if(s1h>=0) {
6535       emit_test(s1h,s1h);
6536       emit_cmovne_reg(ntaddr,addr);
6537       emit_cmovs_reg(alt,addr);
6538     }
6539   }
6540   if((opcode[i]&0x3f)==7) // BGTZ
6541   {
6542     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6543     emit_cmpimm(s1l,1);
6544     if(s1h>=0) emit_mov(addr,alt);
6545     emit_cmovl_reg(ntaddr,addr);
6546     if(s1h>=0) {
6547       emit_test(s1h,s1h);
6548       emit_cmovne_reg(alt,addr);
6549       emit_cmovs_reg(ntaddr,addr);
6550     }
6551   }
6552   if((opcode[i]&0x3f)==0x16) // BLEZL
6553   {
6554     assert((opcode[i]&0x3f)!=0x16);
6555   }
6556   if((opcode[i]&0x3f)==0x17) // BGTZL
6557   {
6558     assert((opcode[i]&0x3f)!=0x17);
6559   }
6560   assert(opcode[i]!=1); // BLTZ/BGEZ
6561
6562   //FIXME: Check CSREG
6563   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6564     if((source[i]&0x30000)==0) // BC1F
6565     {
6566       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6567       emit_testimm(s1l,0x800000);
6568       emit_cmovne_reg(alt,addr);
6569     }
6570     if((source[i]&0x30000)==0x10000) // BC1T
6571     {
6572       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6573       emit_testimm(s1l,0x800000);
6574       emit_cmovne_reg(alt,addr);
6575     }
6576     if((source[i]&0x30000)==0x20000) // BC1FL
6577     {
6578       emit_testimm(s1l,0x800000);
6579       nottaken=(int)out;
6580       emit_jne(0);
6581     }
6582     if((source[i]&0x30000)==0x30000) // BC1TL
6583     {
6584       emit_testimm(s1l,0x800000);
6585       nottaken=(int)out;
6586       emit_jeq(0);
6587     }
6588   }
6589
6590   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6591   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6592   if(likely[i]||unconditional)
6593   {
6594     emit_movimm(ba[i],HOST_BTREG);
6595   }
6596   else if(addr!=HOST_BTREG)
6597   {
6598     emit_mov(addr,HOST_BTREG);
6599   }
6600   void *branch_addr=out;
6601   emit_jmp(0);
6602   int target_addr=start+i*4+5;
6603   void *stub=out;
6604   void *compiled_target_addr=check_addr(target_addr);
6605   emit_extjump_ds((int)branch_addr,target_addr);
6606   if(compiled_target_addr) {
6607     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6608     add_link(target_addr,stub);
6609   }
6610   else set_jump_target((int)branch_addr,(int)stub);
6611   if(likely[i]) {
6612     // Not-taken path
6613     set_jump_target((int)nottaken,(int)out);
6614     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6615     void *branch_addr=out;
6616     emit_jmp(0);
6617     int target_addr=start+i*4+8;
6618     void *stub=out;
6619     void *compiled_target_addr=check_addr(target_addr);
6620     emit_extjump_ds((int)branch_addr,target_addr);
6621     if(compiled_target_addr) {
6622       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6623       add_link(target_addr,stub);
6624     }
6625     else set_jump_target((int)branch_addr,(int)stub);
6626   }
6627 }
6628
6629 // Assemble the delay slot for the above
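// Note: the branch code above targets start+i*4+5, i.e. the delay slot's
// address plus one, and pagespan_ds() below registers its entry under the
// matching odd vaddr (start+1), presumably so that a lookup for the
// ordinary entry at `start` never lands on this delay-slot stub.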
6630 static void pagespan_ds()
6631 {
6632   assem_debug("initial delay slot:\n");
6633   u_int vaddr=start+1;
6634   u_int page=get_page(vaddr);
6635   u_int vpage=get_vpage(vaddr);
6636   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6637   do_dirty_stub_ds();
6638   ll_add(jump_in+page,vaddr,(void *)out);
6639   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6640   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6641     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6642   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6643     emit_writeword(HOST_BTREG,(int)&branch_target);
6644   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6645   address_generation(0,&regs[0],regs[0].regmap_entry);
6646   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6647     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6648   cop1_usable=0;
6649   is_delayslot=0;
6650   switch(itype[0]) {
6651     case ALU:
6652       alu_assemble(0,&regs[0]);break;
6653     case IMM16:
6654       imm16_assemble(0,&regs[0]);break;
6655     case SHIFT:
6656       shift_assemble(0,&regs[0]);break;
6657     case SHIFTIMM:
6658       shiftimm_assemble(0,&regs[0]);break;
6659     case LOAD:
6660       load_assemble(0,&regs[0]);break;
6661     case LOADLR:
6662       loadlr_assemble(0,&regs[0]);break;
6663     case STORE:
6664       store_assemble(0,&regs[0]);break;
6665     case STORELR:
6666       storelr_assemble(0,&regs[0]);break;
6667     case COP0:
6668       cop0_assemble(0,&regs[0]);break;
6669     case COP1:
6670       cop1_assemble(0,&regs[0]);break;
6671     case C1LS:
6672       c1ls_assemble(0,&regs[0]);break;
6673     case COP2:
6674       cop2_assemble(0,&regs[0]);break;
6675     case C2LS:
6676       c2ls_assemble(0,&regs[0]);break;
6677     case C2OP:
6678       c2op_assemble(0,&regs[0]);break;
6679     case FCONV:
6680       fconv_assemble(0,&regs[0]);break;
6681     case FLOAT:
6682       float_assemble(0,&regs[0]);break;
6683     case FCOMP:
6684       fcomp_assemble(0,&regs[0]);break;
6685     case MULTDIV:
6686       multdiv_assemble(0,&regs[0]);break;
6687     case MOV:
6688       mov_assemble(0,&regs[0]);break;
6689     case SYSCALL:
6690     case HLECALL:
6691     case INTCALL:
6692     case SPAN:
6693     case UJUMP:
6694     case RJUMP:
6695     case CJUMP:
6696     case SJUMP:
6697     case FJUMP:
6698       printf("Jump in the delay slot.  This is probably a bug.\n");
6699   }
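  // Compare the saved branch target with the fall-through address (start+4):
  // if they differ the branch was taken, so flush and jump there through
  // jump_vaddr_reg; if they match, continue into the rest of the block.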
6700   int btaddr=get_reg(regs[0].regmap,BTREG);
6701   if(btaddr<0) {
6702     btaddr=get_reg(regs[0].regmap,-1);
6703     emit_readword((int)&branch_target,btaddr);
6704   }
6705   assert(btaddr!=HOST_CCREG);
6706   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6707 #ifdef HOST_IMM8
6708   emit_movimm(start+4,HOST_TEMPREG);
6709   emit_cmp(btaddr,HOST_TEMPREG);
6710 #else
6711   emit_cmpimm(btaddr,start+4);
6712 #endif
6713   int branch=(int)out;
6714   emit_jeq(0);
6715   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6716   emit_jmp(jump_vaddr_reg[btaddr]);
6717   set_jump_target(branch,(int)out);
6718   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6719   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6720 }
6721
6722 // Basic liveness analysis for MIPS registers
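// Rough sketch of the per-instruction update done by the backward scan
// below (illustrative only; the real loop also merges branch delay slots,
// the 64-bit "upper" masks and the GTE register sets):
//
//   u |= (1LL<<rt1[i]) | (1LL<<rt2[i]);    // regs written here are dead above this point
//   u &= ~((1LL<<rs1[i]) | (1LL<<rs2[i])); // regs read here become needed again
//   u |= 1;                                // r0 is always unneeded
//   unneeded_reg[i] = u;                   // one bit per MIPS reg: "value not needed here"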
6723 void unneeded_registers(int istart,int iend,int r)
6724 {
6725   int i;
6726   uint64_t u,uu,gte_u,b,bu,gte_bu;
6727   uint64_t temp_u,temp_uu,temp_gte_u;
6728   uint64_t tdep;
6729   if(iend==slen-1) {
6730     u=1;uu=1;
6731   }else{
6732     u=unneeded_reg[iend+1];
6733     uu=unneeded_reg_upper[iend+1];
6734     u=1;uu=1;
6735   }
6736   gte_u=temp_gte_u=0;
6737
6738   for (i=iend;i>=istart;i--)
6739   {
6740     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6741     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6742     {
6743       // If subroutine call, flag return address as a possible branch target
6744       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6745       
6746       if(ba[i]<start || ba[i]>=(start+slen*4))
6747       {
6748         // Branch out of this block, flush all regs
6749         u=1;
6750         uu=1;
6751         gte_u=0;
6752         /* Hexagon hack 
6753         if(itype[i]==UJUMP&&rt1[i]==31)
6754         {
6755           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6756         }
6757         if(itype[i]==RJUMP&&rs1[i]==31)
6758         {
6759           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6760         }
6761         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6762           if(itype[i]==UJUMP&&rt1[i]==31)
6763           {
6764             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6765             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6766           }
6767           if(itype[i]==RJUMP&&rs1[i]==31)
6768           {
6769             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6770             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6771           }
6772         }*/
6773         branch_unneeded_reg[i]=u;
6774         branch_unneeded_reg_upper[i]=uu;
6775         // Merge in delay slot
6776         tdep=(~uu>>rt1[i+1])&1;
6777         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6778         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6779         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6780         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6781         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6782         u|=1;uu|=1;
6783         gte_u|=gte_rt[i+1];
6784         gte_u&=~gte_rs[i+1];
6785         // If branch is "likely" (and conditional)
6786         // then we skip the delay slot on the fall-thru path
6787         if(likely[i]) {
6788           if(i<slen-1) {
6789             u&=unneeded_reg[i+2];
6790             uu&=unneeded_reg_upper[i+2];
6791             gte_u&=gte_unneeded[i+2];
6792           }
6793           else
6794           {
6795             u=1;
6796             uu=1;
6797             gte_u=0;
6798           }
6799         }
6800       }
6801       else
6802       {
6803         // Internal branch, flag target
6804         bt[(ba[i]-start)>>2]=1;
6805         if(ba[i]<=start+i*4) {
6806           // Backward branch
6807           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6808           {
6809             // Unconditional branch
6810             temp_u=1;temp_uu=1;
6811             temp_gte_u=0;
6812           } else {
6813             // Conditional branch (not taken case)
6814             temp_u=unneeded_reg[i+2];
6815             temp_uu=unneeded_reg_upper[i+2];
6816             temp_gte_u&=gte_unneeded[i+2];
6817           }
6818           // Merge in delay slot
6819           tdep=(~temp_uu>>rt1[i+1])&1;
6820           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6821           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6822           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6823           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6824           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6825           temp_u|=1;temp_uu|=1;
6826           temp_gte_u|=gte_rt[i+1];
6827           temp_gte_u&=~gte_rs[i+1];
6828           // If branch is "likely" (and conditional)
6829           // then we skip the delay slot on the fall-thru path
6830           if(likely[i]) {
6831             if(i<slen-1) {
6832               temp_u&=unneeded_reg[i+2];
6833               temp_uu&=unneeded_reg_upper[i+2];
6834               temp_gte_u&=gte_unneeded[i+2];
6835             }
6836             else
6837             {
6838               temp_u=1;
6839               temp_uu=1;
6840               temp_gte_u=0;
6841             }
6842           }
6843           tdep=(~temp_uu>>rt1[i])&1;
6844           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6845           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6846           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6847           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6848           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6849           temp_u|=1;temp_uu|=1;
6850           temp_gte_u|=gte_rt[i];
6851           temp_gte_u&=~gte_rs[i];
6852           unneeded_reg[i]=temp_u;
6853           unneeded_reg_upper[i]=temp_uu;
6854           gte_unneeded[i]=temp_gte_u;
6855           // Only go three levels deep.  This recursion can take an
6856           // excessive amount of time if there are a lot of nested loops.
6857           if(r<2) {
6858             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6859           }else{
6860             unneeded_reg[(ba[i]-start)>>2]=1;
6861             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6862             gte_unneeded[(ba[i]-start)>>2]=0;
6863           }
6864         } /*else*/ if(1) {
6865           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6866           {
6867             // Unconditional branch
6868             u=unneeded_reg[(ba[i]-start)>>2];
6869             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6870             gte_u=gte_unneeded[(ba[i]-start)>>2];
6871             branch_unneeded_reg[i]=u;
6872             branch_unneeded_reg_upper[i]=uu;
6873         //u=1;
6874         //uu=1;
6875         //branch_unneeded_reg[i]=u;
6876         //branch_unneeded_reg_upper[i]=uu;
6877             // Merge in delay slot
6878             tdep=(~uu>>rt1[i+1])&1;
6879             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6880             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6881             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6882             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6883             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6884             u|=1;uu|=1;
6885             gte_u|=gte_rt[i+1];
6886             gte_u&=~gte_rs[i+1];
6887           } else {
6888             // Conditional branch
6889             b=unneeded_reg[(ba[i]-start)>>2];
6890             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6891             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6892             branch_unneeded_reg[i]=b;
6893             branch_unneeded_reg_upper[i]=bu;
6894         //b=1;
6895         //bu=1;
6896         //branch_unneeded_reg[i]=b;
6897         //branch_unneeded_reg_upper[i]=bu;
6898             // Branch delay slot
6899             tdep=(~uu>>rt1[i+1])&1;
6900             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6901             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6902             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6903             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6904             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6905             b|=1;bu|=1;
6906             gte_bu|=gte_rt[i+1];
6907             gte_bu&=~gte_rs[i+1];
6908             // If branch is "likely" then we skip the
6909             // delay slot on the fall-thru path
6910             if(likely[i]) {
6911               u=b;
6912               uu=bu;
6913               gte_u=gte_bu;
6914               if(i<slen-1) {
6915                 u&=unneeded_reg[i+2];
6916                 uu&=unneeded_reg_upper[i+2];
6917                 gte_u&=gte_unneeded[i+2];
6918         //u=1;
6919         //uu=1;
6920               }
6921             } else {
6922               u&=b;
6923               uu&=bu;
6924               gte_u&=gte_bu;
6925         //u=1;
6926         //uu=1;
6927             }
6928             if(i<slen-1) {
6929               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6930               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6931         //branch_unneeded_reg[i]=1;
6932         //branch_unneeded_reg_upper[i]=1;
6933             } else {
6934               branch_unneeded_reg[i]=1;
6935               branch_unneeded_reg_upper[i]=1;
6936             }
6937           }
6938         }
6939       }
6940     }
6941     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6942     {
6943       // SYSCALL instruction (software interrupt)
6944       u=1;
6945       uu=1;
6946     }
6947     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6948     {
6949       // ERET instruction (return from interrupt)
6950       u=1;
6951       uu=1;
6952     }
6953     //u=uu=1; // DEBUG
6954     tdep=(~uu>>rt1[i])&1;
6955     // Written registers are unneeded
6956     u|=1LL<<rt1[i];
6957     u|=1LL<<rt2[i];
6958     uu|=1LL<<rt1[i];
6959     uu|=1LL<<rt2[i];
6960     gte_u|=gte_rt[i];
6961     // Accessed registers are needed
6962     u&=~(1LL<<rs1[i]);
6963     u&=~(1LL<<rs2[i]);
6964     uu&=~(1LL<<us1[i]);
6965     uu&=~(1LL<<us2[i]);
6966     gte_u&=~gte_rs[i];
6967     // Source-target dependencies
6968     uu&=~(tdep<<dep1[i]);
6969     uu&=~(tdep<<dep2[i]);
6970     // R0 is always unneeded
6971     u|=1;uu|=1;
6972     // Save it
6973     unneeded_reg[i]=u;
6974     unneeded_reg_upper[i]=uu;
6975     gte_unneeded[i]=gte_u;
6976     /*
6977     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6978     printf("U:");
6979     int r;
6980     for(r=1;r<=CCREG;r++) {
6981       if((unneeded_reg[i]>>r)&1) {
6982         if(r==HIREG) printf(" HI");
6983         else if(r==LOREG) printf(" LO");
6984         else printf(" r%d",r);
6985       }
6986     }
6987     printf(" UU:");
6988     for(r=1;r<=CCREG;r++) {
6989       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6990         if(r==HIREG) printf(" HI");
6991         else if(r==LOREG) printf(" LO");
6992         else printf(" r%d",r);
6993       }
6994     }
6995     printf("\n");*/
6996   }
6997 #ifdef FORCE32
6998   for (i=iend;i>=istart;i--)
6999   {
7000     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7001   }
7002 #endif
7003 }
7004
7005 // Identify registers which are likely to contain 32-bit values
7006 // This is used to predict whether any branches will jump to a
7007 // location with 64-bit values in registers.
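// Sketch of the forward propagation below (illustrative): each bit of is32
// means "this MIPS reg is believed to hold a 32-bit value".  For example,
// ORI/XORI results are 32-bit only if the source register was:
//
//   uint64_t sr = (is32 >> s1) & 1LL;  // was the source 32-bit?
//   is32 &= ~(1LL << rt);              // assume 64-bit by default
//   is32 |= sr << rt;                  // ...unless the source was 32-bit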
7008 static void provisional_32bit()
7009 {
7010   int i,j;
7011   uint64_t is32=1;
7012   uint64_t lastbranch=1;
7013   
7014   for(i=0;i<slen;i++)
7015   {
7016     if(i>0) {
7017       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7018         if(i>1) is32=lastbranch;
7019         else is32=1;
7020       }
7021     }
7022     if(i>1)
7023     {
7024       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7025         if(likely[i-2]) {
7026           if(i>2) is32=lastbranch;
7027           else is32=1;
7028         }
7029       }
7030       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7031       {
7032         if(rs1[i-2]==0||rs2[i-2]==0)
7033         {
7034           if(rs1[i-2]) {
7035             is32|=1LL<<rs1[i-2];
7036           }
7037           if(rs2[i-2]) {
7038             is32|=1LL<<rs2[i-2];
7039           }
7040         }
7041       }
7042     }
7043     // If something jumps here with 64-bit values
7044     // then promote those registers to 64 bits
7045     if(bt[i])
7046     {
7047       uint64_t temp_is32=is32;
7048       for(j=i-1;j>=0;j--)
7049       {
7050         if(ba[j]==start+i*4) 
7051           //temp_is32&=branch_regs[j].is32;
7052           temp_is32&=p32[j];
7053       }
7054       for(j=i;j<slen;j++)
7055       {
7056         if(ba[j]==start+i*4) 
7057           temp_is32=1;
7058       }
7059       is32=temp_is32;
7060     }
7061     int type=itype[i];
7062     int op=opcode[i];
7063     int op2=opcode2[i];
7064     int rt=rt1[i];
7065     int s1=rs1[i];
7066     int s2=rs2[i];
7067     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7068       // Branches don't write registers, consider the delay slot instead.
7069       type=itype[i+1];
7070       op=opcode[i+1];
7071       op2=opcode2[i+1];
7072       rt=rt1[i+1];
7073       s1=rs1[i+1];
7074       s2=rs2[i+1];
7075       lastbranch=is32;
7076     }
7077     switch(type) {
7078       case LOAD:
7079         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7080            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7081           is32&=~(1LL<<rt);
7082         else
7083           is32|=1LL<<rt;
7084         break;
7085       case STORE:
7086       case STORELR:
7087         break;
7088       case LOADLR:
7089         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7090         if(op==0x22) is32|=1LL<<rt; // LWL
7091         break;
7092       case IMM16:
7093         if (op==0x08||op==0x09|| // ADDI/ADDIU
7094             op==0x0a||op==0x0b|| // SLTI/SLTIU
7095             op==0x0c|| // ANDI
7096             op==0x0f)  // LUI
7097         {
7098           is32|=1LL<<rt;
7099         }
7100         if(op==0x18||op==0x19) { // DADDI/DADDIU
7101           is32&=~(1LL<<rt);
7102           //if(imm[i]==0)
7103           //  is32|=((is32>>s1)&1LL)<<rt;
7104         }
7105         if(op==0x0d||op==0x0e) { // ORI/XORI
7106           uint64_t sr=((is32>>s1)&1LL);
7107           is32&=~(1LL<<rt);
7108           is32|=sr<<rt;
7109         }
7110         break;
7111       case UJUMP:
7112         break;
7113       case RJUMP:
7114         break;
7115       case CJUMP:
7116         break;
7117       case SJUMP:
7118         break;
7119       case FJUMP:
7120         break;
7121       case ALU:
7122         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7123           is32|=1LL<<rt;
7124         }
7125         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7126           is32|=1LL<<rt;
7127         }
7128         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7129           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7130           is32&=~(1LL<<rt);
7131           is32|=sr<<rt;
7132         }
7133         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7134           if(s1==0&&s2==0) {
7135             is32|=1LL<<rt;
7136           }
7137           else if(s2==0) {
7138             uint64_t sr=((is32>>s1)&1LL);
7139             is32&=~(1LL<<rt);
7140             is32|=sr<<rt;
7141           }
7142           else if(s1==0) {
7143             uint64_t sr=((is32>>s2)&1LL);
7144             is32&=~(1LL<<rt);
7145             is32|=sr<<rt;
7146           }
7147           else {
7148             is32&=~(1LL<<rt);
7149           }
7150         }
7151         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7152           if(s1==0&&s2==0) {
7153             is32|=1LL<<rt;
7154           }
7155           else if(s2==0) {
7156             uint64_t sr=((is32>>s1)&1LL);
7157             is32&=~(1LL<<rt);
7158             is32|=sr<<rt;
7159           }
7160           else {
7161             is32&=~(1LL<<rt);
7162           }
7163         }
7164         break;
7165       case MULTDIV:
7166         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7167           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7168         }
7169         else {
7170           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7171         }
7172         break;
7173       case MOV:
7174         {
7175           uint64_t sr=((is32>>s1)&1LL);
7176           is32&=~(1LL<<rt);
7177           is32|=sr<<rt;
7178         }
7179         break;
7180       case SHIFT:
7181         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7182         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7183         break;
7184       case SHIFTIMM:
7185         is32|=1LL<<rt;
7186         // DSLL/DSRL/DSRA/DSLL32/DSRL32 (but not DSRA32) produce a 64-bit result
7187         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7188         break;
7189       case COP0:
7190         if(op2==0) is32|=1LL<<rt; // MFC0
7191         break;
7192       case COP1:
7193       case COP2:
7194         if(op2==0) is32|=1LL<<rt; // MFC1
7195         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7196         if(op2==2) is32|=1LL<<rt; // CFC1
7197         break;
7198       case C1LS:
7199       case C2LS:
7200         break;
7201       case FLOAT:
7202       case FCONV:
7203         break;
7204       case FCOMP:
7205         break;
7206       case C2OP:
7207       case SYSCALL:
7208       case HLECALL:
7209         break;
7210       default:
7211         break;
7212     }
7213     is32|=1;
7214     p32[i]=is32;
7215
7216     if(i>0)
7217     {
7218       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7219       {
7220         if(rt1[i-1]==31) // JAL/JALR
7221         {
7222           // Subroutine call will return here, don't alloc any registers
7223           is32=1;
7224         }
7225         else if(i+1<slen)
7226         {
7227           // Internal branch will jump here, match registers to caller
7228           is32=0x3FFFFFFFFLL;
7229         }
7230       }
7231     }
7232   }
7233 }
7234
7235 // Identify registers which may be assumed to contain 32-bit values
7236 // and where optimizations will rely on this.
7237 // This is used to determine whether backward branches can safely
7238 // jump to a location with 64-bit values in registers.
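// Sketch of the backward pass below (illustrative): r32 accumulates the
// registers whose 32-bit-ness later code depends on, so a branch into this
// point must not arrive with 64-bit values in them.  Destinations drop out,
// 32-bit sources are added back:
//
//   r32 &= ~(1LL << rt1[i]);                                        // written here
//   if(us1[i]>0 && ((regs[i].was32>>us1[i])&1)) r32 |= 1LL<<us1[i]; // 32-bit source
//   pr32[i] = r32;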
7239 static void provisional_r32()
7240 {
7241   u_int r32=0;
7242   int i;
7243   
7244   for (i=slen-1;i>=0;i--)
7245   {
7246     int hr;
7247     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7248     {
7249       if(ba[i]<start || ba[i]>=(start+slen*4))
7250       {
7251         // Branch out of this block, don't need anything
7252         r32=0;
7253       }
7254       else
7255       {
7256         // Internal branch
7257         // Need whatever matches the target
7258         // (and doesn't get overwritten by the delay slot instruction)
7259         r32=0;
7260         int t=(ba[i]-start)>>2;
7261         if(ba[i]>start+i*4) {
7262           // Forward branch
7263           //if(!(requires_32bit[t]&~regs[i].was32))
7264           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7265           if(!(pr32[t]&~regs[i].was32))
7266             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7267         }else{
7268           // Backward branch
7269           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7270             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7271         }
7272       }
7273       // Conditional branch may need registers for following instructions
7274       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7275       {
7276         if(i<slen-2) {
7277           //r32|=requires_32bit[i+2];
7278           r32|=pr32[i+2];
7279           r32&=regs[i].was32;
7280           // Mark this address as a branch target since it may be called
7281           // upon return from interrupt
7282           //bt[i+2]=1;
7283         }
7284       }
7285       // Merge in delay slot
7286       if(!likely[i]) {
7287         // These are overwritten unless the branch is "likely"
7288         // and the delay slot is nullified if not taken
7289         r32&=~(1LL<<rt1[i+1]);
7290         r32&=~(1LL<<rt2[i+1]);
7291       }
7292       // Assume these are needed (delay slot)
7293       if(us1[i+1]>0)
7294       {
7295         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7296       }
7297       if(us2[i+1]>0)
7298       {
7299         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7300       }
7301       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7302       {
7303         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7304       }
7305       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7306       {
7307         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7308       }
7309     }
7310     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7311     {
7312       // SYSCALL instruction (software interrupt)
7313       r32=0;
7314     }
7315     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7316     {
7317       // ERET instruction (return from interrupt)
7318       r32=0;
7319     }
7320     // Check 32 bits
7321     r32&=~(1LL<<rt1[i]);
7322     r32&=~(1LL<<rt2[i]);
7323     if(us1[i]>0)
7324     {
7325       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7326     }
7327     if(us2[i]>0)
7328     {
7329       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7330     }
7331     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7332     {
7333       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7334     }
7335     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7336     {
7337       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7338     }
7339     //requires_32bit[i]=r32;
7340     pr32[i]=r32;
7341     
7342     // Dirty registers which are 32-bit require 32-bit input,
7343     // as they will be written back as 32-bit values
7344     for(hr=0;hr<HOST_REGS;hr++)
7345     {
7346       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7347         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7348           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7349           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7350           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7351         }
7352       }
7353     }
7354   }
7355 }
7356
7357 // Write back dirty registers as soon as we will no longer modify them,
7358 // so that we don't end up with lots of writes at the branches.
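// Rough reading of the backward dataflow below (not authoritative): per host
// register r, will_dirty/wont_dirty track whether r still gets dirtied
// further down the block, so writebacks can be placed where a register is
// modified for the last time.  The per-instruction merge looks roughly like:
//
//   if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i |= 1<<r;    // written by this insn
//   if(regs[i].regmap[r]<=0)           will_dirty_i &= ~(1<<r); // unmapped host reg / r0
//   if(wr) regs[i].dirty |= will_dirty_i;                       // apply results when wr is set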
7359 void clean_registers(int istart,int iend,int wr)
7360 {
7361   int i;
7362   int r;
7363   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7364   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7365   if(iend==slen-1) {
7366     will_dirty_i=will_dirty_next=0;
7367     wont_dirty_i=wont_dirty_next=0;
7368   }else{
7369     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7370     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7371   }
7372   for (i=iend;i>=istart;i--)
7373   {
7374     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7375     {
7376       if(ba[i]<start || ba[i]>=(start+slen*4))
7377       {
7378         // Branch out of this block, flush all regs
7379         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7380         {
7381           // Unconditional branch
7382           will_dirty_i=0;
7383           wont_dirty_i=0;
7384           // Merge in delay slot (will dirty)
7385           for(r=0;r<HOST_REGS;r++) {
7386             if(r!=EXCLUDE_REG) {
7387               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7388               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7389               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7390               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7391               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7392               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7393               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7394               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7395               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7396               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7397               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7398               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7399               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7400               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7401             }
7402           }
7403         }
7404         else
7405         {
7406           // Conditional branch
7407           will_dirty_i=0;
7408           wont_dirty_i=wont_dirty_next;
7409           // Merge in delay slot (will dirty)
7410           for(r=0;r<HOST_REGS;r++) {
7411             if(r!=EXCLUDE_REG) {
7412               if(!likely[i]) {
7413                 // Might not dirty if likely branch is not taken
7414                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7415                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7416                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7417                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7418                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7419                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7420                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7421                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7422                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7423                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7424                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7425                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7426                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7427                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7428               }
7429             }
7430           }
7431         }
7432         // Merge in delay slot (wont dirty)
7433         for(r=0;r<HOST_REGS;r++) {
7434           if(r!=EXCLUDE_REG) {
7435             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7436             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7437             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7438             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7439             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7440             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7441             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7442             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7443             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7444             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7445           }
7446         }
7447         if(wr) {
7448           #ifndef DESTRUCTIVE_WRITEBACK
7449           branch_regs[i].dirty&=wont_dirty_i;
7450           #endif
7451           branch_regs[i].dirty|=will_dirty_i;
7452         }
7453       }
7454       else
7455       {
7456         // Internal branch
7457         if(ba[i]<=start+i*4) {
7458           // Backward branch
7459           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7460           {
7461             // Unconditional branch
7462             temp_will_dirty=0;
7463             temp_wont_dirty=0;
7464             // Merge in delay slot (will dirty)
7465             for(r=0;r<HOST_REGS;r++) {
7466               if(r!=EXCLUDE_REG) {
7467                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7468                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7469                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7470                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7471                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7472                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7473                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7474                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7475                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7476                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7477                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7478                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7479                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7480                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7481               }
7482             }
7483           } else {
7484             // Conditional branch (not taken case)
7485             temp_will_dirty=will_dirty_next;
7486             temp_wont_dirty=wont_dirty_next;
7487             // Merge in delay slot (will dirty)
7488             for(r=0;r<HOST_REGS;r++) {
7489               if(r!=EXCLUDE_REG) {
7490                 if(!likely[i]) {
7491                   // Will not dirty if likely branch is not taken
7492                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7493                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7494                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7495                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7496                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7497                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7498                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7499                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7500                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7501                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7502                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7503                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7504                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7505                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7506                 }
7507               }
7508             }
7509           }
7510           // Merge in delay slot (wont dirty)
7511           for(r=0;r<HOST_REGS;r++) {
7512             if(r!=EXCLUDE_REG) {
7513               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7514               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7515               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7516               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7517               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7518               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7519               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7520               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7521               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7522               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7523             }
7524           }
7525           // Deal with changed mappings
7526           if(i<iend) {
7527             for(r=0;r<HOST_REGS;r++) {
7528               if(r!=EXCLUDE_REG) {
7529                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7530                   temp_will_dirty&=~(1<<r);
7531                   temp_wont_dirty&=~(1<<r);
7532                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7533                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7534                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7535                   } else {
7536                     temp_will_dirty|=1<<r;
7537                     temp_wont_dirty|=1<<r;
7538                   }
7539                 }
7540               }
7541             }
7542           }
7543           if(wr) {
7544             will_dirty[i]=temp_will_dirty;
7545             wont_dirty[i]=temp_wont_dirty;
7546             clean_registers((ba[i]-start)>>2,i-1,0);
7547           }else{
7548             // Limit recursion.  It can take an excessive amount
7549             // of time if there are a lot of nested loops.
7550             will_dirty[(ba[i]-start)>>2]=0;
7551             wont_dirty[(ba[i]-start)>>2]=-1;
7552           }
7553         }
7554         /*else*/ if(1)
7555         {
7556           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7557           {
7558             // Unconditional branch
7559             will_dirty_i=0;
7560             wont_dirty_i=0;
7561           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7562             for(r=0;r<HOST_REGS;r++) {
7563               if(r!=EXCLUDE_REG) {
7564                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7565                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7566                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7567                 }
7568                 if(branch_regs[i].regmap[r]>=0) {
7569                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7570                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7571                 }
7572               }
7573             }
7574           //}
7575             // Merge in delay slot
7576             for(r=0;r<HOST_REGS;r++) {
7577               if(r!=EXCLUDE_REG) {
7578                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7579                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7580                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7581                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7582                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7583                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7584                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7585                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7586                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7587                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7588                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7589                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7590                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7591                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7592               }
7593             }
7594           } else {
7595             // Conditional branch
7596             will_dirty_i=will_dirty_next;
7597             wont_dirty_i=wont_dirty_next;
7598           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7599             for(r=0;r<HOST_REGS;r++) {
7600               if(r!=EXCLUDE_REG) {
7601                 signed char target_reg=branch_regs[i].regmap[r];
7602                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7603                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7604                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7605                 }
7606                 else if(target_reg>=0) {
7607                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7608                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7609                 }
7610                 // Treat delay slot as part of branch too
7611                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7612                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7613                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7614                 }
7615                 else
7616                 {
7617                   will_dirty[i+1]&=~(1<<r);
7618                 }*/
7619               }
7620             }
7621           //}
7622             // Merge in delay slot
7623             for(r=0;r<HOST_REGS;r++) {
7624               if(r!=EXCLUDE_REG) {
7625                 if(!likely[i]) {
7626                   // Might not dirty if likely branch is not taken
7627                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7628                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7629                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7630                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7631                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7632                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7633                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7634                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7635                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7636                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7637                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7638                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7639                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7640                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7641                 }
7642               }
7643             }
7644           }
7645           // Merge in delay slot (won't dirty)
7646           for(r=0;r<HOST_REGS;r++) {
7647             if(r!=EXCLUDE_REG) {
7648               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7649               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7650               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7651               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7652               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7653               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7654               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7655               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7656               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7657               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7658             }
7659           }
7660           if(wr) {
7661             #ifndef DESTRUCTIVE_WRITEBACK
7662             branch_regs[i].dirty&=wont_dirty_i;
7663             #endif
7664             branch_regs[i].dirty|=will_dirty_i;
7665           }
7666         }
7667       }
7668     }
7669     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7670     {
7671       // SYSCALL instruction (software interrupt)
7672       will_dirty_i=0;
7673       wont_dirty_i=0;
7674     }
7675     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7676     {
7677       // ERET instruction (return from interrupt)
7678       will_dirty_i=0;
7679       wont_dirty_i=0;
7680     }
7681     will_dirty_next=will_dirty_i;
7682     wont_dirty_next=wont_dirty_i;
7683     for(r=0;r<HOST_REGS;r++) {
7684       if(r!=EXCLUDE_REG) {
7685         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7686         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7687         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7688         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7689         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7690         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7691         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7692         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7693         if(i>istart) {
7694           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7695           {
7696             // Don't store a register immediately after writing it,
7697             // as doing so may prevent dual-issue.
7698             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7699             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7700           }
7701         }
7702       }
7703     }
7704     // Save it
7705     will_dirty[i]=will_dirty_i;
7706     wont_dirty[i]=wont_dirty_i;
7707     // Mark registers that won't be dirtied as not dirty
7708     if(wr) {
7709       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7710       for(r=0;r<HOST_REGS;r++) {
7711         if((will_dirty_i>>r)&1) {
7712           printf(" r%d",r);
7713         }
7714       }
7715       printf("\n");*/
7716
7717       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7718         regs[i].dirty|=will_dirty_i;
7719         #ifndef DESTRUCTIVE_WRITEBACK
7720         regs[i].dirty&=wont_dirty_i;
7721         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7722         {
7723           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7724             for(r=0;r<HOST_REGS;r++) {
7725               if(r!=EXCLUDE_REG) {
7726                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7727                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7728                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7729               }
7730             }
7731           }
7732         }
7733         else
7734         {
7735           if(i<iend) {
7736             for(r=0;r<HOST_REGS;r++) {
7737               if(r!=EXCLUDE_REG) {
7738                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7739                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7740                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
7741               }
7742             }
7743           }
7744         }
7745         #endif
7746       //}
7747     }
7748     // Deal with changed mappings
7749     temp_will_dirty=will_dirty_i;
7750     temp_wont_dirty=wont_dirty_i;
7751     for(r=0;r<HOST_REGS;r++) {
7752       if(r!=EXCLUDE_REG) {
7753         int nr;
7754         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7755           if(wr) {
7756             #ifndef DESTRUCTIVE_WRITEBACK
7757             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7758             #endif
7759             regs[i].wasdirty|=will_dirty_i&(1<<r);
7760           }
7761         }
7762         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7763           // Register moved to a different register
7764           will_dirty_i&=~(1<<r);
7765           wont_dirty_i&=~(1<<r);
7766           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7767           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7768           if(wr) {
7769             #ifndef DESTRUCTIVE_WRITEBACK
7770             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7771             #endif
7772             regs[i].wasdirty|=will_dirty_i&(1<<r);
7773           }
7774         }
7775         else {
7776           will_dirty_i&=~(1<<r);
7777           wont_dirty_i&=~(1<<r);
7778           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7779             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7780             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7781           } else {
7782             wont_dirty_i|=1<<r;
7783             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r); assert(!((will_dirty>>r)&1));*/
7784           }
7785         }
7786       }
7787     }
7788   }
7789 }
7790
7791 #ifdef DISASM
7792   /* disassembly */
7793 void disassemble_inst(int i)
7794 {
7795     if (bt[i]) printf("*"); else printf(" ");
7796     switch(itype[i]) {
7797       case UJUMP:
7798         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7799       case CJUMP:
7800         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7801       case SJUMP:
7802         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7803       case FJUMP:
7804         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7805       case RJUMP:
7806         if (opcode[i]==0x9&&rt1[i]!=31)
7807           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7808         else
7809           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7810         break;
7811       case SPAN:
7812         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7813       case IMM16:
7814         if(opcode[i]==0xf) //LUI
7815           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7816         else
7817           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7818         break;
7819       case LOAD:
7820       case LOADLR:
7821         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7822         break;
7823       case STORE:
7824       case STORELR:
7825         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7826         break;
7827       case ALU:
7828       case SHIFT:
7829         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7830         break;
7831       case MULTDIV:
7832         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7833         break;
7834       case SHIFTIMM:
7835         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7836         break;
7837       case MOV:
7838         if((opcode2[i]&0x1d)==0x10)
7839           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7840         else if((opcode2[i]&0x1d)==0x11)
7841           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7842         else
7843           printf (" %x: %s\n",start+i*4,insn[i]);
7844         break;
7845       case COP0:
7846         if(opcode2[i]==0)
7847           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7848         else if(opcode2[i]==4)
7849           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7850         else printf (" %x: %s\n",start+i*4,insn[i]);
7851         break;
7852       case COP1:
7853         if(opcode2[i]<3)
7854           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7855         else if(opcode2[i]>3)
7856           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7857         else printf (" %x: %s\n",start+i*4,insn[i]);
7858         break;
7859       case COP2:
7860         if(opcode2[i]<3)
7861           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7862         else if(opcode2[i]>3)
7863           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7864         else printf (" %x: %s\n",start+i*4,insn[i]);
7865         break;
7866       case C1LS:
7867         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7868         break;
7869       case C2LS:
7870         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7871         break;
7872       case INTCALL:
7873         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7874         break;
7875       default:
7876         //printf (" %s %8x\n",insn[i],source[i]);
7877         printf (" %x: %s\n",start+i*4,insn[i]);
7878     }
7879 }
7880 #else
7881 static void disassemble_inst(int i) {}
7882 #endif // DISASM
7883
7884 // clear the state completely, instead of just marking
7885 // things invalid like invalidate_all_pages() does
7886 void new_dynarec_clear_full()
7887 {
7888   int n;
7889   out=(u_char *)BASE_ADDR;
7890   memset(invalid_code,1,sizeof(invalid_code));
7891   memset(hash_table,0xff,sizeof(hash_table));
7892   memset(mini_ht,-1,sizeof(mini_ht));
7893   memset(restore_candidate,0,sizeof(restore_candidate));
7894   memset(shadow,0,sizeof(shadow));
7895   copy=shadow;
7896   expirep=16384; // Expiry pointer, +2 blocks
7897   pending_exception=0;
7898   literalcount=0;
7899   stop_after_jal=0;
7900   inv_code_start=inv_code_end=~0;
7901   gte_reads_flags=0;
7902   // TLB
7903 #ifndef DISABLE_TLB
7904   using_tlb=0;
7905 #endif
7906   sp_in_mirror=0;
7907   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7908     memory_map[n]=-1;
7909   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7910     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7911   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7912     memory_map[n]=-1;
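  // memory_map[] holds, per 4KB page, (host base - guest base)>>2, so guest
  // addresses translate with a shift and add (see the TLB path in
  // new_recompile_block), roughly:
  //
  //   u_int *host = (u_int *)(vaddr + (memory_map[vaddr>>12] << 2));
  //
  // A value of -1 marks an unmapped page.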
7913   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7914   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7915   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7916 }
7917
7918 void new_dynarec_init()
7919 {
7920   printf("Init new dynarec\n");
7921   out=(u_char *)BASE_ADDR;
7922   if (mmap (out, 1<<TARGET_SIZE_2,
7923             PROT_READ | PROT_WRITE | PROT_EXEC,
7924             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7925             -1, 0) <= 0) {printf("mmap() failed\n");}
7926 #ifdef MUPEN64
7927   rdword=&readmem_dword;
7928   fake_pc.f.r.rs=&readmem_dword;
7929   fake_pc.f.r.rt=&readmem_dword;
7930   fake_pc.f.r.rd=&readmem_dword;
7931 #endif
7932   int n;
7933   new_dynarec_clear_full();
7934 #ifdef HOST_IMM8
7935   // Copy this into local area so we don't have to put it in every literal pool
7936   invc_ptr=invalid_code;
7937 #endif
7938 #ifdef MUPEN64
7939   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7940     writemem[n] = write_nomem_new;
7941     writememb[n] = write_nomemb_new;
7942     writememh[n] = write_nomemh_new;
7943 #ifndef FORCE32
7944     writememd[n] = write_nomemd_new;
7945 #endif
7946     readmem[n] = read_nomem_new;
7947     readmemb[n] = read_nomemb_new;
7948     readmemh[n] = read_nomemh_new;
7949 #ifndef FORCE32
7950     readmemd[n] = read_nomemd_new;
7951 #endif
7952   }
7953   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7954     writemem[n] = write_rdram_new;
7955     writememb[n] = write_rdramb_new;
7956     writememh[n] = write_rdramh_new;
7957 #ifndef FORCE32
7958     writememd[n] = write_rdramd_new;
7959 #endif
7960   }
7961   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7962     writemem[n] = write_nomem_new;
7963     writememb[n] = write_nomemb_new;
7964     writememh[n] = write_nomemh_new;
7965 #ifndef FORCE32
7966     writememd[n] = write_nomemd_new;
7967 #endif
7968     readmem[n] = read_nomem_new;
7969     readmemb[n] = read_nomemb_new;
7970     readmemh[n] = read_nomemh_new;
7971 #ifndef FORCE32
7972     readmemd[n] = read_nomemd_new;
7973 #endif
7974   }
7975 #endif
7976   tlb_hacks();
7977   arch_init();
7978 }
7979
7980 void new_dynarec_cleanup()
7981 {
7982   int n;
7983   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7984   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7985   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7986   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7987   #ifdef ROM_COPY
7988   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7989   #endif
7990 }
7991
7992 int new_recompile_block(int addr)
7993 {
7994 /*
7995   if(addr==0x800cd050) {
7996     int block;
7997     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7998     int n;
7999     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
8000   }
8001 */
8002   //if(Count==365117028) tracedebug=1;
8003   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8004   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8005   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8006   //if(debug) 
8007   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8008   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8009   /*if(Count>=312978186) {
8010     rlist();
8011   }*/
8012   //rlist();
8013   start = (u_int)addr&~3;
8014   //assert(((u_int)addr&1)==0);
8015   new_dynarec_did_compile=1;
8016 #ifdef PCSX
8017   if(!sp_in_mirror&&(signed int)(psxRegs.GPR.n.sp&0xffe00000)>0x80200000&&
8018      0x10000<=psxRegs.GPR.n.sp&&(psxRegs.GPR.n.sp&~0xe0e00000)<RAM_SIZE) {
8019     printf("SP hack enabled (%08x), @%08x\n", psxRegs.GPR.n.sp, psxRegs.pc);
8020     sp_in_mirror=1;
8021   }
8022   if (Config.HLE && start == 0x80001000) // hlecall
8023   {
8024     // XXX: is this enough? Maybe check hleSoftCall?
8025     u_int beginning=(u_int)out;
8026     u_int page=get_page(start);
8027     invalid_code[start>>12]=0;
8028     emit_movimm(start,0);
8029     emit_writeword(0,(int)&pcaddr);
8030     emit_jmp((int)new_dyna_leave);
8031     literal_pool(0);
8032 #ifdef __arm__
8033     __clear_cache((void *)beginning,out);
8034 #endif
8035     ll_add(jump_in+page,start,(void *)beginning);
8036     return 0;
8037   }
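  // The stub emitted above just stores the block's address to pcaddr and
  // exits through new_dyna_leave so the HLE BIOS call is handled in C; it is
  // registered in jump_in like any compiled block so later jumps reuse it.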
8038   else if ((u_int)addr < 0x00200000 ||
8039     (0xa0000000 <= addr && addr < 0xa0200000)) {
8040     // used for BIOS calls mostly?
8041     source = (u_int *)((u_int)rdram+(start&0x1fffff));
8042     pagelimit = (addr&0xa0000000)|0x00200000;
8043   }
8044   else if (!Config.HLE && (
8045 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8046     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8047     // BIOS
8048     source = (u_int *)((u_int)psxR+(start&0x7ffff));
8049     pagelimit = (addr&0xfff00000)|0x80000;
8050   }
8051   else
8052 #endif
8053 #ifdef MUPEN64
8054   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
8055     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
8056     pagelimit = 0xa4001000;
8057   }
8058   else
8059 #endif
8060   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
8061     source = (u_int *)((u_int)rdram+start-0x80000000);
8062     pagelimit = 0x80000000+RAM_SIZE;
8063   }
8064 #ifndef DISABLE_TLB
8065   else if ((signed int)addr >= (signed int)0xC0000000) {
8066     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
8067     //if(tlb_LUT_r[start>>12])
8068       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
8069     if((signed int)memory_map[start>>12]>=0) {
8070       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
8071       pagelimit=(start+4096)&0xFFFFF000;
8072       int map=memory_map[start>>12];
8073       int i;
8074       for(i=0;i<5;i++) {
8075         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
8076         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
8077       }
8078       assem_debug("pagelimit=%x\n",pagelimit);
8079       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
8080     }
8081     else {
8082       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
8083       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
8084       return -1; // Caller will invoke exception handler
8085     }
8086     //printf("source= %x\n",(int)source);
8087   }
8088 #endif
8089   else {
8090     printf("Compile at bogus memory address: %x \n", (int)addr);
8091     exit(1);
8092   }
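  // At this point 'source' points at the guest code in host memory and
  // 'pagelimit' is the first guest address this block must not reach.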
8093
8094   /* Pass 1: disassemble */
8095   /* Pass 2: register dependencies, branch targets */
8096   /* Pass 3: register allocation */
8097   /* Pass 4: branch dependencies */
8098   /* Pass 5: pre-alloc */
8099   /* Pass 6: optimize clean/dirty state */
8100   /* Pass 7: flag 32-bit registers */
8101   /* Pass 8: assembly */
8102   /* Pass 9: linker */
8103   /* Pass 10: garbage collection / free memory */
8104
8105   int i,j;
8106   int done=0;
8107   unsigned int type,op,op2;
8108
8109   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8110   
8111   /* Pass 1 disassembly */
8112
8113   for(i=0;!done;i++) {
8114     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8115     minimum_free_regs[i]=0;
8116     opcode[i]=op=source[i]>>26;
8117     switch(op)
8118     {
8119       case 0x00: strcpy(insn[i],"special"); type=NI;
8120         op2=source[i]&0x3f;
8121         switch(op2)
8122         {
8123           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8124           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8125           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8126           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8127           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8128           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8129           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8130           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8131           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8132           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8133           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8134           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8135           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8136           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8137           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8138           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8139           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8140           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8141           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8142           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8143           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8144           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8145           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8146           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8147           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8148           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8149           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8150           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8151           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8152           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8153           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8154           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8155           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8156           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8157           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8158 #ifndef FORCE32
8159           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8160           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8161           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8162           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8163           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8164           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8165           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8166           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8167           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8168           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8169           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8170           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8171           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8172           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8173           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8174           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8175           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8176 #endif
8177         }
8178         break;
8179       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8180         op2=(source[i]>>16)&0x1f;
8181         switch(op2)
8182         {
8183           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8184           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8185           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8186           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8187           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8188           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8189           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8190           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8191           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8192           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8193           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8194           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8195           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8196           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8197         }
8198         break;
8199       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8200       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8201       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8202       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8203       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8204       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8205       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8206       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8207       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8208       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8209       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8210       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8211       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8212       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8213       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8214         op2=(source[i]>>21)&0x1f;
8215         switch(op2)
8216         {
8217           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8218           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8219           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8220           switch(source[i]&0x3f)
8221           {
8222             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8223             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8224             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8225             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8226 #ifdef PCSX
8227             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8228 #else
8229             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8230 #endif
8231           }
8232         }
8233         break;
8234       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8235         op2=(source[i]>>21)&0x1f;
8236         switch(op2)
8237         {
8238           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8239           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8240           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8241           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8242           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8243           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8244           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8245           switch((source[i]>>16)&0x3)
8246           {
8247             case 0x00: strcpy(insn[i],"BC1F"); break;
8248             case 0x01: strcpy(insn[i],"BC1T"); break;
8249             case 0x02: strcpy(insn[i],"BC1FL"); break;
8250             case 0x03: strcpy(insn[i],"BC1TL"); break;
8251           }
8252           break;
8253           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8254           switch(source[i]&0x3f)
8255           {
8256             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8257             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8258             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8259             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8260             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8261             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8262             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8263             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8264             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8265             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8266             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8267             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8268             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8269             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8270             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8271             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8272             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8273             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8274             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8275             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8276             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8277             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8278             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8279             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8280             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8281             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8282             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8283             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8284             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8285             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8286             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8287             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8288             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8289             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8290             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8291           }
8292           break;
8293           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8294           switch(source[i]&0x3f)
8295           {
8296             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8297             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8298             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8299             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8300             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8301             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8302             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8303             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8304             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8305             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8306             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8307             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8308             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8309             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8310             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8311             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8312             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8313             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8314             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8315             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8316             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8317             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8318             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8319             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8320             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8321             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8322             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8323             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8324             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8325             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8326             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8327             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8328             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8329             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8330             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8331           }
8332           break;
8333           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8334           switch(source[i]&0x3f)
8335           {
8336             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8337             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8338           }
8339           break;
8340           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8341           switch(source[i]&0x3f)
8342           {
8343             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8344             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8345           }
8346           break;
8347         }
8348         break;
8349 #ifndef FORCE32
8350       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8351       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8352       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8353       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8354       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8355       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8356       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8357       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8358 #endif
8359       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8360       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8361       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8362       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8363       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8364       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8365       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8366 #ifndef FORCE32
8367       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8368 #endif
8369       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8370       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8371       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8372       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8373 #ifndef FORCE32
8374       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8375       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8376 #endif
8377       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8378       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8379       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8380       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8381 #ifndef FORCE32
8382       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8383       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8384       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8385 #endif
8386       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8387       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8388 #ifndef FORCE32
8389       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8390       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8391       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8392 #endif
8393 #ifdef PCSX
8394       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8395         op2=(source[i]>>21)&0x1f;
8396         //if (op2 & 0x10) {
8397         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8398           if (gte_handlers[source[i]&0x3f]!=NULL) {
8399             if (gte_regnames[source[i]&0x3f]!=NULL)
8400               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8401             else
8402               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8403             type=C2OP;
8404           }
8405         }
8406         else switch(op2)
8407         {
8408           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8409           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8410           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8411           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8412         }
8413         break;
8414       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8415       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8416       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8417 #endif
8418       default: strcpy(insn[i],"???"); type=NI;
8419         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8420         break;
8421     }
8422     itype[i]=type;
8423     opcode2[i]=op2;
8424     /* Get registers/immediates */
8425     lt1[i]=0;
8426     us1[i]=0;
8427     us2[i]=0;
8428     dep1[i]=0;
8429     dep2[i]=0;
8430     gte_rs[i]=gte_rt[i]=0;
8431     switch(type) {
8432       case LOAD:
8433         rs1[i]=(source[i]>>21)&0x1f;
8434         rs2[i]=0;
8435         rt1[i]=(source[i]>>16)&0x1f;
8436         rt2[i]=0;
8437         imm[i]=(short)source[i];
8438         break;
8439       case STORE:
8440       case STORELR:
8441         rs1[i]=(source[i]>>21)&0x1f;
8442         rs2[i]=(source[i]>>16)&0x1f;
8443         rt1[i]=0;
8444         rt2[i]=0;
8445         imm[i]=(short)source[i];
8446         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8447         break;
8448       case LOADLR:
8449         // LWL/LWR only load part of the register,
8450         // therefore the target register must be treated as a source too
8451         rs1[i]=(source[i]>>21)&0x1f;
8452         rs2[i]=(source[i]>>16)&0x1f;
8453         rt1[i]=(source[i]>>16)&0x1f;
8454         rt2[i]=0;
8455         imm[i]=(short)source[i];
8456         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDL/LDR
8457         if(op==0x26) dep1[i]=rt1[i]; // LWR
8458         break;
8459       case IMM16:
8460         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8461         else rs1[i]=(source[i]>>21)&0x1f;
8462         rs2[i]=0;
8463         rt1[i]=(source[i]>>16)&0x1f;
8464         rt2[i]=0;
8465         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8466           imm[i]=(unsigned short)source[i];
8467         }else{
8468           imm[i]=(short)source[i];
8469         }
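        // Standard MIPS immediate handling: ANDI/ORI/XORI zero-extend the
        // 16-bit immediate, every other I-type ALU op sign-extends it.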
8470         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8471         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8472         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8473         break;
8474       case UJUMP:
8475         rs1[i]=0;
8476         rs2[i]=0;
8477         rt1[i]=0;
8478         rt2[i]=0;
8479         // The JAL instruction writes to r31.
8480         if (op&1) {
8481           rt1[i]=31;
8482         }
8483         rs2[i]=CCREG;
8484         break;
8485       case RJUMP:
8486         rs1[i]=(source[i]>>21)&0x1f;
8487         rs2[i]=0;
8488         rt1[i]=0;
8489         rt2[i]=0;
8490         // The JALR instruction writes to rd.
8491         if (op2&1) {
8492           rt1[i]=(source[i]>>11)&0x1f;
8493         }
8494         rs2[i]=CCREG;
8495         break;
8496       case CJUMP:
8497         rs1[i]=(source[i]>>21)&0x1f;
8498         rs2[i]=(source[i]>>16)&0x1f;
8499         rt1[i]=0;
8500         rt2[i]=0;
8501         if(op&2) { // BGTZ/BLEZ
8502           rs2[i]=0;
8503         }
8504         us1[i]=rs1[i];
8505         us2[i]=rs2[i];
8506         likely[i]=op>>4;
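        // Opcodes 0x14-0x17 are the branch-likely forms (BEQL/BNEL/BLEZL/BGTZL),
        // so bit 4 of the opcode doubles as the 'likely' flag here.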
8507         break;
8508       case SJUMP:
8509         rs1[i]=(source[i]>>21)&0x1f;
8510         rs2[i]=CCREG;
8511         rt1[i]=0;
8512         rt2[i]=0;
8513         us1[i]=rs1[i];
8514         if(op2&0x10) { // BxxAL
8515           rt1[i]=31;
8516           // NOTE: If the branch is not taken, r31 is still overwritten
8517         }
8518         likely[i]=(op2&2)>>1;
8519         break;
8520       case FJUMP:
8521         rs1[i]=FSREG;
8522         rs2[i]=CSREG;
8523         rt1[i]=0;
8524         rt2[i]=0;
8525         likely[i]=((source[i])>>17)&1;
8526         break;
8527       case ALU:
8528         rs1[i]=(source[i]>>21)&0x1f; // source
8529         rs2[i]=(source[i]>>16)&0x1f; // second operand
8530         rt1[i]=(source[i]>>11)&0x1f; // destination
8531         rt2[i]=0;
8532         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8533           us1[i]=rs1[i];us2[i]=rs2[i];
8534         }
8535         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8536           dep1[i]=rs1[i];dep2[i]=rs2[i];
8537         }
8538         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8539           dep1[i]=rs1[i];dep2[i]=rs2[i];
8540         }
8541         break;
8542       case MULTDIV:
8543         rs1[i]=(source[i]>>21)&0x1f; // source
8544         rs2[i]=(source[i]>>16)&0x1f; // divisor
8545         rt1[i]=HIREG;
8546         rt2[i]=LOREG;
8547         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8548           us1[i]=rs1[i];us2[i]=rs2[i];
8549         }
8550         break;
8551       case MOV:
8552         rs1[i]=0;
8553         rs2[i]=0;
8554         rt1[i]=0;
8555         rt2[i]=0;
8556         if(op2==0x10) rs1[i]=HIREG; // MFHI
8557         if(op2==0x11) rt1[i]=HIREG; // MTHI
8558         if(op2==0x12) rs1[i]=LOREG; // MFLO
8559         if(op2==0x13) rt1[i]=LOREG; // MTLO
8560         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8561         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8562         dep1[i]=rs1[i];
8563         break;
8564       case SHIFT:
8565         rs1[i]=(source[i]>>16)&0x1f; // value to be shifted
8566         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8567         rt1[i]=(source[i]>>11)&0x1f; // destination
8568         rt2[i]=0;
8569         // DSLLV/DSRLV/DSRAV are 64-bit
8570         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8571         break;
8572       case SHIFTIMM:
8573         rs1[i]=(source[i]>>16)&0x1f;
8574         rs2[i]=0;
8575         rt1[i]=(source[i]>>11)&0x1f;
8576         rt2[i]=0;
8577         imm[i]=(source[i]>>6)&0x1f;
8578         // DSxx32 instructions
8579         if(op2>=0x3c) imm[i]|=0x20;
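        // The DSxx32 forms shift by (sa+32), so fold the extra 32 into imm.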
8580         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8581         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8582         break;
8583       case COP0:
8584         rs1[i]=0;
8585         rs2[i]=0;
8586         rt1[i]=0;
8587         rt2[i]=0;
8588         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8589         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8590         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8591         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8592         break;
8593       case COP1:
8594         rs1[i]=0;
8595         rs2[i]=0;
8596         rt1[i]=0;
8597         rt2[i]=0;
8598         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8599         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8600         if(op2==5) us1[i]=rs1[i]; // DMTC1
8601         rs2[i]=CSREG;
8602         break;
8603       case COP2:
8604         rs1[i]=0;
8605         rs2[i]=0;
8606         rt1[i]=0;
8607         rt2[i]=0;
8608         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8609         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8610         rs2[i]=CSREG;
8611         int gr=(source[i]>>11)&0x1F;
8612         switch(op2)
8613         {
8614           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8615           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8616           case 0x02: gte_rs[i]=1ll<<(gr+32); // CFC2
8617             if(gr==31&&!gte_reads_flags) {
8618               assem_debug("gte flag read encountered @%08x\n",addr + i*4);
8619               gte_reads_flags=1;
8620             }
8621             break;
8622           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8623         }
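        // gte_rs/gte_rt bit layout: bits 0-31 = GTE data regs, bits 32-63 =
        // control regs; bit 63 is control reg 31, the FLAG register.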
8624         break;
8625       case C1LS:
8626         rs1[i]=(source[i]>>21)&0x1F;
8627         rs2[i]=CSREG;
8628         rt1[i]=0;
8629         rt2[i]=0;
8630         imm[i]=(short)source[i];
8631         break;
8632       case C2LS:
8633         rs1[i]=(source[i]>>21)&0x1F;
8634         rs2[i]=0;
8635         rt1[i]=0;
8636         rt2[i]=0;
8637         imm[i]=(short)source[i];
8638         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8639         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8640         break;
8641       case C2OP:
8642         rs1[i]=0;
8643         rs2[i]=0;
8644         rt1[i]=0;
8645         rt2[i]=0;
8646         gte_rt[i]=1ll<<63; // every op changes flags
8647         // TODO: other regs?
8648         break;
8649       case FLOAT:
8650       case FCONV:
8651         rs1[i]=0;
8652         rs2[i]=CSREG;
8653         rt1[i]=0;
8654         rt2[i]=0;
8655         break;
8656       case FCOMP:
8657         rs1[i]=FSREG;
8658         rs2[i]=CSREG;
8659         rt1[i]=FSREG;
8660         rt2[i]=0;
8661         break;
8662       case SYSCALL:
8663       case HLECALL:
8664       case INTCALL:
8665         rs1[i]=CCREG;
8666         rs2[i]=0;
8667         rt1[i]=0;
8668         rt2[i]=0;
8669         break;
8670       default:
8671         rs1[i]=0;
8672         rs2[i]=0;
8673         rt1[i]=0;
8674         rt2[i]=0;
8675     }
8676     /* Calculate branch target addresses */
8677     if(type==UJUMP)
8678       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8679     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8680       ba[i]=start+i*4+8; // Ignore never taken branch
8681     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8682       ba[i]=start+i*4+8; // Ignore never taken branch
8683     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8684       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8685     else ba[i]=-1;
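    // Worked example (illustrative numbers, not from any particular game):
    // a J/JAL at start+i*4 == 0x80010000 with instr_index 0x012345 targets
    //   (0x80010004 & 0xF0000000) | (0x012345 << 2) == 0x80048d14,
    // which is what (source[i]<<6)>>4 computes above.  Conditional branches
    // instead add the sign-extended 16-bit offset times 4 to the address of
    // the delay slot (start+i*4+4).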
8686 #ifdef PCSX
8687     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8688       int do_in_intrp=0;
8689       // branch in delay slot?
8690       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8691         // don't compile the first branch; call the interpreter when it's hit
8692         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8693         do_in_intrp=1;
8694       }
8695       // basic load delay detection
8696       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8697         int t=(ba[i-1]-start)/4;
8698         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8699           // jump target wants DS result - potential load delay effect
8700           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8701           do_in_intrp=1;
8702           bt[t+1]=1; // expected return from interpreter
8703         }
8704         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8705               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8706           // v0 overwrite like this is a sign of trouble, bail out
8707           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8708           do_in_intrp=1;
8709         }
8710       }
8711       if(do_in_intrp) {
8712         rs1[i-1]=CCREG;
8713         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8714         ba[i-1]=-1;
8715         itype[i-1]=INTCALL;
8716         done=2;
8717         i--; // don't compile the DS
8718       }
8719     }
8720 #endif
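    // If one of the heuristics above fired, the branch at i-1 has been turned
    // into an INTCALL: the block ends there and the interpreter runs the
    // problematic sequence; bt[] marks where compiled code should resume.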
8721     /* Is this the end of the block? */
8722     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8723       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8724         done=2;
8725       }
8726       else {
8727         if(stop_after_jal) done=1;
8728         // Stop on BREAK
8729         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8730       }
8731       // Don't recompile stuff that's already compiled
8732       if(check_addr(start+i*4+4)) done=1;
8733       // Don't get too close to the limit
8734       if(i>MAXBLOCK/2) done=1;
8735     }
8736     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8737     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8738     if(done==2) {
8739       // Does the block continue due to a branch?
8740       for(j=i-1;j>=0;j--)
8741       {
8742         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8743         if(ba[j]==start+i*4+4) done=j=0;
8744         if(ba[j]==start+i*4+8) done=j=0;
8745       }
8746     }
8747     //assert(i<MAXBLOCK-1);
8748     if(start+i*4==pagelimit-4) done=1;
8749     assert(start+i*4<pagelimit);
8750     if (i==MAXBLOCK-1) done=1;
8751     // Stop if we're compiling junk
8752     if(itype[i]==NI&&opcode[i]==0x11) {
8753       done=stop_after_jal=1;
8754       printf("Disabled speculative precompilation\n");
8755     }
8756   }
8757   slen=i;
8758   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8759     if(start+i*4==pagelimit) {
8760       itype[i-1]=SPAN;
8761     }
8762   }
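  // SPAN marks a branch sitting on the last word before pagelimit, i.e. its
  // delay slot would fall outside this block; it gets special page-spanning
  // handling later (see pagespan_alloc below).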
8763   assert(slen>0);
8764
8765   /* Pass 2 - Register dependencies and branch targets */
8766
8767   unneeded_registers(0,slen-1,0);
8768   
8769   /* Pass 3 - Register allocation */
8770
8771   struct regstat current; // Current register allocations/status
8772   current.is32=1;
8773   current.dirty=0;
8774   current.u=unneeded_reg[0];
8775   current.uu=unneeded_reg_upper[0];
8776   clear_all_regs(current.regmap);
8777   alloc_reg(&current,0,CCREG);
8778   dirty_reg(&current,CCREG);
8779   current.isconst=0;
8780   current.wasconst=0;
8781   int ds=0;
8782   int cc=0;
8783   int hr=-1;
8784
8785 #ifndef FORCE32
8786   provisional_32bit();
8787 #endif
8788   if((u_int)addr&1) {
8789     // First instruction is delay slot
8790     cc=-1;
8791     bt[1]=1;
8792     ds=1;
8793     unneeded_reg[0]=1;
8794     unneeded_reg_upper[0]=1;
8795     current.regmap[HOST_BTREG]=BTREG;
8796   }
8797   
8798   for(i=0;i<slen;i++)
8799   {
8800     if(bt[i])
8801     {
8802       int hr;
8803       for(hr=0;hr<HOST_REGS;hr++)
8804       {
8805         // Is this really necessary?
8806         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8807       }
8808       current.isconst=0;
8809     }
8810     if(i>1)
8811     {
8812       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8813       {
8814         if(rs1[i-2]==0||rs2[i-2]==0)
8815         {
8816           if(rs1[i-2]) {
8817             current.is32|=1LL<<rs1[i-2];
8818             int hr=get_reg(current.regmap,rs1[i-2]|64);
8819             if(hr>=0) current.regmap[hr]=-1;
8820           }
8821           if(rs2[i-2]) {
8822             current.is32|=1LL<<rs2[i-2];
8823             int hr=get_reg(current.regmap,rs2[i-2]|64);
8824             if(hr>=0) current.regmap[hr]=-1;
8825           }
8826         }
8827       }
8828     }
8829 #ifndef FORCE32
8830     // If something jumps here with 64-bit values
8831     // then promote those registers to 64 bits
8832     if(bt[i])
8833     {
8834       uint64_t temp_is32=current.is32;
8835       for(j=i-1;j>=0;j--)
8836       {
8837         if(ba[j]==start+i*4) 
8838           temp_is32&=branch_regs[j].is32;
8839       }
8840       for(j=i;j<slen;j++)
8841       {
8842         if(ba[j]==start+i*4) 
8843           //temp_is32=1;
8844           temp_is32&=p32[j];
8845       }
8846       if(temp_is32!=current.is32) {
8847         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8848         #ifndef DESTRUCTIVE_WRITEBACK
8849         if(ds)
8850         #endif
8851         for(hr=0;hr<HOST_REGS;hr++)
8852         {
8853           int r=current.regmap[hr];
8854           if(r>0&&r<64)
8855           {
8856             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8857               temp_is32|=1LL<<r;
8858               //printf("restore %d\n",r);
8859             }
8860           }
8861         }
8862         current.is32=temp_is32;
8863       }
8864     }
8865 #else
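    // FORCE32 (PSX) builds: every guest register always holds a 32-bit value,
    // so mark all of them 32-bit unconditionally.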
8866     current.is32=-1LL;
8867 #endif
8868
8869     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8870     regs[i].wasconst=current.isconst;
8871     regs[i].was32=current.is32;
8872     regs[i].wasdirty=current.dirty;
8873     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8874     // To change a dirty register from 32 to 64 bits, we must write
8875     // it out during the previous cycle (for branches, 2 cycles)
8876     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8877     {
8878       uint64_t temp_is32=current.is32;
8879       for(j=i-1;j>=0;j--)
8880       {
8881         if(ba[j]==start+i*4+4) 
8882           temp_is32&=branch_regs[j].is32;
8883       }
8884       for(j=i;j<slen;j++)
8885       {
8886         if(ba[j]==start+i*4+4) 
8887           //temp_is32=1;
8888           temp_is32&=p32[j];
8889       }
8890       if(temp_is32!=current.is32) {
8891         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8892         for(hr=0;hr<HOST_REGS;hr++)
8893         {
8894           int r=current.regmap[hr];
8895           if(r>0)
8896           {
8897             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8898               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8899               {
8900                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8901                 {
8902                   //printf("dump %d/r%d\n",hr,r);
8903                   current.regmap[hr]=-1;
8904                   if(get_reg(current.regmap,r|64)>=0) 
8905                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8906                 }
8907               }
8908             }
8909           }
8910         }
8911       }
8912     }
8913     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8914     {
8915       uint64_t temp_is32=current.is32;
8916       for(j=i-1;j>=0;j--)
8917       {
8918         if(ba[j]==start+i*4+8) 
8919           temp_is32&=branch_regs[j].is32;
8920       }
8921       for(j=i;j<slen;j++)
8922       {
8923         if(ba[j]==start+i*4+8) 
8924           //temp_is32=1;
8925           temp_is32&=p32[j];
8926       }
8927       if(temp_is32!=current.is32) {
8928         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8929         for(hr=0;hr<HOST_REGS;hr++)
8930         {
8931           int r=current.regmap[hr];
8932           if(r>0)
8933           {
8934             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8935               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8936               {
8937                 //printf("dump %d/r%d\n",hr,r);
8938                 current.regmap[hr]=-1;
8939                 if(get_reg(current.regmap,r|64)>=0) 
8940                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8941               }
8942             }
8943           }
8944         }
8945       }
8946     }
8947     #endif
8948     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8949       if(i+1<slen) {
8950         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8951         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8952         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8953         current.u|=1;
8954         current.uu|=1;
8955       } else {
8956         current.u=1;
8957         current.uu=1;
8958       }
8959     } else {
8960       if(i+1<slen) {
8961         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8962         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8963         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8964         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8965         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8966         current.u|=1;
8967         current.uu|=1;
8968       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8969     }
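    // current.u / current.uu are bitmaps of guest registers whose lower/upper
    // 32-bit halves are dead from here on; the instruction's own sources are
    // cleared from the mask and bit 0 stays set since $zero never needs to be
    // preserved.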
8970     is_ds[i]=ds;
8971     if(ds) {
8972       ds=0; // Skip delay slot, already allocated as part of branch
8973       // ...but we need to alloc it in case something jumps here
8974       if(i+1<slen) {
8975         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8976         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8977       }else{
8978         current.u=branch_unneeded_reg[i-1];
8979         current.uu=branch_unneeded_reg_upper[i-1];
8980       }
8981       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8982       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8983       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8984       current.u|=1;
8985       current.uu|=1;
8986       struct regstat temp;
8987       memcpy(&temp,&current,sizeof(current));
8988       temp.wasdirty=temp.dirty;
8989       temp.was32=temp.is32;
8990       // TODO: Take into account unconditional branches, as below
8991       delayslot_alloc(&temp,i);
8992       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8993       regs[i].wasdirty=temp.wasdirty;
8994       regs[i].was32=temp.was32;
8995       regs[i].dirty=temp.dirty;
8996       regs[i].is32=temp.is32;
8997       regs[i].isconst=0;
8998       regs[i].wasconst=0;
8999       current.isconst=0;
9000       // Create entry (branch target) regmap
9001       for(hr=0;hr<HOST_REGS;hr++)
9002       {
9003         int r=temp.regmap[hr];
9004         if(r>=0) {
9005           if(r!=regmap_pre[i][hr]) {
9006             regs[i].regmap_entry[hr]=-1;
9007           }
9008           else
9009           {
9010             if(r<64){
9011               if((current.u>>r)&1) {
9012                 regs[i].regmap_entry[hr]=-1;
9013                 regs[i].regmap[hr]=-1;
9014                 //Don't clear regs in the delay slot as the branch might need them
9015                 //current.regmap[hr]=-1;
9016               }else
9017                 regs[i].regmap_entry[hr]=r;
9018             }
9019             else {
9020               if((current.uu>>(r&63))&1) {
9021                 regs[i].regmap_entry[hr]=-1;
9022                 regs[i].regmap[hr]=-1;
9023                 //Don't clear regs in the delay slot as the branch might need them
9024                 //current.regmap[hr]=-1;
9025               }else
9026                 regs[i].regmap_entry[hr]=r;
9027             }
9028           }
9029         } else {
9030           // First instruction expects CCREG to be allocated
9031           if(i==0&&hr==HOST_CCREG) 
9032             regs[i].regmap_entry[hr]=CCREG;
9033           else
9034             regs[i].regmap_entry[hr]=-1;
9035         }
9036       }
9037     }
9038     else { // Not delay slot
9039       switch(itype[i]) {
9040         case UJUMP:
9041           //current.isconst=0; // DEBUG
9042           //current.wasconst=0; // DEBUG
9043           //regs[i].wasconst=0; // DEBUG
9044           clear_const(&current,rt1[i]);
9045           alloc_cc(&current,i);
9046           dirty_reg(&current,CCREG);
9047           if (rt1[i]==31) {
9048             alloc_reg(&current,i,31);
9049             dirty_reg(&current,31);
9050             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9051             //assert(rt1[i+1]!=rt1[i]);
9052             #ifdef REG_PREFETCH
9053             alloc_reg(&current,i,PTEMP);
9054             #endif
9055             //current.is32|=1LL<<rt1[i];
9056           }
9057           ooo[i]=1;
9058           delayslot_alloc(&current,i+1);
9059           //current.isconst=0; // DEBUG
9060           ds=1;
9061           //printf("i=%d, isconst=%x\n",i,current.isconst);
9062           break;
9063         case RJUMP:
9064           //current.isconst=0;
9065           //current.wasconst=0;
9066           //regs[i].wasconst=0;
9067           clear_const(&current,rs1[i]);
9068           clear_const(&current,rt1[i]);
9069           alloc_cc(&current,i);
9070           dirty_reg(&current,CCREG);
9071           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9072             alloc_reg(&current,i,rs1[i]);
9073             if (rt1[i]!=0) {
9074               alloc_reg(&current,i,rt1[i]);
9075               dirty_reg(&current,rt1[i]);
9076               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9077               assert(rt1[i+1]!=rt1[i]);
9078               #ifdef REG_PREFETCH
9079               alloc_reg(&current,i,PTEMP);
9080               #endif
9081             }
9082             #ifdef USE_MINI_HT
9083             if(rs1[i]==31) { // JALR
9084               alloc_reg(&current,i,RHASH);
9085               #ifndef HOST_IMM_ADDR32
9086               alloc_reg(&current,i,RHTBL);
9087               #endif
9088             }
9089             #endif
9090             delayslot_alloc(&current,i+1);
9091           } else {
9092             // The delay slot overwrites our source register,
9093             // so allocate a temporary register to hold the old value.
9094             current.isconst=0;
9095             current.wasconst=0;
9096             regs[i].wasconst=0;
9097             delayslot_alloc(&current,i+1);
9098             current.isconst=0;
9099             alloc_reg(&current,i,RTEMP);
9100           }
9101           //current.isconst=0; // DEBUG
9102           ooo[i]=1;
9103           ds=1;
9104           break;
9105         case CJUMP:
9106           //current.isconst=0;
9107           //current.wasconst=0;
9108           //regs[i].wasconst=0;
9109           clear_const(&current,rs1[i]);
9110           clear_const(&current,rs2[i]);
9111           if((opcode[i]&0x3E)==4) // BEQ/BNE
9112           {
9113             alloc_cc(&current,i);
9114             dirty_reg(&current,CCREG);
9115             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9116             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9117             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9118             {
9119               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9120               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9121             }
9122             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9123                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9124               // The delay slot overwrites one of our conditions.
9125               // Allocate the branch condition registers instead.
9126               current.isconst=0;
9127               current.wasconst=0;
9128               regs[i].wasconst=0;
9129               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9130               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9131               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9132               {
9133                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9134                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9135               }
9136             }
9137             else
9138             {
9139               ooo[i]=1;
9140               delayslot_alloc(&current,i+1);
9141             }
9142           }
9143           else
9144           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9145           {
9146             alloc_cc(&current,i);
9147             dirty_reg(&current,CCREG);
9148             alloc_reg(&current,i,rs1[i]);
9149             if(!(current.is32>>rs1[i]&1))
9150             {
9151               alloc_reg64(&current,i,rs1[i]);
9152             }
9153             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9154               // The delay slot overwrites one of our conditions.
9155               // Allocate the branch condition registers instead.
9156               current.isconst=0;
9157               current.wasconst=0;
9158               regs[i].wasconst=0;
9159               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9160               if(!((current.is32>>rs1[i])&1))
9161               {
9162                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9163               }
9164             }
9165             else
9166             {
9167               ooo[i]=1;
9168               delayslot_alloc(&current,i+1);
9169             }
9170           }
9171           else
9172           // Don't alloc the delay slot yet because we might not execute it
9173           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9174           {
9175             current.isconst=0;
9176             current.wasconst=0;
9177             regs[i].wasconst=0;
9178             alloc_cc(&current,i);
9179             dirty_reg(&current,CCREG);
9180             alloc_reg(&current,i,rs1[i]);
9181             alloc_reg(&current,i,rs2[i]);
9182             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9183             {
9184               alloc_reg64(&current,i,rs1[i]);
9185               alloc_reg64(&current,i,rs2[i]);
9186             }
9187           }
9188           else
9189           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9190           {
9191             current.isconst=0;
9192             current.wasconst=0;
9193             regs[i].wasconst=0;
9194             alloc_cc(&current,i);
9195             dirty_reg(&current,CCREG);
9196             alloc_reg(&current,i,rs1[i]);
9197             if(!(current.is32>>rs1[i]&1))
9198             {
9199               alloc_reg64(&current,i,rs1[i]);
9200             }
9201           }
9202           ds=1;
9203           //current.isconst=0;
9204           break;
9205         case SJUMP:
9206           //current.isconst=0;
9207           //current.wasconst=0;
9208           //regs[i].wasconst=0;
9209           clear_const(&current,rs1[i]);
9210           clear_const(&current,rt1[i]);
9211           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9212           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ/BLTZAL/BGEZAL
9213           {
9214             alloc_cc(&current,i);
9215             dirty_reg(&current,CCREG);
9216             alloc_reg(&current,i,rs1[i]);
9217             if(!(current.is32>>rs1[i]&1))
9218             {
9219               alloc_reg64(&current,i,rs1[i]);
9220             }
9221             if (rt1[i]==31) { // BLTZAL/BGEZAL
9222               alloc_reg(&current,i,31);
9223               dirty_reg(&current,31);
9224               //#ifdef REG_PREFETCH
9225               //alloc_reg(&current,i,PTEMP);
9226               //#endif
9227               //current.is32|=1LL<<rt1[i];
9228             }
9229             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9230                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9231               // Allocate the branch condition registers instead.
9232               current.isconst=0;
9233               current.wasconst=0;
9234               regs[i].wasconst=0;
9235               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9236               if(!((current.is32>>rs1[i])&1))
9237               {
9238                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9239               }
9240             }
9241             else
9242             {
9243               ooo[i]=1;
9244               delayslot_alloc(&current,i+1);
9245             }
9246           }
9247           else
9248           // Don't alloc the delay slot yet because we might not execute it
9249           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9250           {
9251             current.isconst=0;
9252             current.wasconst=0;
9253             regs[i].wasconst=0;
9254             alloc_cc(&current,i);
9255             dirty_reg(&current,CCREG);
9256             alloc_reg(&current,i,rs1[i]);
9257             if(!(current.is32>>rs1[i]&1))
9258             {
9259               alloc_reg64(&current,i,rs1[i]);
9260             }
9261           }
9262           ds=1;
9263           //current.isconst=0;
9264           break;
9265         case FJUMP:
9266           current.isconst=0;
9267           current.wasconst=0;
9268           regs[i].wasconst=0;
9269           if(likely[i]==0) // BC1F/BC1T
9270           {
9271             // TODO: Theoretically we can run out of registers here on x86.
9272             // The delay slot can allocate up to six, and we need to check
9273             // CSREG before executing the delay slot.  Possibly we can drop
9274             // the cycle count and then reload it after checking that the
9275             // FPU is in a usable state, or don't do out-of-order execution.
9276             alloc_cc(&current,i);
9277             dirty_reg(&current,CCREG);
9278             alloc_reg(&current,i,FSREG);
9279             alloc_reg(&current,i,CSREG);
9280             if(itype[i+1]==FCOMP) {
9281               // The delay slot overwrites the branch condition.
9282               // Allocate the branch condition registers instead.
9283               alloc_cc(&current,i);
9284               dirty_reg(&current,CCREG);
9285               alloc_reg(&current,i,CSREG);
9286               alloc_reg(&current,i,FSREG);
9287             }
9288             else {
9289               ooo[i]=1;
9290               delayslot_alloc(&current,i+1);
9291               alloc_reg(&current,i+1,CSREG);
9292             }
9293           }
9294           else
9295           // Don't alloc the delay slot yet because we might not execute it
9296           if(likely[i]) // BC1FL/BC1TL
9297           {
9298             alloc_cc(&current,i);
9299             dirty_reg(&current,CCREG);
9300             alloc_reg(&current,i,CSREG);
9301             alloc_reg(&current,i,FSREG);
9302           }
9303           ds=1;
9304           current.isconst=0;
9305           break;
9306         case IMM16:
9307           imm16_alloc(&current,i);
9308           break;
9309         case LOAD:
9310         case LOADLR:
9311           load_alloc(&current,i);
9312           break;
9313         case STORE:
9314         case STORELR:
9315           store_alloc(&current,i);
9316           break;
9317         case ALU:
9318           alu_alloc(&current,i);
9319           break;
9320         case SHIFT:
9321           shift_alloc(&current,i);
9322           break;
9323         case MULTDIV:
9324           multdiv_alloc(&current,i);
9325           break;
9326         case SHIFTIMM:
9327           shiftimm_alloc(&current,i);
9328           break;
9329         case MOV:
9330           mov_alloc(&current,i);
9331           break;
9332         case COP0:
9333           cop0_alloc(&current,i);
9334           break;
9335         case COP1:
9336         case COP2:
9337           cop1_alloc(&current,i);
9338           break;
9339         case C1LS:
9340           c1ls_alloc(&current,i);
9341           break;
9342         case C2LS:
9343           c2ls_alloc(&current,i);
9344           break;
9345         case C2OP:
9346           c2op_alloc(&current,i);
9347           break;
9348         case FCONV:
9349           fconv_alloc(&current,i);
9350           break;
9351         case FLOAT:
9352           float_alloc(&current,i);
9353           break;
9354         case FCOMP:
9355           fcomp_alloc(&current,i);
9356           break;
9357         case SYSCALL:
9358         case HLECALL:
9359         case INTCALL:
9360           syscall_alloc(&current,i);
9361           break;
9362         case SPAN:
9363           pagespan_alloc(&current,i);
9364           break;
9365       }
9366       
9367       // Drop the upper half of registers that have become 32-bit
9368       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9369       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9370         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9371         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9372         current.uu|=1;
9373       } else {
9374         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9375         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9376         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9377         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9378         current.uu|=1;
9379       }
9380
9381       // Create entry (branch target) regmap
9382       for(hr=0;hr<HOST_REGS;hr++)
9383       {
9384         int r,or,er;
9385         r=current.regmap[hr];
9386         if(r>=0) {
9387           if(r!=regmap_pre[i][hr]) {
9388             // TODO: delay slot (?)
9389             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9390             if(or<0||(r&63)>=TEMPREG){
9391               regs[i].regmap_entry[hr]=-1;
9392             }
9393             else
9394             {
9395               // Just move it to a different register
9396               regs[i].regmap_entry[hr]=r;
9397               // If it was dirty before, it's still dirty
9398               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9399             }
9400           }
9401           else
9402           {
9403             // Unneeded
9404             if(r==0){
9405               regs[i].regmap_entry[hr]=0;
9406             }
9407             else
9408             if(r<64){
9409               if((current.u>>r)&1) {
9410                 regs[i].regmap_entry[hr]=-1;
9411                 //regs[i].regmap[hr]=-1;
9412                 current.regmap[hr]=-1;
9413               }else
9414                 regs[i].regmap_entry[hr]=r;
9415             }
9416             else {
9417               if((current.uu>>(r&63))&1) {
9418                 regs[i].regmap_entry[hr]=-1;
9419                 //regs[i].regmap[hr]=-1;
9420                 current.regmap[hr]=-1;
9421               }else
9422                 regs[i].regmap_entry[hr]=r;
9423             }
9424           }
9425         } else {
9426           // Branches expect CCREG to be allocated at the target
9427           if(regmap_pre[i][hr]==CCREG) 
9428             regs[i].regmap_entry[hr]=CCREG;
9429           else
9430             regs[i].regmap_entry[hr]=-1;
9431         }
9432       }
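       // Roughly: regs[i].regmap_entry[] records what a branch targeting this
       // instruction may assume is already loaded in each host register, while
       // regs[i].regmap[] is the mapping after allocation for this instruction.
       // Entries are cleared (-1) when the value is unneeded or was not live on
       // entry, so a jump here reloads it rather than trusting stale contents.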
9433       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9434     }
9435     /* Branch post-alloc */
9436     if(i>0)
9437     {
9438       current.was32=current.is32;
9439       current.wasdirty=current.dirty;
9440       switch(itype[i-1]) {
9441         case UJUMP:
9442           memcpy(&branch_regs[i-1],&current,sizeof(current));
9443           branch_regs[i-1].isconst=0;
9444           branch_regs[i-1].wasconst=0;
9445           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9446           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9447           alloc_cc(&branch_regs[i-1],i-1);
9448           dirty_reg(&branch_regs[i-1],CCREG);
9449           if(rt1[i-1]==31) { // JAL
9450             alloc_reg(&branch_regs[i-1],i-1,31);
9451             dirty_reg(&branch_regs[i-1],31);
9452             branch_regs[i-1].is32|=1LL<<31;
9453           }
9454           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9455           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9456           break;
9457         case RJUMP:
9458           memcpy(&branch_regs[i-1],&current,sizeof(current));
9459           branch_regs[i-1].isconst=0;
9460           branch_regs[i-1].wasconst=0;
9461           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9462           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9463           alloc_cc(&branch_regs[i-1],i-1);
9464           dirty_reg(&branch_regs[i-1],CCREG);
9465           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9466           if(rt1[i-1]!=0) { // JALR
9467             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9468             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9469             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9470           }
9471           #ifdef USE_MINI_HT
9472           if(rs1[i-1]==31) { // JALR
9473             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9474             #ifndef HOST_IMM_ADDR32
9475             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9476             #endif
9477           }
9478           #endif
9479           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9480           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9481           break;
9482         case CJUMP:
9483           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9484           {
9485             alloc_cc(&current,i-1);
9486             dirty_reg(&current,CCREG);
9487             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9488                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9489               // The delay slot overwrote one of our conditions
9490               // Delay slot goes after the test (in order)
9491               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9492               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9493               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9494               current.u|=1;
9495               current.uu|=1;
9496               delayslot_alloc(&current,i);
9497               current.isconst=0;
9498             }
9499             else
9500             {
9501               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9502               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9503               // Alloc the branch condition registers
9504               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9505               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9506               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9507               {
9508                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9509                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9510               }
9511             }
9512             memcpy(&branch_regs[i-1],&current,sizeof(current));
9513             branch_regs[i-1].isconst=0;
9514             branch_regs[i-1].wasconst=0;
9515             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9516             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9517           }
9518           else
9519           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9520           {
9521             alloc_cc(&current,i-1);
9522             dirty_reg(&current,CCREG);
9523             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9524               // The delay slot overwrote the branch condition
9525               // Delay slot goes after the test (in order)
9526               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9527               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9528               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9529               current.u|=1;
9530               current.uu|=1;
9531               delayslot_alloc(&current,i);
9532               current.isconst=0;
9533             }
9534             else
9535             {
9536               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9537               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9538               // Alloc the branch condition register
9539               alloc_reg(&current,i-1,rs1[i-1]);
9540               if(!(current.is32>>rs1[i-1]&1))
9541               {
9542                 alloc_reg64(&current,i-1,rs1[i-1]);
9543               }
9544             }
9545             memcpy(&branch_regs[i-1],&current,sizeof(current));
9546             branch_regs[i-1].isconst=0;
9547             branch_regs[i-1].wasconst=0;
9548             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9549             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9550           }
9551           else
9552           // Alloc the delay slot in case the branch is taken
9553           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9554           {
9555             memcpy(&branch_regs[i-1],&current,sizeof(current));
9556             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9557             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9558             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9559             alloc_cc(&branch_regs[i-1],i);
9560             dirty_reg(&branch_regs[i-1],CCREG);
9561             delayslot_alloc(&branch_regs[i-1],i);
9562             branch_regs[i-1].isconst=0;
9563             alloc_reg(&current,i,CCREG); // Not taken path
9564             dirty_reg(&current,CCREG);
9565             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9566           }
9567           else
9568           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9569           {
9570             memcpy(&branch_regs[i-1],&current,sizeof(current));
9571             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9572             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9573             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9574             alloc_cc(&branch_regs[i-1],i);
9575             dirty_reg(&branch_regs[i-1],CCREG);
9576             delayslot_alloc(&branch_regs[i-1],i);
9577             branch_regs[i-1].isconst=0;
9578             alloc_reg(&current,i,CCREG); // Not taken path
9579             dirty_reg(&current,CCREG);
9580             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9581           }
9582           break;
9583         case SJUMP:
9584           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9585           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9586           {
9587             alloc_cc(&current,i-1);
9588             dirty_reg(&current,CCREG);
9589             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9590               // The delay slot overwrote the branch condition
9591               // Delay slot goes after the test (in order)
9592               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9593               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9594               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9595               current.u|=1;
9596               current.uu|=1;
9597               delayslot_alloc(&current,i);
9598               current.isconst=0;
9599             }
9600             else
9601             {
9602               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9603               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9604               // Alloc the branch condition register
9605               alloc_reg(&current,i-1,rs1[i-1]);
9606               if(!(current.is32>>rs1[i-1]&1))
9607               {
9608                 alloc_reg64(&current,i-1,rs1[i-1]);
9609               }
9610             }
9611             memcpy(&branch_regs[i-1],&current,sizeof(current));
9612             branch_regs[i-1].isconst=0;
9613             branch_regs[i-1].wasconst=0;
9614             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9615             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9616           }
9617           else
9618           // Alloc the delay slot in case the branch is taken
9619           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9620           {
9621             memcpy(&branch_regs[i-1],&current,sizeof(current));
9622             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9623             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9624             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9625             alloc_cc(&branch_regs[i-1],i);
9626             dirty_reg(&branch_regs[i-1],CCREG);
9627             delayslot_alloc(&branch_regs[i-1],i);
9628             branch_regs[i-1].isconst=0;
9629             alloc_reg(&current,i,CCREG); // Not taken path
9630             dirty_reg(&current,CCREG);
9631             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9632           }
9633           // FIXME: BLTZAL/BGEZAL
9634           if(opcode2[i-1]&0x10) { // BxxZAL
9635             alloc_reg(&branch_regs[i-1],i-1,31);
9636             dirty_reg(&branch_regs[i-1],31);
9637             branch_regs[i-1].is32|=1LL<<31;
9638           }
9639           break;
9640         case FJUMP:
9641           if(likely[i-1]==0) // BC1F/BC1T
9642           {
9643             alloc_cc(&current,i-1);
9644             dirty_reg(&current,CCREG);
9645             if(itype[i]==FCOMP) {
9646               // The delay slot overwrote the branch condition
9647               // Delay slot goes after the test (in order)
9648               delayslot_alloc(&current,i);
9649               current.isconst=0;
9650             }
9651             else
9652             {
9653               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9654               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9655               // Alloc the branch condition register
9656               alloc_reg(&current,i-1,FSREG);
9657             }
9658             memcpy(&branch_regs[i-1],&current,sizeof(current));
9659             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9660           }
9661           else // BC1FL/BC1TL
9662           {
9663             // Alloc the delay slot in case the branch is taken
9664             memcpy(&branch_regs[i-1],&current,sizeof(current));
9665             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9666             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9667             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9668             alloc_cc(&branch_regs[i-1],i);
9669             dirty_reg(&branch_regs[i-1],CCREG);
9670             delayslot_alloc(&branch_regs[i-1],i);
9671             branch_regs[i-1].isconst=0;
9672             alloc_reg(&current,i,CCREG); // Not taken path
9673             dirty_reg(&current,CCREG);
9674             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9675           }
9676           break;
9677       }
9678
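      // The block below sets up 'current' for the instruction that follows the
      // delay slot (virtual address start+i*4+4).  After a call (JAL/JALR
      // writing r31) execution returns here from a subroutine, so nothing can
      // be assumed allocated; after any other unconditional jump this point is
      // only reachable as an internal branch target, so try to match the
      // register map of the branch(es) that jump here.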
9679       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9680       {
9681         if(rt1[i-1]==31) // JAL/JALR
9682         {
9683           // Subroutine call will return here, don't alloc any registers
9684           current.is32=1;
9685           current.dirty=0;
9686           clear_all_regs(current.regmap);
9687           alloc_reg(&current,i,CCREG);
9688           dirty_reg(&current,CCREG);
9689         }
9690         else if(i+1<slen)
9691         {
9692           // Internal branch will jump here, match registers to caller
9693           current.is32=0x3FFFFFFFFLL;
9694           current.dirty=0;
9695           clear_all_regs(current.regmap);
9696           alloc_reg(&current,i,CCREG);
9697           dirty_reg(&current,CCREG);
9698           for(j=i-1;j>=0;j--)
9699           {
9700             if(ba[j]==start+i*4+4) {
9701               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9702               current.is32=branch_regs[j].is32;
9703               current.dirty=branch_regs[j].dirty;
9704               break;
9705             }
9706           }
9707           while(j>=0) {
9708             if(ba[j]==start+i*4+4) {
9709               for(hr=0;hr<HOST_REGS;hr++) {
9710                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9711                   current.regmap[hr]=-1;
9712                 }
9713                 current.is32&=branch_regs[j].is32;
9714                 current.dirty&=branch_regs[j].dirty;
9715               }
9716             }
9717             j--;
9718           }
9719         }
9720       }
9721     }
9722
9723     // Count cycles in between branches
9724     ccadj[i]=cc;
9725     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9726     {
9727       cc=0;
9728     }
9729 #ifdef PCSX
9730     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9731     {
9732       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9733     }
9734     else if(itype[i]==C2LS)
9735     {
9736       cc+=4;
9737     }
9738 #endif
9739     else
9740     {
9741       cc++;
9742     }
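    // Worked example (PCSX build): once cc has been reset after a branch, a
    // following ADDIU / LW / SW sequence records ccadj[] = 0, 1, 2 and leaves
    // cc at 4; ordinary instructions and loads count one cycle here, while
    // stores/C1LS take a 2-cycle penalty and C2LS a 4-cycle penalty.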
9743
9744     flush_dirty_uppers(&current);
9745     if(!is_ds[i]) {
9746       regs[i].is32=current.is32;
9747       regs[i].dirty=current.dirty;
9748       regs[i].isconst=current.isconst;
9749       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9750     }
9751     for(hr=0;hr<HOST_REGS;hr++) {
9752       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9753         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9754           regs[i].wasconst&=~(1<<hr);
9755         }
9756       }
9757     }
9758     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9759   }
9760   
9761   /* Pass 4 - Cull unused host registers */
9762   
9763   uint64_t nr=0;
9764   
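  // nr is a bitmask over host registers: bit hr set means the value currently
  // held in host register hr is still needed at or after instruction i.
  // e.g. "nr|=1<<hr" marks hr as needed because it holds a source register,
  // and "nr&=~(1<<hr)" clears it once the guest register it holds has been
  // overwritten or is otherwise no longer needed.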
9765   for (i=slen-1;i>=0;i--)
9766   {
9767     int hr;
9768     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9769     {
9770       if(ba[i]<start || ba[i]>=(start+slen*4))
9771       {
9772         // Branch out of this block, don't need anything
9773         nr=0;
9774       }
9775       else
9776       {
9777         // Internal branch
9778         // Need whatever matches the target
9779         nr=0;
9780         int t=(ba[i]-start)>>2;
9781         for(hr=0;hr<HOST_REGS;hr++)
9782         {
9783           if(regs[i].regmap_entry[hr]>=0) {
9784             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9785           }
9786         }
9787       }
9788       // Conditional branch may need registers for following instructions
9789       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9790       {
9791         if(i<slen-2) {
9792           nr|=needed_reg[i+2];
9793           for(hr=0;hr<HOST_REGS;hr++)
9794           {
9795             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9796             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9797           }
9798         }
9799       }
9800       // Don't need stuff which is overwritten
9801       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9802       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9803       // Merge in delay slot
9804       for(hr=0;hr<HOST_REGS;hr++)
9805       {
9806         if(!likely[i]) {
9807           // These are overwritten unless the branch is "likely",
9808           // in which case the delay slot is nullified if not taken
9809           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9810           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9811         }
9812         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9813         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9814         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9815         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9816         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9817         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9818         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9819         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9820         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9821           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9822           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9823         }
9824         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9825           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9826           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9827         }
9828         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9829           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9830           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9831         }
9832       }
9833     }
9834     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9835     {
9836       // SYSCALL instruction (software interrupt)
9837       nr=0;
9838     }
9839     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9840     {
9841       // ERET instruction (return from interrupt)
9842       nr=0;
9843     }
9844     else // Non-branch
9845     {
9846       if(i<slen-1) {
9847         for(hr=0;hr<HOST_REGS;hr++) {
9848           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9849           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9850           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9851           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9852         }
9853       }
9854     }
9855     for(hr=0;hr<HOST_REGS;hr++)
9856     {
9857       // Overwritten registers are not needed
9858       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9859       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9860       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9861       // Source registers are needed
9862       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9863       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9864       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9865       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9866       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9867       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9868       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9869       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9870       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9871         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9872         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9873       }
9874       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9875         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9876         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9877       }
9878       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9879         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9880         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9881       }
9882       // Don't store a register immediately after writing it,
9883       // as that may prevent dual-issue.
9884       // But do so if this is a branch target, otherwise we
9885       // might have to load the register before the branch.
9886       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9887         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9888            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9889           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9890           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9891         }
9892         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9893            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9894           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9895           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9896         }
9897       }
9898     }
9899     // Cycle count is needed at branches.  Assume it is needed at the target too.
9900     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9901       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9902       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9903     }
9904     // Save it
9905     needed_reg[i]=nr;
9906     
9907     // Deallocate unneeded registers
9908     for(hr=0;hr<HOST_REGS;hr++)
9909     {
9910       if(!((nr>>hr)&1)) {
9911         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9912         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9913            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9914            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9915         {
9916           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9917           {
9918             if(likely[i]) {
9919               regs[i].regmap[hr]=-1;
9920               regs[i].isconst&=~(1<<hr);
9921               if(i<slen-2) {
9922                 regmap_pre[i+2][hr]=-1;
9923                 regs[i+2].wasconst&=~(1<<hr);
9924               }
9925             }
9926           }
9927         }
9928         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9929         {
9930           int d1=0,d2=0,map=0,temp=0;
9931           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9932           {
9933             d1=dep1[i+1];
9934             d2=dep2[i+1];
9935           }
9936           if(using_tlb) {
9937             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9938                itype[i+1]==STORE || itype[i+1]==STORELR ||
9939                itype[i+1]==C1LS || itype[i+1]==C2LS)
9940             map=TLREG;
9941           } else
9942           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9943              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9944             map=INVCP;
9945           }
9946           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9947              itype[i+1]==C1LS || itype[i+1]==C2LS)
9948             temp=FTEMP;
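           // Roughly: only free this host register if it holds nothing that
           // the branch or its delay slot still needs.  Note the encoding used
           // below: regmap values >=64 denote the upper 32 bits of guest
           // register (value&63), so "&63" strips the upper-half flag and
           // "^64" toggles between a register and its upper-half counterpart.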
9949           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9950              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9951              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9952              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9953              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9954              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9955              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9956              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9957              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9958              regs[i].regmap[hr]!=map )
9959           {
9960             regs[i].regmap[hr]=-1;
9961             regs[i].isconst&=~(1<<hr);
9962             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9963                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9964                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9965                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9966                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9967                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9968                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9969                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9970                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9971                branch_regs[i].regmap[hr]!=map)
9972             {
9973               branch_regs[i].regmap[hr]=-1;
9974               branch_regs[i].regmap_entry[hr]=-1;
9975               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9976               {
9977                 if(!likely[i]&&i<slen-2) {
9978                   regmap_pre[i+2][hr]=-1;
9979                   regs[i+2].wasconst&=~(1<<hr);
9980                 }
9981               }
9982             }
9983           }
9984         }
9985         else
9986         {
9987           // Non-branch
9988           if(i>0)
9989           {
9990             int d1=0,d2=0,map=-1,temp=-1;
9991             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9992             {
9993               d1=dep1[i];
9994               d2=dep2[i];
9995             }
9996             if(using_tlb) {
9997               if(itype[i]==LOAD || itype[i]==LOADLR ||
9998                  itype[i]==STORE || itype[i]==STORELR ||
9999                  itype[i]==C1LS || itype[i]==C2LS)
10000               map=TLREG;
10001             } else if(itype[i]==STORE || itype[i]==STORELR ||
10002                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10003               map=INVCP;
10004             }
10005             if(itype[i]==LOADLR || itype[i]==STORELR ||
10006                itype[i]==C1LS || itype[i]==C2LS)
10007               temp=FTEMP;
10008             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10009                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10010                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10011                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10012                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10013                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10014             {
10015               if(i<slen-1&&!is_ds[i]) {
10016                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10017                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10018                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10019                 {
10020                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10021                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10022                 }
10023                 regmap_pre[i+1][hr]=-1;
10024                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10025                 regs[i+1].wasconst&=~(1<<hr);
10026               }
10027               regs[i].regmap[hr]=-1;
10028               regs[i].isconst&=~(1<<hr);
10029             }
10030           }
10031         }
10032       }
10033     }
10034   }
10035   
10036   /* Pass 5 - Pre-allocate registers */
10037   
10038   // If a register is allocated during a loop, try to allocate it for the
10039   // entire loop, if possible.  This avoids loading/storing registers
10040   // inside the loop.
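  // Illustrative (hypothetical) guest loop:
  //     loop: lw    r2,0(r4)
  //           addiu r4,r4,4
  //           bne   r4,r5,loop
  //           nop
  // If r4 and r5 stay in the same host registers from the backward branch all
  // the way back to its target, each iteration avoids reloading and spilling
  // them.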
10041   
10042   signed char f_regmap[HOST_REGS];
10043   clear_all_regs(f_regmap);
10044   for(i=0;i<slen-1;i++)
10045   {
10046     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10047     {
10048       if(ba[i]>=start && ba[i]<(start+i*4)) 
10049       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10050       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10051       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10052       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10053       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10054       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10055       {
10056         int t=(ba[i]-start)>>2;
10057         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10058         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10059         for(hr=0;hr<HOST_REGS;hr++)
10060         {
10061           if(regs[i].regmap[hr]>64) {
10062             if(!((regs[i].dirty>>hr)&1))
10063               f_regmap[hr]=regs[i].regmap[hr];
10064             else f_regmap[hr]=-1;
10065           }
10066           else if(regs[i].regmap[hr]>=0) {
10067             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10068               // dealloc old register
10069               int n;
10070               for(n=0;n<HOST_REGS;n++)
10071               {
10072                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10073               }
10074               // and alloc new one
10075               f_regmap[hr]=regs[i].regmap[hr];
10076             }
10077           }
10078           if(branch_regs[i].regmap[hr]>64) {
10079             if(!((branch_regs[i].dirty>>hr)&1))
10080               f_regmap[hr]=branch_regs[i].regmap[hr];
10081             else f_regmap[hr]=-1;
10082           }
10083           else if(branch_regs[i].regmap[hr]>=0) {
10084             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10085               // dealloc old register
10086               int n;
10087               for(n=0;n<HOST_REGS;n++)
10088               {
10089                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10090               }
10091               // and alloc new one
10092               f_regmap[hr]=branch_regs[i].regmap[hr];
10093             }
10094           }
10095           if(ooo[i]) {
10096             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10097               f_regmap[hr]=branch_regs[i].regmap[hr];
10098           }else{
10099             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10100               f_regmap[hr]=branch_regs[i].regmap[hr];
10101           }
10102           // Avoid dirty->clean transition
10103           #ifdef DESTRUCTIVE_WRITEBACK
10104           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10105           #endif
10106           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10107           // case above, but it's always a good idea.  We can't hoist the
10108           // load if the register was already allocated, so there's no point
10109           // wasting time analyzing most of these cases.  It only "succeeds"
10110           // when the mapping was different and the load can be replaced with
10111           // a mov, which is of negligible benefit.  So such cases are
10112           // skipped below.
10113           if(f_regmap[hr]>0) {
10114             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10115               int r=f_regmap[hr];
10116               for(j=t;j<=i;j++)
10117               {
10118                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10119                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10120                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10121                 if(r>63) {
10122                   // NB This can exclude the case where the upper-half
10123                   // register is lower numbered than the lower-half
10124                   // register.  Not sure if it's worth fixing...
10125                   if(get_reg(regs[j].regmap,r&63)<0) break;
10126                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10127                   if(regs[j].is32&(1LL<<(r&63))) break;
10128                 }
10129                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10130                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10131                   int k;
10132                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10133                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10134                     if(r>63) {
10135                       if(get_reg(regs[i].regmap,r&63)<0) break;
10136                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10137                     }
10138                     k=i;
10139                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10140                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10141                         //printf("no free regs for store %x\n",start+(k-1)*4);
10142                         break;
10143                       }
10144                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10145                         //printf("no-match due to different register\n");
10146                         break;
10147                       }
10148                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10149                         //printf("no-match due to branch\n");
10150                         break;
10151                       }
10152                       // call/ret fast path assumes no registers allocated
10153                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10154                         break;
10155                       }
10156                       if(r>63) {
10157                         // NB This can exclude the case where the upper-half
10158                         // register is lower numbered than the lower-half
10159                         // register.  Not sure if it's worth fixing...
10160                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10161                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10162                       }
10163                       k--;
10164                     }
10165                     if(i<slen-1) {
10166                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10167                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10168                         //printf("bad match after branch\n");
10169                         break;
10170                       }
10171                     }
10172                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10173                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10174                       while(k<i) {
10175                         regs[k].regmap_entry[hr]=f_regmap[hr];
10176                         regs[k].regmap[hr]=f_regmap[hr];
10177                         regmap_pre[k+1][hr]=f_regmap[hr];
10178                         regs[k].wasdirty&=~(1<<hr);
10179                         regs[k].dirty&=~(1<<hr);
10180                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10181                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10182                         regs[k].wasconst&=~(1<<hr);
10183                         regs[k].isconst&=~(1<<hr);
10184                         k++;
10185                       }
10186                     }
10187                     else {
10188                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10189                       break;
10190                     }
10191                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10192                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10193                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10194                       regs[i].regmap_entry[hr]=f_regmap[hr];
10195                       regs[i].regmap[hr]=f_regmap[hr];
10196                       regs[i].wasdirty&=~(1<<hr);
10197                       regs[i].dirty&=~(1<<hr);
10198                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10199                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10200                       regs[i].wasconst&=~(1<<hr);
10201                       regs[i].isconst&=~(1<<hr);
10202                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10203                       branch_regs[i].wasdirty&=~(1<<hr);
10204                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10205                       branch_regs[i].regmap[hr]=f_regmap[hr];
10206                       branch_regs[i].dirty&=~(1<<hr);
10207                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10208                       branch_regs[i].wasconst&=~(1<<hr);
10209                       branch_regs[i].isconst&=~(1<<hr);
10210                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10211                         regmap_pre[i+2][hr]=f_regmap[hr];
10212                         regs[i+2].wasdirty&=~(1<<hr);
10213                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10214                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10215                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10216                       }
10217                     }
10218                   }
10219                   for(k=t;k<j;k++) {
10220                     // Alloc register clean at beginning of loop,
10221                     // but may dirty it in pass 6
10222                     regs[k].regmap_entry[hr]=f_regmap[hr];
10223                     regs[k].regmap[hr]=f_regmap[hr];
10224                     regs[k].dirty&=~(1<<hr);
10225                     regs[k].wasconst&=~(1<<hr);
10226                     regs[k].isconst&=~(1<<hr);
10227                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10228                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10229                       branch_regs[k].regmap[hr]=f_regmap[hr];
10230                       branch_regs[k].dirty&=~(1<<hr);
10231                       branch_regs[k].wasconst&=~(1<<hr);
10232                       branch_regs[k].isconst&=~(1<<hr);
10233                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10234                         regmap_pre[k+2][hr]=f_regmap[hr];
10235                         regs[k+2].wasdirty&=~(1<<hr);
10236                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10237                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10238                       }
10239                     }
10240                     else
10241                     {
10242                       regmap_pre[k+1][hr]=f_regmap[hr];
10243                       regs[k+1].wasdirty&=~(1<<hr);
10244                     }
10245                   }
10246                   if(regs[j].regmap[hr]==f_regmap[hr])
10247                     regs[j].regmap_entry[hr]=f_regmap[hr];
10248                   break;
10249                 }
10250                 if(j==i) break;
10251                 if(regs[j].regmap[hr]>=0)
10252                   break;
10253                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10254                   //printf("no-match due to different register\n");
10255                   break;
10256                 }
10257                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10258                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10259                   break;
10260                 }
10261                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10262                 {
10263                   // Stop on unconditional branch
10264                   break;
10265                 }
10266                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10267                 {
10268                   if(ooo[j]) {
10269                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10270                       break;
10271                   }else{
10272                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10273                       break;
10274                   }
10275                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10276                     //printf("no-match due to different register (branch)\n");
10277                     break;
10278                   }
10279                 }
10280                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10281                   //printf("No free regs for store %x\n",start+j*4);
10282                   break;
10283                 }
10284                 if(f_regmap[hr]>=64) {
10285                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10286                     break;
10287                   }
10288                   else
10289                   {
10290                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10291                       break;
10292                     }
10293                   }
10294                 }
10295               }
10296             }
10297           }
10298         }
10299       }
10300     }else{
10301       // Non branch or undetermined branch target
10302       for(hr=0;hr<HOST_REGS;hr++)
10303       {
10304         if(hr!=EXCLUDE_REG) {
10305           if(regs[i].regmap[hr]>64) {
10306             if(!((regs[i].dirty>>hr)&1))
10307               f_regmap[hr]=regs[i].regmap[hr];
10308           }
10309           else if(regs[i].regmap[hr]>=0) {
10310             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10311               // dealloc old register
10312               int n;
10313               for(n=0;n<HOST_REGS;n++)
10314               {
10315                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10316               }
10317               // and alloc new one
10318               f_regmap[hr]=regs[i].regmap[hr];
10319             }
10320           }
10321         }
10322       }
10323       // Try to restore cycle count at branch targets
10324       if(bt[i]) {
10325         for(j=i;j<slen-1;j++) {
10326           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10327           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10328             //printf("no free regs for store %x\n",start+j*4);
10329             break;
10330           }
10331         }
10332         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10333           int k=i;
10334           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10335           while(k<j) {
10336             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10337             regs[k].regmap[HOST_CCREG]=CCREG;
10338             regmap_pre[k+1][HOST_CCREG]=CCREG;
10339             regs[k+1].wasdirty|=1<<HOST_CCREG;
10340             regs[k].dirty|=1<<HOST_CCREG;
10341             regs[k].wasconst&=~(1<<HOST_CCREG);
10342             regs[k].isconst&=~(1<<HOST_CCREG);
10343             k++;
10344           }
10345           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10346         }
10347         // Work backwards from the branch target
10348         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10349         {
10350           //printf("Extend backwards\n");
10351           int k;
10352           k=i;
10353           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10354             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10355               //printf("no free regs for store %x\n",start+(k-1)*4);
10356               break;
10357             }
10358             k--;
10359           }
10360           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10361             //printf("Extend CC, %x ->\n",start+k*4);
10362             while(k<=i) {
10363               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10364               regs[k].regmap[HOST_CCREG]=CCREG;
10365               regmap_pre[k+1][HOST_CCREG]=CCREG;
10366               regs[k+1].wasdirty|=1<<HOST_CCREG;
10367               regs[k].dirty|=1<<HOST_CCREG;
10368               regs[k].wasconst&=~(1<<HOST_CCREG);
10369               regs[k].isconst&=~(1<<HOST_CCREG);
10370               k++;
10371             }
10372           }
10373           else {
10374             //printf("Fail Extend CC, %x ->\n",start+k*4);
10375           }
10376         }
10377       }
10378       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10379          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10380          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10381          itype[i]!=FCONV&&itype[i]!=FCOMP)
10382       {
10383         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10384       }
10385     }
10386   }
10387   
10388   // Cache memory offset or tlb map pointer if a register is available
10389   #ifndef HOST_IMM_ADDR32
10390   #ifndef RAM_OFFSET
10391   if(using_tlb)
10392   #endif
10393   {
10394     int earliest_available[HOST_REGS];
10395     int loop_start[HOST_REGS];
10396     int score[HOST_REGS];
10397     int end[HOST_REGS];
10398     int reg=using_tlb?MMREG:ROREG;
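    // Roughly: score[hr] counts how many upcoming memory accesses could use a
    // cached copy of the TLB map pointer (MMREG) or RAM offset (ROREG) held in
    // host register hr, earliest_available[hr] is the first instruction at
    // which hr becomes free, and loop_start[hr]/end[hr] bound the range over
    // which the best-scoring register is eventually allocated.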
10399
10400     // Init
10401     for(hr=0;hr<HOST_REGS;hr++) {
10402       score[hr]=0;earliest_available[hr]=0;
10403       loop_start[hr]=MAXBLOCK;
10404     }
10405     for(i=0;i<slen-1;i++)
10406     {
10407       // Can't do anything if no registers are available
10408       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10409         for(hr=0;hr<HOST_REGS;hr++) {
10410           score[hr]=0;earliest_available[hr]=i+1;
10411           loop_start[hr]=MAXBLOCK;
10412         }
10413       }
10414       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10415         if(!ooo[i]) {
10416           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10417             for(hr=0;hr<HOST_REGS;hr++) {
10418               score[hr]=0;earliest_available[hr]=i+1;
10419               loop_start[hr]=MAXBLOCK;
10420             }
10421           }
10422         }else{
10423           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10424             for(hr=0;hr<HOST_REGS;hr++) {
10425               score[hr]=0;earliest_available[hr]=i+1;
10426               loop_start[hr]=MAXBLOCK;
10427             }
10428           }
10429         }
10430       }
10431       // Mark unavailable registers
10432       for(hr=0;hr<HOST_REGS;hr++) {
10433         if(regs[i].regmap[hr]>=0) {
10434           score[hr]=0;earliest_available[hr]=i+1;
10435           loop_start[hr]=MAXBLOCK;
10436         }
10437         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10438           if(branch_regs[i].regmap[hr]>=0) {
10439             score[hr]=0;earliest_available[hr]=i+2;
10440             loop_start[hr]=MAXBLOCK;
10441           }
10442         }
10443       }
10444       // No register allocations after unconditional jumps
10445       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10446       {
10447         for(hr=0;hr<HOST_REGS;hr++) {
10448           score[hr]=0;earliest_available[hr]=i+2;
10449           loop_start[hr]=MAXBLOCK;
10450         }
10451         i++; // Skip delay slot too
10452         //printf("skip delay slot: %x\n",start+i*4);
10453       }
10454       else
10455       // Possible match
10456       if(itype[i]==LOAD||itype[i]==LOADLR||
10457          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10458         for(hr=0;hr<HOST_REGS;hr++) {
10459           if(hr!=EXCLUDE_REG) {
10460             end[hr]=i-1;
10461             for(j=i;j<slen-1;j++) {
10462               if(regs[j].regmap[hr]>=0) break;
10463               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10464                 if(branch_regs[j].regmap[hr]>=0) break;
10465                 if(ooo[j]) {
10466                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10467                 }else{
10468                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10469                 }
10470               }
10471               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10472               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10473                 int t=(ba[j]-start)>>2;
10474                 if(t<j&&t>=earliest_available[hr]) {
10475                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10476                     // Score a point for hoisting loop invariant
10477                     if(t<loop_start[hr]) loop_start[hr]=t;
10478                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10479                     score[hr]++;
10480                     end[hr]=j;
10481                   }
10482                 }
10483                 else if(t<j) {
10484                   if(regs[t].regmap[hr]==reg) {
10485                     // Score a point if the branch target matches this register
10486                     score[hr]++;
10487                     end[hr]=j;
10488                   }
10489                 }
10490                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10491                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10492                   score[hr]++;
10493                   end[hr]=j;
10494                 }
10495               }
10496               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10497               {
10498                 // Stop on unconditional branch
10499                 break;
10500               }
10501               else
10502               if(itype[j]==LOAD||itype[j]==LOADLR||
10503                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10504                 score[hr]++;
10505                 end[hr]=j;
10506               }
10507             }
10508           }
10509         }
10510         // Find highest score and allocate that register
10511         int maxscore=0;
10512         for(hr=0;hr<HOST_REGS;hr++) {
10513           if(hr!=EXCLUDE_REG) {
10514             if(score[hr]>score[maxscore]) {
10515               maxscore=hr;
10516               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10517             }
10518           }
10519         }
10520         if(score[maxscore]>1)
10521         {
10522           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10523           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10524             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10525             assert(regs[j].regmap[maxscore]<0);
10526             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10527             regs[j].regmap[maxscore]=reg;
10528             regs[j].dirty&=~(1<<maxscore);
10529             regs[j].wasconst&=~(1<<maxscore);
10530             regs[j].isconst&=~(1<<maxscore);
10531             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10532               branch_regs[j].regmap[maxscore]=reg;
10533               branch_regs[j].wasdirty&=~(1<<maxscore);
10534               branch_regs[j].dirty&=~(1<<maxscore);
10535               branch_regs[j].wasconst&=~(1<<maxscore);
10536               branch_regs[j].isconst&=~(1<<maxscore);
10537               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10538                 regmap_pre[j+2][maxscore]=reg;
10539                 regs[j+2].wasdirty&=~(1<<maxscore);
10540               }
10541               // loop optimization (loop_preload)
10542               int t=(ba[j]-start)>>2;
10543               if(t==loop_start[maxscore]) {
10544                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10545                   regs[t].regmap_entry[maxscore]=reg;
10546               }
10547             }
10548             else
10549             {
10550               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10551                 regmap_pre[j+1][maxscore]=reg;
10552                 regs[j+1].wasdirty&=~(1<<maxscore);
10553               }
10554             }
10555           }
10556           i=j-1;
10557           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10558           for(hr=0;hr<HOST_REGS;hr++) {
10559             score[hr]=0;earliest_available[hr]=i+1;
10560             loop_start[hr]=MAXBLOCK;
10561           }
10562         }
10563       }
10564     }
10565   }
10566   #endif
10567   
10568   // This allocates registers (if possible) one instruction prior
10569   // to use, which can avoid a load-use penalty on certain CPUs.
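  // e.g. if instruction i+1 reads guest register rs1 via host register hr and
  // hr is unused at instruction i, the mapping is established at i instead, so
  // the load from the in-memory register file is issued one instruction early
  // and its result is ready when i+1 uses it.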
10570   for(i=0;i<slen-1;i++)
10571   {
10572     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10573     {
10574       if(!bt[i+1])
10575       {
10576         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10577            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10578         {
10579           if(rs1[i+1]) {
10580             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10581             {
10582               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10583               {
10584                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10585                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10586                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10587                 regs[i].isconst&=~(1<<hr);
10588                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10589                 constmap[i][hr]=constmap[i+1][hr];
10590                 regs[i+1].wasdirty&=~(1<<hr);
10591                 regs[i].dirty&=~(1<<hr);
10592               }
10593             }
10594           }
10595           if(rs2[i+1]) {
10596             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10597             {
10598               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10599               {
10600                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10601                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10602                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10603                 regs[i].isconst&=~(1<<hr);
10604                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10605                 constmap[i][hr]=constmap[i+1][hr];
10606                 regs[i+1].wasdirty&=~(1<<hr);
10607                 regs[i].dirty&=~(1<<hr);
10608               }
10609             }
10610           }
10611           // Preload target address for load instruction (non-constant)
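          // If the load's base register has no host register allocated but
          // its destination does, borrow the destination's host register one
          // slot early to hold the base address.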
10612           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10613             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10614             {
10615               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10616               {
10617                 regs[i].regmap[hr]=rs1[i+1];
10618                 regmap_pre[i+1][hr]=rs1[i+1];
10619                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10620                 regs[i].isconst&=~(1<<hr);
10621                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10622                 constmap[i][hr]=constmap[i+1][hr];
10623                 regs[i+1].wasdirty&=~(1<<hr);
10624                 regs[i].dirty&=~(1<<hr);
10625               }
10626             }
10627           }
10628           // Load source into target register 
10629           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10630             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10631             {
10632               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10633               {
10634                 regs[i].regmap[hr]=rs1[i+1];
10635                 regmap_pre[i+1][hr]=rs1[i+1];
10636                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10637                 regs[i].isconst&=~(1<<hr);
10638                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10639                 constmap[i][hr]=constmap[i+1][hr];
10640                 regs[i+1].wasdirty&=~(1<<hr);
10641                 regs[i].dirty&=~(1<<hr);
10642               }
10643             }
10644           }
10645           // Preload map address
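          // TLREG caches the memory-map pointer used for address translation;
          // MGEN1/MGEN2 appear to be scratch ids (alternating between even and
          // odd slots) that let the map lookup be precomputed one instruction
          // early when the base address is already known to be constant.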
10646           #ifndef HOST_IMM_ADDR32
10647           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10648             hr=get_reg(regs[i+1].regmap,TLREG);
10649             if(hr>=0) {
10650               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10651               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10652                 int nr;
10653                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10654                 {
10655                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10656                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10657                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10658                   regs[i].isconst&=~(1<<hr);
10659                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10660                   constmap[i][hr]=constmap[i+1][hr];
10661                   regs[i+1].wasdirty&=~(1<<hr);
10662                   regs[i].dirty&=~(1<<hr);
10663                 }
10664                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10665                 {
10666                   // move it to another register
10667                   regs[i+1].regmap[hr]=-1;
10668                   regmap_pre[i+2][hr]=-1;
10669                   regs[i+1].regmap[nr]=TLREG;
10670                   regmap_pre[i+2][nr]=TLREG;
10671                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10672                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10673                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10674                   regs[i].isconst&=~(1<<nr);
10675                   regs[i+1].isconst&=~(1<<nr);
10676                   regs[i].dirty&=~(1<<nr);
10677                   regs[i+1].wasdirty&=~(1<<nr);
10678                   regs[i+1].dirty&=~(1<<nr);
10679                   regs[i+2].wasdirty&=~(1<<nr);
10680                 }
10681               }
10682             }
10683           }
10684           #endif
10685           // Address for store instruction (non-constant)
10686           if(itype[i+1]==STORE||itype[i+1]==STORELR
10687              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10688             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10689               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10690               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10691               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10692               assert(hr>=0);
10693               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10694               {
10695                 regs[i].regmap[hr]=rs1[i+1];
10696                 regmap_pre[i+1][hr]=rs1[i+1];
10697                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10698                 regs[i].isconst&=~(1<<hr);
10699                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10700                 constmap[i][hr]=constmap[i+1][hr];
10701                 regs[i+1].wasdirty&=~(1<<hr);
10702                 regs[i].dirty&=~(1<<hr);
10703               }
10704             }
10705           }
10706           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10707             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10708               int nr;
10709               hr=get_reg(regs[i+1].regmap,FTEMP);
10710               assert(hr>=0);
10711               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10712               {
10713                 regs[i].regmap[hr]=rs1[i+1];
10714                 regmap_pre[i+1][hr]=rs1[i+1];
10715                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10716                 regs[i].isconst&=~(1<<hr);
10717                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10718                 constmap[i][hr]=constmap[i+1][hr];
10719                 regs[i+1].wasdirty&=~(1<<hr);
10720                 regs[i].dirty&=~(1<<hr);
10721               }
10722               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10723               {
10724                 // move it to another register
10725                 regs[i+1].regmap[hr]=-1;
10726                 regmap_pre[i+2][hr]=-1;
10727                 regs[i+1].regmap[nr]=FTEMP;
10728                 regmap_pre[i+2][nr]=FTEMP;
10729                 regs[i].regmap[nr]=rs1[i+1];
10730                 regmap_pre[i+1][nr]=rs1[i+1];
10731                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10732                 regs[i].isconst&=~(1<<nr);
10733                 regs[i+1].isconst&=~(1<<nr);
10734                 regs[i].dirty&=~(1<<nr);
10735                 regs[i+1].wasdirty&=~(1<<nr);
10736                 regs[i+1].dirty&=~(1<<nr);
10737                 regs[i+2].wasdirty&=~(1<<nr);
10738               }
10739             }
10740           }
10741           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10742             if(itype[i+1]==LOAD) 
10743               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10744             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10745               hr=get_reg(regs[i+1].regmap,FTEMP);
10746             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10747               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10748               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10749             }
10750             if(hr>=0&&regs[i].regmap[hr]<0) {
10751               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10752               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10753                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10754                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10755                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10756                 regs[i].isconst&=~(1<<hr);
10757                 regs[i+1].wasdirty&=~(1<<hr);
10758                 regs[i].dirty&=~(1<<hr);
10759               }
10760             }
10761           }
10762         }
10763       }
10764     }
10765   }
10766   
10767   /* Pass 6 - Optimize clean/dirty state */
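  // clean_registers() propagates dirty state through the block so registers
  // whose in-memory copy is already up to date are treated as clean and are
  // not written back again at branch points.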
10768   clean_registers(0,slen-1,1);
10769   
10770   /* Pass 7 - Identify 32-bit registers */
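  // On the 64-bit core this walks the block backwards computing
  // requires_32bit[]: which registers must hold properly sign-extended upper
  // halves at each instruction.  The FORCE32 (32-bit PSX CPU) build skips the
  // analysis and only marks the return points after conditional branches as
  // branch targets.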
10771 #ifndef FORCE32
10772   provisional_r32();
10773
10774   u_int r32=0;
10775   
10776   for (i=slen-1;i>=0;i--)
10777   {
10778     int hr;
10779     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10780     {
10781       if(ba[i]<start || ba[i]>=(start+slen*4))
10782       {
10783         // Branch out of this block, don't need anything
10784         r32=0;
10785       }
10786       else
10787       {
10788         // Internal branch
10789         // Need whatever matches the target
10790         // (and doesn't get overwritten by the delay slot instruction)
10791         r32=0;
10792         int t=(ba[i]-start)>>2;
10793         if(ba[i]>start+i*4) {
10794           // Forward branch
10795           if(!(requires_32bit[t]&~regs[i].was32))
10796             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10797         }else{
10798           // Backward branch
10799           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10800           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10801           if(!(pr32[t]&~regs[i].was32))
10802             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10803         }
10804       }
10805       // Conditional branch may need registers for following instructions
10806       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10807       {
10808         if(i<slen-2) {
10809           r32|=requires_32bit[i+2];
10810           r32&=regs[i].was32;
10811           // Mark this address as a branch target since it may be called
10812           // upon return from interrupt
10813           bt[i+2]=1;
10814         }
10815       }
10816       // Merge in delay slot
10817       if(!likely[i]) {
10818         // These are overwritten unless the branch is "likely"
10819         // and the delay slot is nullified if not taken
10820         r32&=~(1LL<<rt1[i+1]);
10821         r32&=~(1LL<<rt2[i+1]);
10822       }
10823       // Assume these are needed (delay slot)
10824       if(us1[i+1]>0)
10825       {
10826         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10827       }
10828       if(us2[i+1]>0)
10829       {
10830         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10831       }
10832       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10833       {
10834         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10835       }
10836       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10837       {
10838         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10839       }
10840     }
10841     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10842     {
10843       // SYSCALL instruction (software interrupt)
10844       r32=0;
10845     }
10846     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10847     {
10848       // ERET instruction (return from interrupt)
10849       r32=0;
10850     }
10851     // Check 32 bits
10852     r32&=~(1LL<<rt1[i]);
10853     r32&=~(1LL<<rt2[i]);
10854     if(us1[i]>0)
10855     {
10856       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10857     }
10858     if(us2[i]>0)
10859     {
10860       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10861     }
10862     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10863     {
10864       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10865     }
10866     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10867     {
10868       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10869     }
10870     requires_32bit[i]=r32;
10871     
10872     // Dirty registers which are 32-bit require 32-bit input,
10873     // as they will be written as 32-bit values
10874     for(hr=0;hr<HOST_REGS;hr++)
10875     {
10876       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10877         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10878           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10879           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10880         }
10881       }
10882     }
10883     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10884   }
10885 #else
10886   for (i=slen-1;i>=0;i--)
10887   {
10888     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10889     {
10890       // Conditional branch
10891       if((source[i]>>16)!=0x1000&&i<slen-2) {
10892         // Mark this address as a branch target since it may be called
10893         // upon return from interrupt
10894         bt[i+2]=1;
10895       }
10896     }
10897   }
10898 #endif
10899
10900   if(itype[slen-1]==SPAN) {
10901     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10902   }
10903
10904 #ifdef DISASM
10905   /* Debug/disassembly */
10906   for(i=0;i<slen;i++)
10907   {
10908     printf("U:");
10909     int r;
10910     for(r=1;r<=CCREG;r++) {
10911       if((unneeded_reg[i]>>r)&1) {
10912         if(r==HIREG) printf(" HI");
10913         else if(r==LOREG) printf(" LO");
10914         else printf(" r%d",r);
10915       }
10916     }
10917 #ifndef FORCE32
10918     printf(" UU:");
10919     for(r=1;r<=CCREG;r++) {
10920       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10921         if(r==HIREG) printf(" HI");
10922         else if(r==LOREG) printf(" LO");
10923         else printf(" r%d",r);
10924       }
10925     }
10926     printf(" 32:");
10927     for(r=0;r<=CCREG;r++) {
10928       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10929       if((regs[i].was32>>r)&1) {
10930         if(r==CCREG) printf(" CC");
10931         else if(r==HIREG) printf(" HI");
10932         else if(r==LOREG) printf(" LO");
10933         else printf(" r%d",r);
10934       }
10935     }
10936 #endif
10937     printf("\n");
10938     #if defined(__i386__) || defined(__x86_64__)
10939     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10940     #endif
10941     #ifdef __arm__
10942     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10943     #endif
10944     printf("needs: ");
10945     if(needed_reg[i]&1) printf("eax ");
10946     if((needed_reg[i]>>1)&1) printf("ecx ");
10947     if((needed_reg[i]>>2)&1) printf("edx ");
10948     if((needed_reg[i]>>3)&1) printf("ebx ");
10949     if((needed_reg[i]>>5)&1) printf("ebp ");
10950     if((needed_reg[i]>>6)&1) printf("esi ");
10951     if((needed_reg[i]>>7)&1) printf("edi ");
10952     printf("r:");
10953     for(r=0;r<=CCREG;r++) {
10954       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10955       if((requires_32bit[i]>>r)&1) {
10956         if(r==CCREG) printf(" CC");
10957         else if(r==HIREG) printf(" HI");
10958         else if(r==LOREG) printf(" LO");
10959         else printf(" r%d",r);
10960       }
10961     }
10962     printf("\n");
10963     /*printf("pr:");
10964     for(r=0;r<=CCREG;r++) {
10965       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10966       if((pr32[i]>>r)&1) {
10967         if(r==CCREG) printf(" CC");
10968         else if(r==HIREG) printf(" HI");
10969         else if(r==LOREG) printf(" LO");
10970         else printf(" r%d",r);
10971       }
10972     }
10973     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10974     printf("\n");*/
10975     #if defined(__i386__) || defined(__x86_64__)
10976     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10977     printf("dirty: ");
10978     if(regs[i].wasdirty&1) printf("eax ");
10979     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10980     if((regs[i].wasdirty>>2)&1) printf("edx ");
10981     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10982     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10983     if((regs[i].wasdirty>>6)&1) printf("esi ");
10984     if((regs[i].wasdirty>>7)&1) printf("edi ");
10985     #endif
10986     #ifdef __arm__
10987     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10988     printf("dirty: ");
10989     if(regs[i].wasdirty&1) printf("r0 ");
10990     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10991     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10992     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10993     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10994     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10995     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10996     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10997     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10998     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10999     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11000     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11001     #endif
11002     printf("\n");
11003     disassemble_inst(i);
11004     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11005     #if defined(__i386__) || defined(__x86_64__)
11006     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11007     if(regs[i].dirty&1) printf("eax ");
11008     if((regs[i].dirty>>1)&1) printf("ecx ");
11009     if((regs[i].dirty>>2)&1) printf("edx ");
11010     if((regs[i].dirty>>3)&1) printf("ebx ");
11011     if((regs[i].dirty>>5)&1) printf("ebp ");
11012     if((regs[i].dirty>>6)&1) printf("esi ");
11013     if((regs[i].dirty>>7)&1) printf("edi ");
11014     #endif
11015     #ifdef __arm__
11016     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11017     if(regs[i].dirty&1) printf("r0 ");
11018     if((regs[i].dirty>>1)&1) printf("r1 ");
11019     if((regs[i].dirty>>2)&1) printf("r2 ");
11020     if((regs[i].dirty>>3)&1) printf("r3 ");
11021     if((regs[i].dirty>>4)&1) printf("r4 ");
11022     if((regs[i].dirty>>5)&1) printf("r5 ");
11023     if((regs[i].dirty>>6)&1) printf("r6 ");
11024     if((regs[i].dirty>>7)&1) printf("r7 ");
11025     if((regs[i].dirty>>8)&1) printf("r8 ");
11026     if((regs[i].dirty>>9)&1) printf("r9 ");
11027     if((regs[i].dirty>>10)&1) printf("r10 ");
11028     if((regs[i].dirty>>12)&1) printf("r12 ");
11029     #endif
11030     printf("\n");
11031     if(regs[i].isconst) {
11032       printf("constants: ");
11033       #if defined(__i386__) || defined(__x86_64__)
11034       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11035       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11036       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11037       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11038       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11039       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11040       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11041       #endif
11042       #ifdef __arm__
11043       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11044       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11045       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11046       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11047       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11048       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11049       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11050       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11051       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11052       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11053       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11054       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11055       #endif
11056       printf("\n");
11057     }
11058 #ifndef FORCE32
11059     printf(" 32:");
11060     for(r=0;r<=CCREG;r++) {
11061       if((regs[i].is32>>r)&1) {
11062         if(r==CCREG) printf(" CC");
11063         else if(r==HIREG) printf(" HI");
11064         else if(r==LOREG) printf(" LO");
11065         else printf(" r%d",r);
11066       }
11067     }
11068     printf("\n");
11069 #endif
11070     /*printf(" p32:");
11071     for(r=0;r<=CCREG;r++) {
11072       if((p32[i]>>r)&1) {
11073         if(r==CCREG) printf(" CC");
11074         else if(r==HIREG) printf(" HI");
11075         else if(r==LOREG) printf(" LO");
11076         else printf(" r%d",r);
11077       }
11078     }
11079     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11080     else printf("\n");*/
11081     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11082       #if defined(__i386__) || defined(__x86_64__)
11083       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11084       if(branch_regs[i].dirty&1) printf("eax ");
11085       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11086       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11087       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11088       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11089       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11090       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11091       #endif
11092       #ifdef __arm__
11093       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11094       if(branch_regs[i].dirty&1) printf("r0 ");
11095       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11096       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11097       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11098       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11099       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11100       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11101       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11102       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11103       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11104       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11105       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11106       #endif
11107 #ifndef FORCE32
11108       printf(" 32:");
11109       for(r=0;r<=CCREG;r++) {
11110         if((branch_regs[i].is32>>r)&1) {
11111           if(r==CCREG) printf(" CC");
11112           else if(r==HIREG) printf(" HI");
11113           else if(r==LOREG) printf(" LO");
11114           else printf(" r%d",r);
11115         }
11116       }
11117       printf("\n");
11118 #endif
11119     }
11120   }
11121 #endif // DISASM
11122
11123   /* Pass 8 - Assembly */
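  // For each instruction: write back or invalidate cached values the entry
  // state does not expect, record the entry point in instr_addr[], load the
  // registers this instruction (and any delay slot) needs, then dispatch to
  // the per-type assembler.  Branch assemblers emit their own delay slot and
  // set ds so the next iteration skips it.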
11124   linkcount=0;stubcount=0;
11125   ds=0;is_delayslot=0;
11126   cop1_usable=0;
11127   uint64_t is32_pre=0;
11128   u_int dirty_pre=0;
11129   u_int beginning=(u_int)out;
11130   if((u_int)addr&1) {
11131     ds=1;
11132     pagespan_ds();
11133   }
11134   u_int instr_addr0_override=0;
11135
11136 #ifdef PCSX
11137   if (start == 0x80030000) {
11138     // nasty hack for fastbios thing
11139     // override block entry to this code
11140     instr_addr0_override=(u_int)out;
11141     emit_movimm(start,0);
11142     // abuse io address var as a flag that we
11143     // have already returned here once
11144     emit_readword((int)&address,1);
11145     emit_writeword(0,(int)&pcaddr);
11146     emit_writeword(0,(int)&address);
11147     emit_cmp(0,1);
11148     emit_jne((int)new_dyna_leave);
11149   }
11150 #endif
11151   for(i=0;i<slen;i++)
11152   {
11153     //if(ds) printf("ds: ");
11154     disassemble_inst(i);
11155     if(ds) {
11156       ds=0; // Skip delay slot
11157       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11158       instr_addr[i]=0;
11159     } else {
11160       #ifndef DESTRUCTIVE_WRITEBACK
11161       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11162       {
11163         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11164               unneeded_reg[i],unneeded_reg_upper[i]);
11165         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11166               unneeded_reg[i],unneeded_reg_upper[i]);
11167       }
11168       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11169         is32_pre=branch_regs[i].is32;
11170         dirty_pre=branch_regs[i].dirty;
11171       }else{
11172         is32_pre=regs[i].is32;
11173         dirty_pre=regs[i].dirty;
11174       }
11175       #endif
11176       // write back
11177       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11178       {
11179         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11180                       unneeded_reg[i],unneeded_reg_upper[i]);
11181         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11182       }
11183       // branch target entry point
11184       instr_addr[i]=(u_int)out;
11185       assem_debug("<->\n");
11186       // load regs
11187       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11188         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11189       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11190       address_generation(i,&regs[i],regs[i].regmap_entry);
11191       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11192       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11193       {
11194         // Load the delay slot registers if necessary
11195         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11196           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11197         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11198           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11199         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11200           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11201       }
11202       else if(i+1<slen)
11203       {
11204         // Preload registers for following instruction
11205         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11206           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11207             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11208         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11209           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11210             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11211       }
11212       // TODO: if(is_ooo(i)) address_generation(i+1);
11213       if(itype[i]==CJUMP||itype[i]==FJUMP)
11214         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11215       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11216         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11217       if(bt[i]) cop1_usable=0;
11218       // assemble
11219       switch(itype[i]) {
11220         case ALU:
11221           alu_assemble(i,&regs[i]);break;
11222         case IMM16:
11223           imm16_assemble(i,&regs[i]);break;
11224         case SHIFT:
11225           shift_assemble(i,&regs[i]);break;
11226         case SHIFTIMM:
11227           shiftimm_assemble(i,&regs[i]);break;
11228         case LOAD:
11229           load_assemble(i,&regs[i]);break;
11230         case LOADLR:
11231           loadlr_assemble(i,&regs[i]);break;
11232         case STORE:
11233           store_assemble(i,&regs[i]);break;
11234         case STORELR:
11235           storelr_assemble(i,&regs[i]);break;
11236         case COP0:
11237           cop0_assemble(i,&regs[i]);break;
11238         case COP1:
11239           cop1_assemble(i,&regs[i]);break;
11240         case C1LS:
11241           c1ls_assemble(i,&regs[i]);break;
11242         case COP2:
11243           cop2_assemble(i,&regs[i]);break;
11244         case C2LS:
11245           c2ls_assemble(i,&regs[i]);break;
11246         case C2OP:
11247           c2op_assemble(i,&regs[i]);break;
11248         case FCONV:
11249           fconv_assemble(i,&regs[i]);break;
11250         case FLOAT:
11251           float_assemble(i,&regs[i]);break;
11252         case FCOMP:
11253           fcomp_assemble(i,&regs[i]);break;
11254         case MULTDIV:
11255           multdiv_assemble(i,&regs[i]);break;
11256         case MOV:
11257           mov_assemble(i,&regs[i]);break;
11258         case SYSCALL:
11259           syscall_assemble(i,&regs[i]);break;
11260         case HLECALL:
11261           hlecall_assemble(i,&regs[i]);break;
11262         case INTCALL:
11263           intcall_assemble(i,&regs[i]);break;
11264         case UJUMP:
11265           ujump_assemble(i,&regs[i]);ds=1;break;
11266         case RJUMP:
11267           rjump_assemble(i,&regs[i]);ds=1;break;
11268         case CJUMP:
11269           cjump_assemble(i,&regs[i]);ds=1;break;
11270         case SJUMP:
11271           sjump_assemble(i,&regs[i]);ds=1;break;
11272         case FJUMP:
11273           fjump_assemble(i,&regs[i]);ds=1;break;
11274         case SPAN:
11275           pagespan_assemble(i,&regs[i]);break;
11276       }
11277       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11278         literal_pool(1024);
11279       else
11280         literal_pool_jumpover(256);
11281     }
11282   }
11283   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11284   // If the block did not end with an unconditional branch,
11285   // add a jump to the next instruction.
11286   if(i>1) {
11287     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11288       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11289       assert(i==slen);
11290       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11291         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11292         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11293           emit_loadreg(CCREG,HOST_CCREG);
11294         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11295       }
11296       else if(!likely[i-2])
11297       {
11298         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11299         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11300       }
11301       else
11302       {
11303         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11304         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11305       }
11306       add_to_linker((int)out,start+i*4,0);
11307       emit_jmp(0);
11308     }
11309   }
11310   else
11311   {
11312     assert(i>0);
11313     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11314     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11315     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11316       emit_loadreg(CCREG,HOST_CCREG);
11317     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11318     add_to_linker((int)out,start+i*4,0);
11319     emit_jmp(0);
11320   }
11321
11322   // TODO: delay slot stubs?
11323   // Stubs
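  // Out-of-line slow paths emitted after the main code: memory read/write
  // fallbacks, cycle-count checks (CC_STUB), self-modifying-code checks
  // (INVCODE_STUB), coprocessor-unusable exceptions (FP_STUB) and unaligned
  // store handling (STORELR_STUB).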
11324   for(i=0;i<stubcount;i++)
11325   {
11326     switch(stubs[i][0])
11327     {
11328       case LOADB_STUB:
11329       case LOADH_STUB:
11330       case LOADW_STUB:
11331       case LOADD_STUB:
11332       case LOADBU_STUB:
11333       case LOADHU_STUB:
11334         do_readstub(i);break;
11335       case STOREB_STUB:
11336       case STOREH_STUB:
11337       case STOREW_STUB:
11338       case STORED_STUB:
11339         do_writestub(i);break;
11340       case CC_STUB:
11341         do_ccstub(i);break;
11342       case INVCODE_STUB:
11343         do_invstub(i);break;
11344       case FP_STUB:
11345         do_cop1stub(i);break;
11346       case STORELR_STUB:
11347         do_unalignedwritestub(i);break;
11348     }
11349   }
11350
11351   if (instr_addr0_override)
11352     instr_addr[0] = instr_addr0_override;
11353
11354   /* Pass 9 - Linker */
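  // Resolve the branches recorded by add_to_linker().  Internal branches are
  // patched straight to instr_addr[target]; external branches get an
  // emit_extjump trampoline, and if the destination block is already compiled
  // (check_addr) the branch is pointed at it directly and registered with
  // add_link so it can be unlinked when the target is invalidated.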
11355   for(i=0;i<linkcount;i++)
11356   {
11357     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11358     literal_pool(64);
11359     if(!link_addr[i][2])
11360     {
11361       void *stub=out;
11362       void *addr=check_addr(link_addr[i][1]);
11363       emit_extjump(link_addr[i][0],link_addr[i][1]);
11364       if(addr) {
11365         set_jump_target(link_addr[i][0],(int)addr);
11366         add_link(link_addr[i][1],stub);
11367       }
11368       else set_jump_target(link_addr[i][0],(int)stub);
11369     }
11370     else
11371     {
11372       // Internal branch
11373       int target=(link_addr[i][1]-start)>>2;
11374       assert(target>=0&&target<slen);
11375       assert(instr_addr[target]);
11376       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11377       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11378       //#else
11379       set_jump_target(link_addr[i][0],instr_addr[target]);
11380       //#endif
11381     }
11382   }
11383   // External Branch Targets (jump_in)
11384   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11385   for(i=0;i<slen;i++)
11386   {
11387     if(bt[i]||i==0)
11388     {
11389       if(instr_addr[i]) // TODO - delay slots (=null)
11390       {
11391         u_int vaddr=start+i*4;
11392         u_int page=get_page(vaddr);
11393         u_int vpage=get_vpage(vaddr);
11394         literal_pool(256);
11395         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11396 #ifndef FORCE32
11397         if(!requires_32bit[i])
11398 #else
11399         if(1)
11400 #endif
11401         {
11402           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11403           assem_debug("jump_in: %x\n",start+i*4);
11404           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11405           int entry_point=do_dirty_stub(i);
11406           ll_add(jump_in+page,vaddr,(void *)entry_point);
11407           // If there was an existing entry in the hash table,
11408           // replace it with the new address.
11409           // Don't add new entries.  We'll insert the
11410           // ones that actually get used in check_addr().
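          // Each hash bin holds two (vaddr, entry point) pairs:
          // ht_bin[0]/[1] is the most recent, ht_bin[2]/[3] the older one.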
11411           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11412           if(ht_bin[0]==vaddr) {
11413             ht_bin[1]=entry_point;
11414           }
11415           if(ht_bin[2]==vaddr) {
11416             ht_bin[3]=entry_point;
11417           }
11418         }
11419         else
11420         {
11421           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11422           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11423           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11424           //int entry_point=(int)out;
11425           ////assem_debug("entry_point: %x\n",entry_point);
11426           //load_regs_entry(i);
11427           //if(entry_point==(int)out)
11428           //  entry_point=instr_addr[i];
11429           //else
11430           //  emit_jmp(instr_addr[i]);
11431           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11432           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11433           int entry_point=do_dirty_stub(i);
11434           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11435         }
11436       }
11437     }
11438   }
11439   // Write out the literal pool if necessary
11440   literal_pool(0);
11441   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11442   // Align code
11443   if(((u_int)out)&7) emit_addnop(13);
11444   #endif
11445   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11446   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
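  // Keep a copy of the source MIPS code in the shadow buffer; the dirty
  // stubs compare against it later to detect whether the original code was
  // overwritten before re-entering this block.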
11447   memcpy(copy,source,slen*4);
11448   copy+=slen*4;
11449   
11450   #ifdef __arm__
11451   __clear_cache((void *)beginning,out);
11452   #endif
11453   
11454   // If we're within 256K of the end of the buffer,
11455   // start over from the beginning. (Is 256K enough?)
11456   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11457   
11458   // Trap writes to any of the pages we compiled
11459   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11460     invalid_code[i]=0;
11461 #ifndef DISABLE_TLB
11462     memory_map[i]|=0x40000000;
11463     if((signed int)start>=(signed int)0xC0000000) {
11464       assert(using_tlb);
11465       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11466       invalid_code[j]=0;
11467       memory_map[j]|=0x40000000;
11468       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11469     }
11470 #endif
11471   }
11472   inv_code_start=inv_code_end=~0;
11473 #ifdef PCSX
11474   // for PCSX we need to mark all mirrors too
11475   if(get_page(start)<(RAM_SIZE>>12))
11476     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11477       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11478       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11479       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11480 #endif
11481   
11482   /* Pass 10 - Free memory by expiring oldest blocks */
11483   
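  // The translation cache is treated as a ring buffer split into 8 regions.
  // expirep is advanced until it sits two regions (a quarter of the cache)
  // ahead of the output pointer, and each step runs one of four phases that
  // sweep the jump_in/jump_dirty lists, jump_out pointers, hash table and
  // jump_out lists for entries in the region about to be reused.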
11484   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11485   while(expirep!=end)
11486   {
11487     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11488     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11489     inv_debug("EXP: Phase %d\n",expirep);
11490     switch((expirep>>11)&3)
11491     {
11492       case 0:
11493         // Clear jump_in and jump_dirty
11494         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11495         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11496         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11497         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11498         break;
11499       case 1:
11500         // Clear pointers
11501         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11502         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11503         break;
11504       case 2:
11505         // Clear hash table
11506         for(i=0;i<32;i++) {
11507           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11508           if((ht_bin[3]>>shift)==(base>>shift) ||
11509              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11510             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11511             ht_bin[2]=ht_bin[3]=-1;
11512           }
11513           if((ht_bin[1]>>shift)==(base>>shift) ||
11514              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11515             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11516             ht_bin[0]=ht_bin[2];
11517             ht_bin[1]=ht_bin[3];
11518             ht_bin[2]=ht_bin[3]=-1;
11519           }
11520         }
11521         break;
11522       case 3:
11523         // Clear jump_out
11524         #ifdef __arm__
11525         if((expirep&2047)==0) 
11526           do_clear_cache();
11527         #endif
11528         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11529         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11530         break;
11531     }
11532     expirep=(expirep+1)&65535;
11533   }
11534   return 0;
11535 }
11536
11537 // vim:shiftwidth=2:expandtab