libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <sys/mman.h>
25
26 #include "emu_if.h" //emulator interface
27
28 //#define DISASM
29 //#define assem_debug printf
30 //#define inv_debug printf
31 #define assem_debug(...)
32 #define inv_debug(...)
33
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43
44 #define MAXBLOCK 4096
45 #define MAX_OUTPUT_BLOCK_SIZE 262144
46 #define CLOCK_DIVIDER 2
47
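// Per-instruction register allocation state. Field meanings (inferred from use
// elsewhere in this file): regmap[hr] holds the guest register currently mapped
// to host register hr (-1 = free); was32/is32 mark guest registers whose values
// are sign-extended 32-bit; wasdirty/dirty mark host registers holding values not
// yet written back; u/uu mark guest registers (lower/upper halves) whose values
// are no longer needed; isconst/constmap track constant propagation per host register.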
48 struct regstat
49 {
50   signed char regmap_entry[HOST_REGS];
51   signed char regmap[HOST_REGS];
52   uint64_t was32;
53   uint64_t is32;
54   uint64_t wasdirty;
55   uint64_t dirty;
56   uint64_t u;
57   uint64_t uu;
58   u_int wasconst;
59   u_int isconst;
60   uint64_t constmap[HOST_REGS];
61 };
62
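// Node in the per-page jump_in/jump_out/jump_dirty lists, mapping a guest virtual
// address to its compiled block. reg32 appears to be the set of guest registers the
// block was compiled assuming to be 32-bit; it is always 0 in the FORCE32 (PCSX) build.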
63 struct ll_entry
64 {
65   u_int vaddr;
66   u_int reg32;
67   void *addr;
68   struct ll_entry *next;
69 };
70
71   u_int start;
72   u_int *source;
73   u_int pagelimit;
74   char insn[MAXBLOCK][10];
75   u_char itype[MAXBLOCK];
76   u_char opcode[MAXBLOCK];
77   u_char opcode2[MAXBLOCK];
78   u_char bt[MAXBLOCK];
79   u_char rs1[MAXBLOCK];
80   u_char rs2[MAXBLOCK];
81   u_char rt1[MAXBLOCK];
82   u_char rt2[MAXBLOCK];
83   u_char us1[MAXBLOCK];
84   u_char us2[MAXBLOCK];
85   u_char dep1[MAXBLOCK];
86   u_char dep2[MAXBLOCK];
87   u_char lt1[MAXBLOCK];
88   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
89   static uint64_t gte_rt[MAXBLOCK];
90   static uint64_t gte_unneeded[MAXBLOCK];
91   static int gte_reads_flags; // gte flag read encountered
92   static u_int smrv[32]; // speculated MIPS register values
93   static u_int smrv_strong; // mask of regs that are likely to have correct values
94   static u_int smrv_weak; // same, but somewhat less likely
95   static u_int smrv_strong_next; // same, but after current insn executes
96   static u_int smrv_weak_next;
97   int imm[MAXBLOCK];
98   u_int ba[MAXBLOCK];
99   char likely[MAXBLOCK];
100   char is_ds[MAXBLOCK];
101   char ooo[MAXBLOCK];
102   uint64_t unneeded_reg[MAXBLOCK];
103   uint64_t unneeded_reg_upper[MAXBLOCK];
104   uint64_t branch_unneeded_reg[MAXBLOCK];
105   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
106   uint64_t p32[MAXBLOCK];
107   uint64_t pr32[MAXBLOCK];
108   signed char regmap_pre[MAXBLOCK][HOST_REGS];
109   signed char regmap[MAXBLOCK][HOST_REGS];
110   signed char regmap_entry[MAXBLOCK][HOST_REGS];
111   uint64_t constmap[MAXBLOCK][HOST_REGS];
112   struct regstat regs[MAXBLOCK];
113   struct regstat branch_regs[MAXBLOCK];
114   signed char minimum_free_regs[MAXBLOCK];
115   u_int needed_reg[MAXBLOCK];
116   uint64_t requires_32bit[MAXBLOCK];
117   u_int wont_dirty[MAXBLOCK];
118   u_int will_dirty[MAXBLOCK];
119   int ccadj[MAXBLOCK];
120   int slen;
121   u_int instr_addr[MAXBLOCK];
122   u_int link_addr[MAXBLOCK][3];
123   int linkcount;
124   u_int stubs[MAXBLOCK*3][8];
125   int stubcount;
126   u_int literals[1024][2];
127   int literalcount;
128   int is_delayslot;
129   int cop1_usable;
130   u_char *out;
131   struct ll_entry *jump_in[4096];
132   struct ll_entry *jump_out[4096];
133   struct ll_entry *jump_dirty[4096];
134   u_int hash_table[65536][4]  __attribute__((aligned(16)));
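  // each hash_table bin caches two {vaddr, compiled-code pointer} pairs:
  // [0]/[1] is the most recent entry, [2]/[3] the previous one (see get_addr_ht)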
135   char shadow[1048576]  __attribute__((aligned(16)));
136   void *copy;
137   int expirep;
138 #ifndef PCSX
139   u_int using_tlb;
140 #else
141   static const u_int using_tlb=0;
142 #endif
143   int new_dynarec_did_compile;
144   u_int stop_after_jal;
145   extern u_char restore_candidate[512];
146   extern int cycle_count;
147
148   /* registers that may be allocated */
149   /* 1-31 gpr */
150 #define HIREG 32 // hi
151 #define LOREG 33 // lo
152 #define FSREG 34 // FPU status (FCSR)
153 #define CSREG 35 // Coprocessor status
154 #define CCREG 36 // Cycle count
155 #define INVCP 37 // Pointer to invalid_code
156 #define MMREG 38 // Pointer to memory_map
157 #define ROREG 39 // ram offset (if rdram!=0x80000000)
158 #define TEMPREG 40
159 #define FTEMP 40 // FPU temporary register
160 #define PTEMP 41 // Prefetch temporary register
161 #define TLREG 42 // TLB mapping offset
162 #define RHASH 43 // Return address hash
163 #define RHTBL 44 // Return address hash table address
164 #define RTEMP 45 // JR/JALR address register
165 #define MAXREG 45
166 #define AGEN1 46 // Address generation temporary register
167 #define AGEN2 47 // Address generation temporary register
168 #define MGEN1 48 // Maptable address generation temporary register
169 #define MGEN2 49 // Maptable address generation temporary register
170 #define BTREG 50 // Branch target temporary register
171
172   /* instruction types */
173 #define NOP 0     // No operation
174 #define LOAD 1    // Load
175 #define STORE 2   // Store
176 #define LOADLR 3  // Unaligned load
177 #define STORELR 4 // Unaligned store
178 #define MOV 5     // Move 
179 #define ALU 6     // Arithmetic/logic
180 #define MULTDIV 7 // Multiply/divide
181 #define SHIFT 8   // Shift by register
182 #define SHIFTIMM 9// Shift by immediate
183 #define IMM16 10  // 16-bit immediate
184 #define RJUMP 11  // Unconditional jump to register
185 #define UJUMP 12  // Unconditional jump
186 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
187 #define SJUMP 14  // Conditional branch (regimm format)
188 #define COP0 15   // Coprocessor 0
189 #define COP1 16   // Coprocessor 1
190 #define C1LS 17   // Coprocessor 1 load/store
191 #define FJUMP 18  // Conditional branch (floating point)
192 #define FLOAT 19  // Floating point unit
193 #define FCONV 20  // Convert integer to float
194 #define FCOMP 21  // Floating point compare (sets FSREG)
195 #define SYSCALL 22// SYSCALL
196 #define OTHER 23  // Other
197 #define SPAN 24   // Branch/delay slot spans 2 pages
198 #define NI 25     // Not implemented
199 #define HLECALL 26// PCSX fake opcodes for HLE
200 #define COP2 27   // Coprocessor 2 move
201 #define C2LS 28   // Coprocessor 2 load/store
202 #define C2OP 29   // Coprocessor 2 operation
203 #define INTCALL 30// Call interpreter to handle rare corner cases
204
205   /* stubs */
206 #define CC_STUB 1
207 #define FP_STUB 2
208 #define LOADB_STUB 3
209 #define LOADH_STUB 4
210 #define LOADW_STUB 5
211 #define LOADD_STUB 6
212 #define LOADBU_STUB 7
213 #define LOADHU_STUB 8
214 #define STOREB_STUB 9
215 #define STOREH_STUB 10
216 #define STOREW_STUB 11
217 #define STORED_STUB 12
218 #define STORELR_STUB 13
219 #define INVCODE_STUB 14
220
221   /* branch codes */
222 #define TAKEN 1
223 #define NOTTAKEN 2
224 #define NULLDS 3
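// internal branch-outcome codes used when assembling branches; NULLDS presumably
// marks the not-taken path of a branch-likely, where the delay slot is nullified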
225
226 // asm linkage
227 int new_recompile_block(int addr);
228 void *get_addr_ht(u_int vaddr);
229 void invalidate_block(u_int block);
230 void invalidate_addr(u_int addr);
231 void remove_hash(int vaddr);
232 void jump_vaddr();
233 void dyna_linker();
234 void dyna_linker_ds();
235 void verify_code();
236 void verify_code_vm();
237 void verify_code_ds();
238 void cc_interrupt();
239 void fp_exception();
240 void fp_exception_ds();
241 void jump_syscall();
242 void jump_syscall_hle();
243 void jump_eret();
244 void jump_hlecall();
245 void jump_intcall();
246 void new_dyna_leave();
247
248 // TLB
249 void TLBWI_new();
250 void TLBWR_new();
251 void read_nomem_new();
252 void read_nomemb_new();
253 void read_nomemh_new();
254 void read_nomemd_new();
255 void write_nomem_new();
256 void write_nomemb_new();
257 void write_nomemh_new();
258 void write_nomemd_new();
259 void write_rdram_new();
260 void write_rdramb_new();
261 void write_rdramh_new();
262 void write_rdramd_new();
263 extern u_int memory_map[1048576];
264
265 // Needed by assembler
266 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
267 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
268 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
269 void load_all_regs(signed char i_regmap[]);
270 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
271 void load_regs_entry(int t);
272 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
273
274 int tracedebug=0;
275
276 //#define DEBUG_CYCLE_COUNT 1
277
278 static void tlb_hacks()
279 {
280 #ifndef DISABLE_TLB
281   // Goldeneye hack
282   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
283   {
284     u_int addr;
285     int n;
286     switch (ROM_HEADER->Country_code&0xFF) 
287     {
288       case 0x45: // U
289         addr=0x34b30;
290         break;                   
291       case 0x4A: // J 
292         addr=0x34b70;    
293         break;    
294       case 0x50: // E 
295         addr=0x329f0;
296         break;                        
297       default: 
298         // Unknown country code
299         addr=0;
300         break;
301     }
302     u_int rom_addr=(u_int)rom;
303     #ifdef ROM_COPY
304     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
305     // in the lower 4G of memory to use this hack.  Copy it if necessary.
306     if((void *)rom>(void *)0xffffffff) {
307       munmap(ROM_COPY, 67108864);
308       if(mmap(ROM_COPY, 12582912,
309               PROT_READ | PROT_WRITE,
310               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
311               -1, 0) <= 0) {printf("mmap() failed\n");}
312       memcpy(ROM_COPY,rom,12582912);
313       rom_addr=(u_int)ROM_COPY;
314     }
315     #endif
316     if(addr) {
317       for(n=0x7F000;n<0x80000;n++) {
318         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
319       }
320     }
321   }
322 #endif
323 }
324
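// Map a guest virtual address to the 4K "page" index used for the jump_in/jump_out
// lists: RAM addresses (segment bits stripped, mirrors folded in the PCSX build) map
// directly, everything else is hashed into the 2048-4095 range.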
325 static u_int get_page(u_int vaddr)
326 {
327 #ifndef PCSX
328   u_int page=(vaddr^0x80000000)>>12;
329 #else
330   u_int page=vaddr&~0xe0000000;
331   if (page < 0x1000000)
332     page &= ~0x0e00000; // RAM mirrors
333   page>>=12;
334 #endif
335 #ifndef DISABLE_TLB
336   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
337 #endif
338   if(page>2048) page=2048+(page&2047);
339   return page;
340 }
341
342 static u_int get_vpage(u_int vaddr)
343 {
344   u_int vpage=(vaddr^0x80000000)>>12;
345 #ifndef DISABLE_TLB
346   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
347 #endif
348   if(vpage>2048) vpage=2048+(vpage&2047);
349   return vpage;
350 }
351
352 // Get address from virtual address
353 // This is called from the recompiled JR/JALR instructions
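// Lookup order: clean blocks in jump_in[page] first, then jump_dirty[vpage]
// (re-validated with verify_dirty before reuse), otherwise recompile the block.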
354 void *get_addr(u_int vaddr)
355 {
356   u_int page=get_page(vaddr);
357   u_int vpage=get_vpage(vaddr);
358   struct ll_entry *head;
359   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
360   head=jump_in[page];
361   while(head!=NULL) {
362     if(head->vaddr==vaddr&&head->reg32==0) {
363   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
364       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
365       ht_bin[3]=ht_bin[1];
366       ht_bin[2]=ht_bin[0];
367       ht_bin[1]=(int)head->addr;
368       ht_bin[0]=vaddr;
369       return head->addr;
370     }
371     head=head->next;
372   }
373   head=jump_dirty[vpage];
374   while(head!=NULL) {
375     if(head->vaddr==vaddr&&head->reg32==0) {
376       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
377       // Don't restore blocks which are about to expire from the cache
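      // (the shift reduces the offset from 'out' modulo the translation cache size,
      // 1<<TARGET_SIZE_2; blocks just behind the write pointer will be reclaimed soon)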
378       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
379       if(verify_dirty(head->addr)) {
380         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
381         invalid_code[vaddr>>12]=0;
382         inv_code_start=inv_code_end=~0;
383 #ifndef DISABLE_TLB
384         memory_map[vaddr>>12]|=0x40000000;
385 #endif
386         if(vpage<2048) {
387 #ifndef DISABLE_TLB
388           if(tlb_LUT_r[vaddr>>12]) {
389             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
390             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
391           }
392 #endif
393           restore_candidate[vpage>>3]|=1<<(vpage&7);
394         }
395         else restore_candidate[page>>3]|=1<<(page&7);
396         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
397         if(ht_bin[0]==vaddr) {
398           ht_bin[1]=(int)head->addr; // Replace existing entry
399         }
400         else
401         {
402           ht_bin[3]=ht_bin[1];
403           ht_bin[2]=ht_bin[0];
404           ht_bin[1]=(int)head->addr;
405           ht_bin[0]=vaddr;
406         }
407         return head->addr;
408       }
409     }
410     head=head->next;
411   }
412   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
413   int r=new_recompile_block(vaddr);
414   if(r==0) return get_addr(vaddr);
415   // Execute in unmapped page, generate pagefault exception
416   Status|=2;
417   Cause=(vaddr<<31)|0x8;
418   EPC=(vaddr&1)?vaddr-5:vaddr;
419   BadVAddr=(vaddr&~1);
420   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
421   EntryHi=BadVAddr&0xFFFFE000;
422   return get_addr_ht(0x80000000);
423 }
424 // Look up address in hash table first
425 void *get_addr_ht(u_int vaddr)
426 {
427   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
428   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
429   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
430   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
431   return get_addr(vaddr);
432 }
433
434 void *get_addr_32(u_int vaddr,u_int flags)
435 {
436 #ifdef FORCE32
437   return get_addr(vaddr);
438 #else
439   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
440   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
441   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
442   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
443   u_int page=get_page(vaddr);
444   u_int vpage=get_vpage(vaddr);
445   struct ll_entry *head;
446   head=jump_in[page];
447   while(head!=NULL) {
448     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
449       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
450       if(head->reg32==0) {
451         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
452         if(ht_bin[0]==-1) {
453           ht_bin[1]=(int)head->addr;
454           ht_bin[0]=vaddr;
455         }else if(ht_bin[2]==-1) {
456           ht_bin[3]=(int)head->addr;
457           ht_bin[2]=vaddr;
458         }
459         //ht_bin[3]=ht_bin[1];
460         //ht_bin[2]=ht_bin[0];
461         //ht_bin[1]=(int)head->addr;
462         //ht_bin[0]=vaddr;
463       }
464       return head->addr;
465     }
466     head=head->next;
467   }
468   head=jump_dirty[vpage];
469   while(head!=NULL) {
470     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
471       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
472       // Don't restore blocks which are about to expire from the cache
473       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
474       if(verify_dirty(head->addr)) {
475         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
476         invalid_code[vaddr>>12]=0;
477         inv_code_start=inv_code_end=~0;
478         memory_map[vaddr>>12]|=0x40000000;
479         if(vpage<2048) {
480 #ifndef DISABLE_TLB
481           if(tlb_LUT_r[vaddr>>12]) {
482             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
483             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
484           }
485 #endif
486           restore_candidate[vpage>>3]|=1<<(vpage&7);
487         }
488         else restore_candidate[page>>3]|=1<<(page&7);
489         if(head->reg32==0) {
490           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
491           if(ht_bin[0]==-1) {
492             ht_bin[1]=(int)head->addr;
493             ht_bin[0]=vaddr;
494           }else if(ht_bin[2]==-1) {
495             ht_bin[3]=(int)head->addr;
496             ht_bin[2]=vaddr;
497           }
498           //ht_bin[3]=ht_bin[1];
499           //ht_bin[2]=ht_bin[0];
500           //ht_bin[1]=(int)head->addr;
501           //ht_bin[0]=vaddr;
502         }
503         return head->addr;
504       }
505     }
506     head=head->next;
507   }
508   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
509   int r=new_recompile_block(vaddr);
510   if(r==0) return get_addr(vaddr);
511   // Execute in unmapped page, generate pagefault exception
512   Status|=2;
513   Cause=(vaddr<<31)|0x8;
514   EPC=(vaddr&1)?vaddr-5:vaddr;
515   BadVAddr=(vaddr&~1);
516   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
517   EntryHi=BadVAddr&0xFFFFE000;
518   return get_addr_ht(0x80000000);
519 #endif
520 }
521
522 void clear_all_regs(signed char regmap[])
523 {
524   int hr;
525   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
526 }
527
528 signed char get_reg(signed char regmap[],int r)
529 {
530   int hr;
531   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
532   return -1;
533 }
534
535 // Find a register that is available for two consecutive cycles
536 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
537 {
538   int hr;
539   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
540   return -1;
541 }
542
543 int count_free_regs(signed char regmap[])
544 {
545   int count=0;
546   int hr;
547   for(hr=0;hr<HOST_REGS;hr++)
548   {
549     if(hr!=EXCLUDE_REG) {
550       if(regmap[hr]<0) count++;
551     }
552   }
553   return count;
554 }
555
556 void dirty_reg(struct regstat *cur,signed char reg)
557 {
558   int hr;
559   if(!reg) return;
560   for (hr=0;hr<HOST_REGS;hr++) {
561     if((cur->regmap[hr]&63)==reg) {
562       cur->dirty|=1<<hr;
563     }
564   }
565 }
566
567 // If we dirty the lower half of a 64 bit register which is now being
568 // sign-extended, we need to dump the upper half.
569 // Note: Do this only after completion of the instruction, because
570 // some instructions may need to read the full 64-bit value even if
571 // overwriting it (eg SLTI, DSRA32).
572 static void flush_dirty_uppers(struct regstat *cur)
573 {
574   int hr,reg;
575   for (hr=0;hr<HOST_REGS;hr++) {
576     if((cur->dirty>>hr)&1) {
577       reg=cur->regmap[hr];
578       if(reg>=64) 
579         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
580     }
581   }
582 }
583
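// Note: regmap entries >= 64 (reg|64) refer to the upper 32 bits of guest register
// reg&63; set_const therefore splits a 64-bit constant across the two mapped halves.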
584 void set_const(struct regstat *cur,signed char reg,uint64_t value)
585 {
586   int hr;
587   if(!reg) return;
588   for (hr=0;hr<HOST_REGS;hr++) {
589     if(cur->regmap[hr]==reg) {
590       cur->isconst|=1<<hr;
591       cur->constmap[hr]=value;
592     }
593     else if((cur->regmap[hr]^64)==reg) {
594       cur->isconst|=1<<hr;
595       cur->constmap[hr]=value>>32;
596     }
597   }
598 }
599
600 void clear_const(struct regstat *cur,signed char reg)
601 {
602   int hr;
603   if(!reg) return;
604   for (hr=0;hr<HOST_REGS;hr++) {
605     if((cur->regmap[hr]&63)==reg) {
606       cur->isconst&=~(1<<hr);
607     }
608   }
609 }
610
611 int is_const(struct regstat *cur,signed char reg)
612 {
613   int hr;
614   if(reg<0) return 0;
615   if(!reg) return 1;
616   for (hr=0;hr<HOST_REGS;hr++) {
617     if((cur->regmap[hr]&63)==reg) {
618       return (cur->isconst>>hr)&1;
619     }
620   }
621   return 0;
622 }
623 uint64_t get_const(struct regstat *cur,signed char reg)
624 {
625   int hr;
626   if(!reg) return 0;
627   for (hr=0;hr<HOST_REGS;hr++) {
628     if(cur->regmap[hr]==reg) {
629       return cur->constmap[hr];
630     }
631   }
632   printf("Unknown constant in r%d\n",reg);
633   exit(1);
634 }
635
636 // Least soon needed registers
637 // Look at the next ten instructions and see which registers
638 // will be used.  Try not to reallocate these.
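// hsn[] is indexed by guest register and holds the distance (in instructions) to
// its next use; smaller means "needed sooner", so those registers should be kept.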
639 void lsn(u_char hsn[], int i, int *preferred_reg)
640 {
641   int j;
642   int b=-1;
643   for(j=0;j<9;j++)
644   {
645     if(i+j>=slen) {
646       j=slen-i-1;
647       break;
648     }
649     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
650     {
651       // Don't go past an unconditional jump
652       j++;
653       break;
654     }
655   }
656   for(;j>=0;j--)
657   {
658     if(rs1[i+j]) hsn[rs1[i+j]]=j;
659     if(rs2[i+j]) hsn[rs2[i+j]]=j;
660     if(rt1[i+j]) hsn[rt1[i+j]]=j;
661     if(rt2[i+j]) hsn[rt2[i+j]]=j;
662     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
663       // Stores can allocate zero
664       hsn[rs1[i+j]]=j;
665       hsn[rs2[i+j]]=j;
666     }
667     // On some architectures stores need invc_ptr
668     #if defined(HOST_IMM8)
669     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
670       hsn[INVCP]=j;
671     }
672     #endif
673     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
674     {
675       hsn[CCREG]=j;
676       b=j;
677     }
678   }
679   if(b>=0)
680   {
681     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
682     {
683       // Follow first branch
684       int t=(ba[i+b]-start)>>2;
685       j=7-b;if(t+j>=slen) j=slen-t-1;
686       for(;j>=0;j--)
687       {
688         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
689         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
690         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
691         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
692       }
693     }
694     // TODO: preferred register based on backward branch
695   }
696   // Delay slot should preferably not overwrite branch conditions or cycle count
697   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
698     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
699     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
700     hsn[CCREG]=1;
701     // ...or hash tables
702     hsn[RHASH]=1;
703     hsn[RHTBL]=1;
704   }
705   // Coprocessor load/store needs FTEMP, even if not declared
706   if(itype[i]==C1LS||itype[i]==C2LS) {
707     hsn[FTEMP]=0;
708   }
709   // Load L/R also uses FTEMP as a temporary register
710   if(itype[i]==LOADLR) {
711     hsn[FTEMP]=0;
712   }
713   // Also SWL/SWR/SDL/SDR
714   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
715     hsn[FTEMP]=0;
716   }
717   // Don't remove the TLB registers either
718   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
719     hsn[TLREG]=0;
720   }
721   // Don't remove the miniht registers
722   if(itype[i]==UJUMP||itype[i]==RJUMP)
723   {
724     hsn[RHASH]=0;
725     hsn[RHTBL]=0;
726   }
727 }
728
729 // We only want to allocate registers if we're going to use them again soon
730 int needed_again(int r, int i)
731 {
732   int j;
733   int b=-1;
734   int rn=10;
735   
736   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
737   {
738     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
739       return 0; // Don't need any registers if exiting the block
740   }
741   for(j=0;j<9;j++)
742   {
743     if(i+j>=slen) {
744       j=slen-i-1;
745       break;
746     }
747     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
748     {
749       // Don't go past an unconditional jump
750       j++;
751       break;
752     }
753     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
754     {
755       break;
756     }
757   }
758   for(;j>=1;j--)
759   {
760     if(rs1[i+j]==r) rn=j;
761     if(rs2[i+j]==r) rn=j;
762     if((unneeded_reg[i+j]>>r)&1) rn=10;
763     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
764     {
765       b=j;
766     }
767   }
768   /*
769   if(b>=0)
770   {
771     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
772     {
773       // Follow first branch
774       int o=rn;
775       int t=(ba[i+b]-start)>>2;
776       j=7-b;if(t+j>=slen) j=slen-t-1;
777       for(;j>=0;j--)
778       {
779         if(!((unneeded_reg[t+j]>>r)&1)) {
780           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
781           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
782         }
783         else rn=o;
784       }
785     }
786   }*/
787   if(rn<10) return 1;
788   return 0;
789 }
790
791 // Try to match register allocations at the end of a loop with those
792 // at the beginning
793 int loop_reg(int i, int r, int hr)
794 {
795   int j,k;
796   for(j=0;j<9;j++)
797   {
798     if(i+j>=slen) {
799       j=slen-i-1;
800       break;
801     }
802     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
803     {
804       // Don't go past an unconditional jump
805       j++;
806       break;
807     }
808   }
809   k=0;
810   if(i>0){
811     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
812       k--;
813   }
814   for(;k<j;k++)
815   {
816     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
817     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
818     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
819     {
820       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
821       {
822         int t=(ba[i+k]-start)>>2;
823         int reg=get_reg(regs[t].regmap_entry,r);
824         if(reg>=0) return reg;
825         //reg=get_reg(regs[t+1].regmap_entry,r);
826         //if(reg>=0) return reg;
827       }
828     }
829   }
830   return hr;
831 }
832
833
834 // Allocate every register, preserving source/target regs
835 void alloc_all(struct regstat *cur,int i)
836 {
837   int hr;
838   
839   for(hr=0;hr<HOST_REGS;hr++) {
840     if(hr!=EXCLUDE_REG) {
841       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
842          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
843       {
844         cur->regmap[hr]=-1;
845         cur->dirty&=~(1<<hr);
846       }
847       // Don't need zeros
848       if((cur->regmap[hr]&63)==0)
849       {
850         cur->regmap[hr]=-1;
851         cur->dirty&=~(1<<hr);
852       }
853     }
854   }
855 }
856
857 #ifndef FORCE32
858 void div64(int64_t dividend,int64_t divisor)
859 {
860   lo=dividend/divisor;
861   hi=dividend%divisor;
862   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
863   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
864 }
865 void divu64(uint64_t dividend,uint64_t divisor)
866 {
867   lo=dividend/divisor;
868   hi=dividend%divisor;
869   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
870   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
871 }
872
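// 64x64->128 bit multiply for DMULT, built from four 32x32 partial products
// (schoolbook multiplication); operands are taken as signed so the sign fixup
// at the end can produce a correctly negated 128-bit result.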
873 void mult64(int64_t m1,int64_t m2)
874 {
875    unsigned long long int op1, op2, op3, op4;
876    unsigned long long int result1, result2, result3, result4;
877    unsigned long long int temp1, temp2, temp3, temp4;
878    int sign = 0;
879    
880    if (m1 < 0)
881      {
882     op2 = -m1;
883     sign = 1 - sign;
884      }
885    else op2 = m1;
886    if (m2 < 0)
887      {
888     op4 = -m2;
889     sign = 1 - sign;
890      }
891    else op4 = m2;
892    
893    op1 = op2 & 0xFFFFFFFF;
894    op2 = (op2 >> 32) & 0xFFFFFFFF;
895    op3 = op4 & 0xFFFFFFFF;
896    op4 = (op4 >> 32) & 0xFFFFFFFF;
897    
898    temp1 = op1 * op3;
899    temp2 = (temp1 >> 32) + op1 * op4;
900    temp3 = op2 * op3;
901    temp4 = (temp3 >> 32) + op2 * op4;
902    
903    result1 = temp1 & 0xFFFFFFFF;
904    result2 = temp2 + (temp3 & 0xFFFFFFFF);
905    result3 = (result2 >> 32) + temp4;
906    result4 = (result3 >> 32);
907    
908    lo = result1 | (result2 << 32);
909    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
910    if (sign)
911      {
912     hi = ~hi;
913     if (!lo) hi++;
914     else lo = ~lo + 1;
915      }
916 }
917
918 void multu64(uint64_t m1,uint64_t m2)
919 {
920    unsigned long long int op1, op2, op3, op4;
921    unsigned long long int result1, result2, result3, result4;
922    unsigned long long int temp1, temp2, temp3, temp4;
923    
924    op1 = m1 & 0xFFFFFFFF;
925    op2 = (m1 >> 32) & 0xFFFFFFFF;
926    op3 = m2 & 0xFFFFFFFF;
927    op4 = (m2 >> 32) & 0xFFFFFFFF;
928    
929    temp1 = op1 * op3;
930    temp2 = (temp1 >> 32) + op1 * op4;
931    temp3 = op2 * op3;
932    temp4 = (temp3 >> 32) + op2 * op4;
933    
934    result1 = temp1 & 0xFFFFFFFF;
935    result2 = temp2 + (temp3 & 0xFFFFFFFF);
936    result3 = (result2 >> 32) + temp4;
937    result4 = (result3 >> 32);
938    
939    lo = result1 | (result2 << 32);
940    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
941    
942   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
943   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
944 }
945
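// Merge helpers for the unaligned LDL/LDR loads: keep the unaffected low (LDL) or
// high (LDR) portion of the old register value and splice in the newly loaded bytes.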
946 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
947 {
948   if(bits) {
949     original<<=64-bits;
950     original>>=64-bits;
951     loaded<<=bits;
952     original|=loaded;
953   }
954   else original=loaded;
955   return original;
956 }
957 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
958 {
959   if(bits^56) {
960     original>>=64-(bits^56);
961     original<<=64-(bits^56);
962     loaded>>=bits^56;
963     original|=loaded;
964   }
965   else original=loaded;
966   return original;
967 }
968 #endif
969
970 #ifdef __i386__
971 #include "assem_x86.c"
972 #endif
973 #ifdef __x86_64__
974 #include "assem_x64.c"
975 #endif
976 #ifdef __arm__
977 #include "assem_arm.c"
978 #endif
979
980 // Add virtual address mapping to linked list
981 void ll_add(struct ll_entry **head,int vaddr,void *addr)
982 {
983   struct ll_entry *new_entry;
984   new_entry=malloc(sizeof(struct ll_entry));
985   assert(new_entry!=NULL);
986   new_entry->vaddr=vaddr;
987   new_entry->reg32=0;
988   new_entry->addr=addr;
989   new_entry->next=*head;
990   *head=new_entry;
991 }
992
993 // Add virtual address mapping for 32-bit compiled block
994 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
995 {
996   ll_add(head,vaddr,addr);
997 #ifndef FORCE32
998   (*head)->reg32=reg32;
999 #endif
1000 }
1001
1002 // Check if an address is already compiled
1003 // but don't return addresses which are about to expire from the cache
1004 void *check_addr(u_int vaddr)
1005 {
1006   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1007   if(ht_bin[0]==vaddr) {
1008     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1009       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1010   }
1011   if(ht_bin[2]==vaddr) {
1012     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1013       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1014   }
1015   u_int page=get_page(vaddr);
1016   struct ll_entry *head;
1017   head=jump_in[page];
1018   while(head!=NULL) {
1019     if(head->vaddr==vaddr&&head->reg32==0) {
1020       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1021         // Update existing entry with current address
1022         if(ht_bin[0]==vaddr) {
1023           ht_bin[1]=(int)head->addr;
1024           return head->addr;
1025         }
1026         if(ht_bin[2]==vaddr) {
1027           ht_bin[3]=(int)head->addr;
1028           return head->addr;
1029         }
1030         // Insert into hash table with low priority.
1031         // Don't evict existing entries, as they are probably
1032         // addresses that are being accessed frequently.
1033         if(ht_bin[0]==-1) {
1034           ht_bin[1]=(int)head->addr;
1035           ht_bin[0]=vaddr;
1036         }else if(ht_bin[2]==-1) {
1037           ht_bin[3]=(int)head->addr;
1038           ht_bin[2]=vaddr;
1039         }
1040         return head->addr;
1041       }
1042     }
1043     head=head->next;
1044   }
1045   return 0;
1046 }
1047
1048 void remove_hash(int vaddr)
1049 {
1050   //printf("remove hash: %x\n",vaddr);
1051   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1052   if(ht_bin[2]==vaddr) {
1053     ht_bin[2]=ht_bin[3]=-1;
1054   }
1055   if(ht_bin[0]==vaddr) {
1056     ht_bin[0]=ht_bin[2];
1057     ht_bin[1]=ht_bin[3];
1058     ht_bin[2]=ht_bin[3]=-1;
1059   }
1060 }
1061
1062 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1063 {
1064   struct ll_entry *next;
1065   while(*head) {
1066     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1067        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1068     {
1069       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1070       remove_hash((*head)->vaddr);
1071       next=(*head)->next;
1072       free(*head);
1073       *head=next;
1074     }
1075     else
1076     {
1077       head=&((*head)->next);
1078     }
1079   }
1080 }
1081
1082 // Remove all entries from linked list
1083 void ll_clear(struct ll_entry **head)
1084 {
1085   struct ll_entry *cur;
1086   struct ll_entry *next;
1087   if(cur=*head) {
1088     *head=0;
1089     while(cur) {
1090       next=cur->next;
1091       free(cur);
1092       cur=next;
1093     }
1094   }
1095 }
1096
1097 // Dereference the pointers and remove if it matches
1098 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1099 {
1100   while(head) {
1101     int ptr=get_pointer(head->addr);
1102     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1103     if(((ptr>>shift)==(addr>>shift)) ||
1104        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1105     {
1106       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1107       u_int host_addr=(u_int)kill_pointer(head->addr);
1108       #ifdef __arm__
1109         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1110       #endif
1111     }
1112     head=head->next;
1113   }
1114 }
1115
1116 // This is called when we write to a compiled block (see do_invstub)
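// Frees the jump_in entries for this page and "kills" each jump_out link, i.e.
// patches other blocks' outgoing jumps so they re-resolve through the dynamic
// linker instead of jumping into the now-invalid code (jump_dirty is left alone
// so the block can be restored later if the code turns out to be unmodified).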
1117 void invalidate_page(u_int page)
1118 {
1119   struct ll_entry *head;
1120   struct ll_entry *next;
1121   head=jump_in[page];
1122   jump_in[page]=0;
1123   while(head!=NULL) {
1124     inv_debug("INVALIDATE: %x\n",head->vaddr);
1125     remove_hash(head->vaddr);
1126     next=head->next;
1127     free(head);
1128     head=next;
1129   }
1130   head=jump_out[page];
1131   jump_out[page]=0;
1132   while(head!=NULL) {
1133     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1134     u_int host_addr=(u_int)kill_pointer(head->addr);
1135     #ifdef __arm__
1136       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1137     #endif
1138     next=head->next;
1139     free(head);
1140     head=next;
1141   }
1142 }
1143
1144 static void invalidate_block_range(u_int block, u_int first, u_int last)
1145 {
1146   u_int page=get_page(block<<12);
1147   //printf("first=%d last=%d\n",first,last);
1148   invalidate_page(page);
1149   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1150   assert(last<page+5);
1151   // Invalidate the adjacent pages if a block crosses a 4K boundary
1152   while(first<page) {
1153     invalidate_page(first);
1154     first++;
1155   }
1156   for(first=page+1;first<last;first++) {
1157     invalidate_page(first);
1158   }
1159   #ifdef __arm__
1160     do_clear_cache();
1161   #endif
1162   
1163   // Don't trap writes
1164   invalid_code[block]=1;
1165 #ifndef DISABLE_TLB
1166   // If there is a valid TLB entry for this page, remove write protect
1167   if(tlb_LUT_w[block]) {
1168     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1169     // CHECK: Is this right?
1170     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1171     u_int real_block=tlb_LUT_w[block]>>12;
1172     invalid_code[real_block]=1;
1173     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1174   }
1175   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1176 #endif
1177
1178   #ifdef USE_MINI_HT
1179   memset(mini_ht,-1,sizeof(mini_ht));
1180   #endif
1181 }
1182
1183 void invalidate_block(u_int block)
1184 {
1185   u_int page=get_page(block<<12);
1186   u_int vpage=get_vpage(block<<12);
1187   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1188   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1189   u_int first,last;
1190   first=last=page;
1191   struct ll_entry *head;
1192   head=jump_dirty[vpage];
1193   //printf("page=%d vpage=%d\n",page,vpage);
1194   while(head!=NULL) {
1195     u_int start,end;
1196     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1197       get_bounds((int)head->addr,&start,&end);
1198       //printf("start: %x end: %x\n",start,end);
1199       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1200         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1201           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1202           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1203         }
1204       }
1205 #ifndef DISABLE_TLB
1206       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1207         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1208           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1209           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1210         }
1211       }
1212 #endif
1213     }
1214     head=head->next;
1215   }
1216   invalidate_block_range(block,first,last);
1217 }
1218
1219 void invalidate_addr(u_int addr)
1220 {
1221 #ifdef PCSX
1222   //static int rhits;
1223   // this check is done by the caller
1224   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1225   u_int page=get_page(addr);
1226   if(page<2048) { // RAM
1227     struct ll_entry *head;
1228     u_int addr_min=~0, addr_max=0;
1229     int mask=RAM_SIZE-1;
1230     int pg1;
1231     inv_code_start=addr&~0xfff;
1232     inv_code_end=addr|0xfff;
1233     pg1=page;
1234     if (pg1>0) {
1235       // must check previous page too because of spans..
1236       pg1--;
1237       inv_code_start-=0x1000;
1238     }
1239     for(;pg1<=page;pg1++) {
1240       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1241         u_int start,end;
1242         get_bounds((int)head->addr,&start,&end);
1243         if((start&mask)<=(addr&mask)&&(addr&mask)<(end&mask)) {
1244           if(start<addr_min) addr_min=start;
1245           if(end>addr_max) addr_max=end;
1246         }
1247         else if(addr<start) {
1248           if(start<inv_code_end)
1249             inv_code_end=start-1;
1250         }
1251         else {
1252           if(end>inv_code_start)
1253             inv_code_start=end;
1254         }
1255       }
1256     }
1257     if (addr_min!=~0) {
1258       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1259       inv_code_start=inv_code_end=~0;
1260       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1261       return;
1262     }
1263     else {
1264       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);//rhits);
1265     }
1266     //rhits=0;
1267     if(page!=0) // FIXME: don't know what's up with page 0 (Klonoa)
1268       return;
1269   }
1270 #endif
1271   invalidate_block(addr>>12);
1272 }
1273
1274 // This is called when loading a save state.
1275 // Anything could have changed, so invalidate everything.
1276 void invalidate_all_pages()
1277 {
1278   u_int page,n;
1279   for(page=0;page<4096;page++)
1280     invalidate_page(page);
1281   for(page=0;page<1048576;page++)
1282     if(!invalid_code[page]) {
1283       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1284       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1285     }
1286   #ifdef __arm__
1287   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1288   #endif
1289   #ifdef USE_MINI_HT
1290   memset(mini_ht,-1,sizeof(mini_ht));
1291   #endif
1292   #ifndef DISABLE_TLB
1293   // TLB
1294   for(page=0;page<0x100000;page++) {
1295     if(tlb_LUT_r[page]) {
1296       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1297       if(!tlb_LUT_w[page]||!invalid_code[page])
1298         memory_map[page]|=0x40000000; // Write protect
1299     }
1300     else memory_map[page]=-1;
1301     if(page==0x80000) page=0xC0000;
1302   }
1303   tlb_hacks();
1304   #endif
1305 }
1306
1307 // Add an entry to jump_out after making a link
1308 void add_link(u_int vaddr,void *src)
1309 {
1310   u_int page=get_page(vaddr);
1311   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1312   int *ptr=(int *)(src+4);
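  // the assert below checks that the word following the branch at src is an ARM
  // "ldr ..., [pc, #imm]" literal load (0x059f0000 with condition/register bits masked)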
1313   assert((*ptr&0x0fff0000)==0x059f0000);
1314   ll_add(jump_out+page,vaddr,src);
1315   //int ptr=get_pointer(src);
1316   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1317 }
1318
1319 // If a code block was found to be unmodified (bit was set in
1320 // restore_candidate) and it remains unmodified (bit is clear
1321 // in invalid_code) then move the entries for that 4K page from
1322 // the dirty list to the clean list.
1323 void clean_blocks(u_int page)
1324 {
1325   struct ll_entry *head;
1326   inv_debug("INV: clean_blocks page=%d\n",page);
1327   head=jump_dirty[page];
1328   while(head!=NULL) {
1329     if(!invalid_code[head->vaddr>>12]) {
1330       // Don't restore blocks which are about to expire from the cache
1331       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1332         u_int start,end;
1333         if(verify_dirty((int)head->addr)) {
1334           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1335           u_int i;
1336           u_int inv=0;
1337           get_bounds((int)head->addr,&start,&end);
1338           if(start-(u_int)rdram<RAM_SIZE) {
1339             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1340               inv|=invalid_code[i];
1341             }
1342           }
1343 #ifndef DISABLE_TLB
1344           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1345             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1346             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1347             if(addr<start||addr>=end) inv=1;
1348           }
1349 #endif
1350           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1351             inv=1;
1352           }
1353           if(!inv) {
1354             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1355             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1356               u_int ppage=page;
1357 #ifndef DISABLE_TLB
1358               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1359 #endif
1360               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1361               //printf("page=%x, addr=%x\n",page,head->vaddr);
1362               //assert(head->vaddr>>12==(page|0x80000));
1363               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1364               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1365               if(!head->reg32) {
1366                 if(ht_bin[0]==head->vaddr) {
1367                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1368                 }
1369                 if(ht_bin[2]==head->vaddr) {
1370                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1371                 }
1372               }
1373             }
1374           }
1375         }
1376       }
1377     }
1378     head=head->next;
1379   }
1380 }
1381
1382
1383 void mov_alloc(struct regstat *current,int i)
1384 {
1385   // Note: Don't need to actually alloc the source registers
1386   if((~current->is32>>rs1[i])&1) {
1387     //alloc_reg64(current,i,rs1[i]);
1388     alloc_reg64(current,i,rt1[i]);
1389     current->is32&=~(1LL<<rt1[i]);
1390   } else {
1391     //alloc_reg(current,i,rs1[i]);
1392     alloc_reg(current,i,rt1[i]);
1393     current->is32|=(1LL<<rt1[i]);
1394   }
1395   clear_const(current,rs1[i]);
1396   clear_const(current,rt1[i]);
1397   dirty_reg(current,rt1[i]);
1398 }
1399
1400 void shiftimm_alloc(struct regstat *current,int i)
1401 {
1402   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1403   {
1404     if(rt1[i]) {
1405       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1406       else lt1[i]=rs1[i];
1407       alloc_reg(current,i,rt1[i]);
1408       current->is32|=1LL<<rt1[i];
1409       dirty_reg(current,rt1[i]);
1410       if(is_const(current,rs1[i])) {
1411         int v=get_const(current,rs1[i]);
1412         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1413         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1414         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1415       }
1416       else clear_const(current,rt1[i]);
1417     }
1418   }
1419   else
1420   {
1421     clear_const(current,rs1[i]);
1422     clear_const(current,rt1[i]);
1423   }
1424
1425   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1426   {
1427     if(rt1[i]) {
1428       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1429       alloc_reg64(current,i,rt1[i]);
1430       current->is32&=~(1LL<<rt1[i]);
1431       dirty_reg(current,rt1[i]);
1432     }
1433   }
1434   if(opcode2[i]==0x3c) // DSLL32
1435   {
1436     if(rt1[i]) {
1437       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1438       alloc_reg64(current,i,rt1[i]);
1439       current->is32&=~(1LL<<rt1[i]);
1440       dirty_reg(current,rt1[i]);
1441     }
1442   }
1443   if(opcode2[i]==0x3e) // DSRL32
1444   {
1445     if(rt1[i]) {
1446       alloc_reg64(current,i,rs1[i]);
1447       if(imm[i]==32) {
1448         alloc_reg64(current,i,rt1[i]);
1449         current->is32&=~(1LL<<rt1[i]);
1450       } else {
1451         alloc_reg(current,i,rt1[i]);
1452         current->is32|=1LL<<rt1[i];
1453       }
1454       dirty_reg(current,rt1[i]);
1455     }
1456   }
1457   if(opcode2[i]==0x3f) // DSRA32
1458   {
1459     if(rt1[i]) {
1460       alloc_reg64(current,i,rs1[i]);
1461       alloc_reg(current,i,rt1[i]);
1462       current->is32|=1LL<<rt1[i];
1463       dirty_reg(current,rt1[i]);
1464     }
1465   }
1466 }
1467
1468 void shift_alloc(struct regstat *current,int i)
1469 {
1470   if(rt1[i]) {
1471     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1472     {
1473       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1474       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1475       alloc_reg(current,i,rt1[i]);
1476       if(rt1[i]==rs2[i]) {
1477         alloc_reg_temp(current,i,-1);
1478         minimum_free_regs[i]=1;
1479       }
1480       current->is32|=1LL<<rt1[i];
1481     } else { // DSLLV/DSRLV/DSRAV
1482       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1483       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1484       alloc_reg64(current,i,rt1[i]);
1485       current->is32&=~(1LL<<rt1[i]);
1486       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1487       {
1488         alloc_reg_temp(current,i,-1);
1489         minimum_free_regs[i]=1;
1490       }
1491     }
1492     clear_const(current,rs1[i]);
1493     clear_const(current,rs2[i]);
1494     clear_const(current,rt1[i]);
1495     dirty_reg(current,rt1[i]);
1496   }
1497 }
1498
1499 void alu_alloc(struct regstat *current,int i)
1500 {
1501   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1502     if(rt1[i]) {
1503       if(rs1[i]&&rs2[i]) {
1504         alloc_reg(current,i,rs1[i]);
1505         alloc_reg(current,i,rs2[i]);
1506       }
1507       else {
1508         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1509         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1510       }
1511       alloc_reg(current,i,rt1[i]);
1512     }
1513     current->is32|=1LL<<rt1[i];
1514   }
1515   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1516     if(rt1[i]) {
1517       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1518       {
1519         alloc_reg64(current,i,rs1[i]);
1520         alloc_reg64(current,i,rs2[i]);
1521         alloc_reg(current,i,rt1[i]);
1522       } else {
1523         alloc_reg(current,i,rs1[i]);
1524         alloc_reg(current,i,rs2[i]);
1525         alloc_reg(current,i,rt1[i]);
1526       }
1527     }
1528     current->is32|=1LL<<rt1[i];
1529   }
1530   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1531     if(rt1[i]) {
1532       if(rs1[i]&&rs2[i]) {
1533         alloc_reg(current,i,rs1[i]);
1534         alloc_reg(current,i,rs2[i]);
1535       }
1536       else
1537       {
1538         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1539         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1540       }
1541       alloc_reg(current,i,rt1[i]);
1542       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1543       {
1544         if(!((current->uu>>rt1[i])&1)) {
1545           alloc_reg64(current,i,rt1[i]);
1546         }
1547         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1548           if(rs1[i]&&rs2[i]) {
1549             alloc_reg64(current,i,rs1[i]);
1550             alloc_reg64(current,i,rs2[i]);
1551           }
1552           else
1553           {
1554             // Is it really worth it to keep 64-bit values in registers?
1555             #ifdef NATIVE_64BIT
1556             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1557             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1558             #endif
1559           }
1560         }
1561         current->is32&=~(1LL<<rt1[i]);
1562       } else {
1563         current->is32|=1LL<<rt1[i];
1564       }
1565     }
1566   }
1567   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1568     if(rt1[i]) {
1569       if(rs1[i]&&rs2[i]) {
1570         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1571           alloc_reg64(current,i,rs1[i]);
1572           alloc_reg64(current,i,rs2[i]);
1573           alloc_reg64(current,i,rt1[i]);
1574         } else {
1575           alloc_reg(current,i,rs1[i]);
1576           alloc_reg(current,i,rs2[i]);
1577           alloc_reg(current,i,rt1[i]);
1578         }
1579       }
1580       else {
1581         alloc_reg(current,i,rt1[i]);
1582         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1583           // DADD used as move, or zeroing
1584           // If we have a 64-bit source, then make the target 64 bits too
1585           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1586             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1587             alloc_reg64(current,i,rt1[i]);
1588           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1589             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1590             alloc_reg64(current,i,rt1[i]);
1591           }
1592           if(opcode2[i]>=0x2e&&rs2[i]) {
1593             // DSUB used as negation - 64-bit result
1594             // If we have a 32-bit register, extend it to 64 bits
1595             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1596             alloc_reg64(current,i,rt1[i]);
1597           }
1598         }
1599       }
1600       if(rs1[i]&&rs2[i]) {
1601         current->is32&=~(1LL<<rt1[i]);
1602       } else if(rs1[i]) {
1603         current->is32&=~(1LL<<rt1[i]);
1604         if((current->is32>>rs1[i])&1)
1605           current->is32|=1LL<<rt1[i];
1606       } else if(rs2[i]) {
1607         current->is32&=~(1LL<<rt1[i]);
1608         if((current->is32>>rs2[i])&1)
1609           current->is32|=1LL<<rt1[i];
1610       } else {
1611         current->is32|=1LL<<rt1[i];
1612       }
1613     }
1614   }
1615   clear_const(current,rs1[i]);
1616   clear_const(current,rs2[i]);
1617   clear_const(current,rt1[i]);
1618   dirty_reg(current,rt1[i]);
1619 }
1620
1621 void imm16_alloc(struct regstat *current,int i)
1622 {
1623   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1624   else lt1[i]=rs1[i];
1625   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1626   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1627     current->is32&=~(1LL<<rt1[i]);
1628     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1629       // TODO: Could preserve the 32-bit flag if the immediate is zero
1630       alloc_reg64(current,i,rt1[i]);
1631       alloc_reg64(current,i,rs1[i]);
1632     }
1633     clear_const(current,rs1[i]);
1634     clear_const(current,rt1[i]);
1635   }
1636   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1637     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1638     current->is32|=1LL<<rt1[i];
1639     clear_const(current,rs1[i]);
1640     clear_const(current,rt1[i]);
1641   }
1642   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1643     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1644       if(rs1[i]!=rt1[i]) {
1645         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1646         alloc_reg64(current,i,rt1[i]);
1647         current->is32&=~(1LL<<rt1[i]);
1648       }
1649     }
1650     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1651     if(is_const(current,rs1[i])) {
1652       int v=get_const(current,rs1[i]);
1653       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1654       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1655       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1656     }
1657     else clear_const(current,rt1[i]);
1658   }
1659   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1660     if(is_const(current,rs1[i])) {
1661       int v=get_const(current,rs1[i]);
1662       set_const(current,rt1[i],v+imm[i]);
1663     }
1664     else clear_const(current,rt1[i]);
1665     current->is32|=1LL<<rt1[i];
1666   }
1667   else {
1668     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1669     current->is32|=1LL<<rt1[i];
1670   }
1671   dirty_reg(current,rt1[i]);
1672 }
1673
1674 void load_alloc(struct regstat *current,int i)
1675 {
1676   clear_const(current,rt1[i]);
1677   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1678   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1679   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1680   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1681     alloc_reg(current,i,rt1[i]);
1682     assert(get_reg(current->regmap,rt1[i])>=0);
1683     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1684     {
1685       current->is32&=~(1LL<<rt1[i]);
1686       alloc_reg64(current,i,rt1[i]);
1687     }
1688     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1689     {
1690       current->is32&=~(1LL<<rt1[i]);
1691       alloc_reg64(current,i,rt1[i]);
1692       alloc_all(current,i);
1693       alloc_reg64(current,i,FTEMP);
1694       minimum_free_regs[i]=HOST_REGS;
1695     }
1696     else current->is32|=1LL<<rt1[i];
1697     dirty_reg(current,rt1[i]);
1698     // If using TLB, need a register for pointer to the mapping table
1699     if(using_tlb) alloc_reg(current,i,TLREG);
1700     // LWL/LWR need a temporary register for the old value
1701     if(opcode[i]==0x22||opcode[i]==0x26)
1702     {
1703       alloc_reg(current,i,FTEMP);
1704       alloc_reg_temp(current,i,-1);
1705       minimum_free_regs[i]=1;
1706     }
1707   }
1708   else
1709   {
1710     // Load to r0 or unneeded register (dummy load)
1711     // but we still need a register to calculate the address
1712     if(opcode[i]==0x22||opcode[i]==0x26)
1713     {
1714       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1715     }
1716     // If using TLB, need a register for pointer to the mapping table
1717     if(using_tlb) alloc_reg(current,i,TLREG);
1718     alloc_reg_temp(current,i,-1);
1719     minimum_free_regs[i]=1;
1720     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1721     {
1722       alloc_all(current,i);
1723       alloc_reg64(current,i,FTEMP);
1724       minimum_free_regs[i]=HOST_REGS;
1725     }
1726   }
1727 }
1728
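// Register allocation for stores: address base (rs1), data (rs2), FTEMP for
// the 64-bit and unaligned cases, and a temporary for address generation.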
1729 void store_alloc(struct regstat *current,int i)
1730 {
1731   clear_const(current,rs2[i]);
1732   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1733   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1734   alloc_reg(current,i,rs2[i]);
1735   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1736     alloc_reg64(current,i,rs2[i]);
1737     if(rs2[i]) alloc_reg(current,i,FTEMP);
1738   }
1739   // If using TLB, need a register for pointer to the mapping table
1740   if(using_tlb) alloc_reg(current,i,TLREG);
1741   #if defined(HOST_IMM8)
1742   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1743   else alloc_reg(current,i,INVCP);
1744   #endif
1745   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1746     alloc_reg(current,i,FTEMP);
1747   }
1748   // We need a temporary register for address generation
1749   alloc_reg_temp(current,i,-1);
1750   minimum_free_regs[i]=1;
1751 }
1752
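// Register allocation for COP1 loads/stores (LWC1/SWC1/LDC1/SDC1);
// CSREG holds the status word and FTEMP holds the data being transferred.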
1753 void c1ls_alloc(struct regstat *current,int i)
1754 {
1755   //clear_const(current,rs1[i]); // FIXME
1756   clear_const(current,rt1[i]);
1757   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1758   alloc_reg(current,i,CSREG); // Status
1759   alloc_reg(current,i,FTEMP);
1760   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1761     alloc_reg64(current,i,FTEMP);
1762   }
1763   // If using TLB, need a register for pointer to the mapping table
1764   if(using_tlb) alloc_reg(current,i,TLREG);
1765   #if defined(HOST_IMM8)
1766   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1767   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1768     alloc_reg(current,i,INVCP);
1769   #endif
1770   // We need a temporary register for address generation
1771   alloc_reg_temp(current,i,-1);
1772 }
1773
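// Register allocation for COP2 (GTE) loads/stores.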
1774 void c2ls_alloc(struct regstat *current,int i)
1775 {
1776   clear_const(current,rt1[i]);
1777   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1778   alloc_reg(current,i,FTEMP);
1779   // If using TLB, need a register for pointer to the mapping table
1780   if(using_tlb) alloc_reg(current,i,TLREG);
1781   #if defined(HOST_IMM8)
1782   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1783   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1784     alloc_reg(current,i,INVCP);
1785   #endif
1786   // We need a temporary register for address generation
1787   alloc_reg_temp(current,i,-1);
1788   minimum_free_regs[i]=1;
1789 }
1790
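// Register allocation for the MULT/DIV group (results go to HI/LO).
// The #ifndef guard presumably lets an architecture-specific assembler
// header supply its own allocator in place of this generic one.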
1791 #ifndef multdiv_alloc
1792 void multdiv_alloc(struct regstat *current,int i)
1793 {
1794   //  case 0x18: MULT
1795   //  case 0x19: MULTU
1796   //  case 0x1A: DIV
1797   //  case 0x1B: DIVU
1798   //  case 0x1C: DMULT
1799   //  case 0x1D: DMULTU
1800   //  case 0x1E: DDIV
1801   //  case 0x1F: DDIVU
1802   clear_const(current,rs1[i]);
1803   clear_const(current,rs2[i]);
1804   if(rs1[i]&&rs2[i])
1805   {
1806     if((opcode2[i]&4)==0) // 32-bit
1807     {
1808       current->u&=~(1LL<<HIREG);
1809       current->u&=~(1LL<<LOREG);
1810       alloc_reg(current,i,HIREG);
1811       alloc_reg(current,i,LOREG);
1812       alloc_reg(current,i,rs1[i]);
1813       alloc_reg(current,i,rs2[i]);
1814       current->is32|=1LL<<HIREG;
1815       current->is32|=1LL<<LOREG;
1816       dirty_reg(current,HIREG);
1817       dirty_reg(current,LOREG);
1818     }
1819     else // 64-bit
1820     {
1821       current->u&=~(1LL<<HIREG);
1822       current->u&=~(1LL<<LOREG);
1823       current->uu&=~(1LL<<HIREG);
1824       current->uu&=~(1LL<<LOREG);
1825       alloc_reg64(current,i,HIREG);
1826       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1827       alloc_reg64(current,i,rs1[i]);
1828       alloc_reg64(current,i,rs2[i]);
1829       alloc_all(current,i);
1830       current->is32&=~(1LL<<HIREG);
1831       current->is32&=~(1LL<<LOREG);
1832       dirty_reg(current,HIREG);
1833       dirty_reg(current,LOREG);
1834       minimum_free_regs[i]=HOST_REGS;
1835     }
1836   }
1837   else
1838   {
1839     // Multiply by zero is zero.
1840     // MIPS does not have a divide by zero exception.
1841     // The result is undefined; we return zero.
1842     alloc_reg(current,i,HIREG);
1843     alloc_reg(current,i,LOREG);
1844     current->is32|=1LL<<HIREG;
1845     current->is32|=1LL<<LOREG;
1846     dirty_reg(current,HIREG);
1847     dirty_reg(current,LOREG);
1848   }
1849 }
1850 #endif
1851
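// Register allocation for COP0: MFC0/MTC0 and the TLB/ERET group.
// These can have wide side effects, so alloc_all() is used in every case.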
1852 void cop0_alloc(struct regstat *current,int i)
1853 {
1854   if(opcode2[i]==0) // MFC0
1855   {
1856     if(rt1[i]) {
1857       clear_const(current,rt1[i]);
1858       alloc_all(current,i);
1859       alloc_reg(current,i,rt1[i]);
1860       current->is32|=1LL<<rt1[i];
1861       dirty_reg(current,rt1[i]);
1862     }
1863   }
1864   else if(opcode2[i]==4) // MTC0
1865   {
1866     if(rs1[i]){
1867       clear_const(current,rs1[i]);
1868       alloc_reg(current,i,rs1[i]);
1869       alloc_all(current,i);
1870     }
1871     else {
1872       alloc_all(current,i); // FIXME: Keep r0
1873       current->u&=~1LL;
1874       alloc_reg(current,i,0);
1875     }
1876   }
1877   else
1878   {
1879     // TLBR/TLBWI/TLBWR/TLBP/ERET
1880     assert(opcode2[i]==0x10);
1881     alloc_all(current,i);
1882   }
1883   minimum_free_regs[i]=HOST_REGS;
1884 }
1885
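// Register allocation for COP1 register moves (MFC1/DMFC1/CFC1 and
// MTC1/DMTC1/CTC1); also reused for COP2 moves (see the COP1/COP2 cases
// in delayslot_alloc below).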
1886 void cop1_alloc(struct regstat *current,int i)
1887 {
1888   alloc_reg(current,i,CSREG); // Load status
1889   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1890   {
1891     if(rt1[i]){
1892       clear_const(current,rt1[i]);
1893       if(opcode2[i]==1) {
1894         alloc_reg64(current,i,rt1[i]); // DMFC1
1895         current->is32&=~(1LL<<rt1[i]);
1896       }else{
1897         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1898         current->is32|=1LL<<rt1[i];
1899       }
1900       dirty_reg(current,rt1[i]);
1901     }
1902     alloc_reg_temp(current,i,-1);
1903   }
1904   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1905   {
1906     if(rs1[i]){
1907       clear_const(current,rs1[i]);
1908       if(opcode2[i]==5)
1909         alloc_reg64(current,i,rs1[i]); // DMTC1
1910       else
1911         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1912       alloc_reg_temp(current,i,-1);
1913     }
1914     else {
1915       current->u&=~1LL;
1916       alloc_reg(current,i,0);
1917       alloc_reg_temp(current,i,-1);
1918     }
1919   }
1920   minimum_free_regs[i]=1;
1921 }
1922 void fconv_alloc(struct regstat *current,int i)
1923 {
1924   alloc_reg(current,i,CSREG); // Load status
1925   alloc_reg_temp(current,i,-1);
1926   minimum_free_regs[i]=1;
1927 }
1928 void float_alloc(struct regstat *current,int i)
1929 {
1930   alloc_reg(current,i,CSREG); // Load status
1931   alloc_reg_temp(current,i,-1);
1932   minimum_free_regs[i]=1;
1933 }
1934 void c2op_alloc(struct regstat *current,int i)
1935 {
1936   alloc_reg_temp(current,i,-1);
1937 }
1938 void fcomp_alloc(struct regstat *current,int i)
1939 {
1940   alloc_reg(current,i,CSREG); // Load status
1941   alloc_reg(current,i,FSREG); // Load flags
1942   dirty_reg(current,FSREG); // Flag will be modified
1943   alloc_reg_temp(current,i,-1);
1944   minimum_free_regs[i]=1;
1945 }
1946
1947 void syscall_alloc(struct regstat *current,int i)
1948 {
1949   alloc_cc(current,i);
1950   dirty_reg(current,CCREG);
1951   alloc_all(current,i);
1952   minimum_free_regs[i]=HOST_REGS;
1953   current->isconst=0;
1954 }
1955
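// Allocate registers for the instruction in a branch delay slot by
// dispatching on its itype. A jump in the delay slot is not supported;
// if one is seen, speculative precompilation is turned off (stop_after_jal).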
1956 void delayslot_alloc(struct regstat *current,int i)
1957 {
1958   switch(itype[i]) {
1959     case UJUMP:
1960     case CJUMP:
1961     case SJUMP:
1962     case RJUMP:
1963     case FJUMP:
1964     case SYSCALL:
1965     case HLECALL:
1966     case SPAN:
1967       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1968       printf("Disabled speculative precompilation\n");
1969       stop_after_jal=1;
1970       break;
1971     case IMM16:
1972       imm16_alloc(current,i);
1973       break;
1974     case LOAD:
1975     case LOADLR:
1976       load_alloc(current,i);
1977       break;
1978     case STORE:
1979     case STORELR:
1980       store_alloc(current,i);
1981       break;
1982     case ALU:
1983       alu_alloc(current,i);
1984       break;
1985     case SHIFT:
1986       shift_alloc(current,i);
1987       break;
1988     case MULTDIV:
1989       multdiv_alloc(current,i);
1990       break;
1991     case SHIFTIMM:
1992       shiftimm_alloc(current,i);
1993       break;
1994     case MOV:
1995       mov_alloc(current,i);
1996       break;
1997     case COP0:
1998       cop0_alloc(current,i);
1999       break;
2000     case COP1:
2001     case COP2:
2002       cop1_alloc(current,i);
2003       break;
2004     case C1LS:
2005       c1ls_alloc(current,i);
2006       break;
2007     case C2LS:
2008       c2ls_alloc(current,i);
2009       break;
2010     case FCONV:
2011       fconv_alloc(current,i);
2012       break;
2013     case FLOAT:
2014       float_alloc(current,i);
2015       break;
2016     case FCOMP:
2017       fcomp_alloc(current,i);
2018       break;
2019     case C2OP:
2020       c2op_alloc(current,i);
2021       break;
2022   }
2023 }
2024
2025 // Special case where a branch and delay slot span two pages in virtual memory
2026 static void pagespan_alloc(struct regstat *current,int i)
2027 {
2028   current->isconst=0;
2029   current->wasconst=0;
2030   regs[i].wasconst=0;
2031   minimum_free_regs[i]=HOST_REGS;
2032   alloc_all(current,i);
2033   alloc_cc(current,i);
2034   dirty_reg(current,CCREG);
2035   if(opcode[i]==3) // JAL
2036   {
2037     alloc_reg(current,i,31);
2038     dirty_reg(current,31);
2039   }
2040   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2041   {
2042     alloc_reg(current,i,rs1[i]);
2043     if (rt1[i]!=0) {
2044       alloc_reg(current,i,rt1[i]);
2045       dirty_reg(current,rt1[i]);
2046     }
2047   }
2048   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2049   {
2050     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2051     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2052     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2053     {
2054       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2055       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2056     }
2057   }
2058   else
2059   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2060   {
2061     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2062     if(!((current->is32>>rs1[i])&1))
2063     {
2064       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2065     }
2066   }
2067   else
2068   if(opcode[i]==0x11) // BC1
2069   {
2070     alloc_reg(current,i,FSREG);
2071     alloc_reg(current,i,CSREG);
2072   }
2073   //else ...
2074 }
2075
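// Queue an out-of-line stub to be generated after the main block: records the
// stub type, the branch location to patch, the return address and up to five
// type-specific arguments in the stubs[] table. Typical use (from the load
// path below):
//   add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);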
2076 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2077 {
2078   stubs[stubcount][0]=type;
2079   stubs[stubcount][1]=addr;
2080   stubs[stubcount][2]=retaddr;
2081   stubs[stubcount][3]=a;
2082   stubs[stubcount][4]=b;
2083   stubs[stubcount][5]=c;
2084   stubs[stubcount][6]=d;
2085   stubs[stubcount][7]=e;
2086   stubcount++;
2087 }
2088
2089 // Write out a single register
2090 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2091 {
2092   int hr;
2093   for(hr=0;hr<HOST_REGS;hr++) {
2094     if(hr!=EXCLUDE_REG) {
2095       if((regmap[hr]&63)==r) {
2096         if((dirty>>hr)&1) {
2097           if(regmap[hr]<64) {
2098             emit_storereg(r,hr);
2099 #ifndef FORCE32
2100             if((is32>>regmap[hr])&1) {
2101               emit_sarimm(hr,31,hr);
2102               emit_storereg(r|64,hr);
2103             }
2104 #endif
2105           }else{
2106             emit_storereg(r|64,hr);
2107           }
2108         }
2109       }
2110     }
2111   }
2112 }
2113
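// Debug/trace helpers: mchecksum() hashes RDRAM, rchecksum() XORs the GPRs,
// rlist() dumps the register file and memdebug() prints trace output when
// enabled (see the commented-out calls in the load/store assemblers).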
2114 int mchecksum()
2115 {
2116   //if(!tracedebug) return 0;
2117   int i;
2118   int sum=0;
2119   for(i=0;i<2097152;i++) {
2120     unsigned int temp=sum;
2121     sum<<=1;
2122     sum|=(~temp)>>31;
2123     sum^=((u_int *)rdram)[i];
2124   }
2125   return sum;
2126 }
2127 int rchecksum()
2128 {
2129   int i;
2130   int sum=0;
2131   for(i=0;i<64;i++)
2132     sum^=((u_int *)reg)[i];
2133   return sum;
2134 }
2135 void rlist()
2136 {
2137   int i;
2138   printf("TRACE: ");
2139   for(i=0;i<32;i++)
2140     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2141   printf("\n");
2142 #ifndef DISABLE_COP1
2143   printf("TRACE: ");
2144   for(i=0;i<32;i++)
2145     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2146   printf("\n");
2147 #endif
2148 }
2149
2150 void enabletrace()
2151 {
2152   tracedebug=1;
2153 }
2154
2155 void memdebug(int i)
2156 {
2157   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2158   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2159   //rlist();
2160   //if(tracedebug) {
2161   //if(Count>=-2084597794) {
2162   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2163   //if(0) {
2164     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2165     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2166     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2167     rlist();
2168     #ifdef __i386__
2169     printf("TRACE: %x\n",(&i)[-1]);
2170     #endif
2171     #ifdef __arm__
2172     int j;
2173     printf("TRACE: %x \n",(&j)[10]);
2174     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2175     #endif
2176     //fflush(stdout);
2177   }
2178   //printf("TRACE: %x\n",(&i)[-1]);
2179 }
2180
2181 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2182 {
2183   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2184 }
2185
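// Emit code for R-type ALU instructions: ADD/ADDU/SUB/SUBU, their 64-bit
// DADD/DSUB forms (low/high host register pairs), SLT/SLTU and AND/OR/XOR/NOR.
// A missing source register (r0) is treated as the constant zero.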
2186 void alu_assemble(int i,struct regstat *i_regs)
2187 {
2188   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2189     if(rt1[i]) {
2190       signed char s1,s2,t;
2191       t=get_reg(i_regs->regmap,rt1[i]);
2192       if(t>=0) {
2193         s1=get_reg(i_regs->regmap,rs1[i]);
2194         s2=get_reg(i_regs->regmap,rs2[i]);
2195         if(rs1[i]&&rs2[i]) {
2196           assert(s1>=0);
2197           assert(s2>=0);
2198           if(opcode2[i]&2) emit_sub(s1,s2,t);
2199           else emit_add(s1,s2,t);
2200         }
2201         else if(rs1[i]) {
2202           if(s1>=0) emit_mov(s1,t);
2203           else emit_loadreg(rs1[i],t);
2204         }
2205         else if(rs2[i]) {
2206           if(s2>=0) {
2207             if(opcode2[i]&2) emit_neg(s2,t);
2208             else emit_mov(s2,t);
2209           }
2210           else {
2211             emit_loadreg(rs2[i],t);
2212             if(opcode2[i]&2) emit_neg(t,t);
2213           }
2214         }
2215         else emit_zeroreg(t);
2216       }
2217     }
2218   }
2219   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2220     if(rt1[i]) {
2221       signed char s1l,s2l,s1h,s2h,tl,th;
2222       tl=get_reg(i_regs->regmap,rt1[i]);
2223       th=get_reg(i_regs->regmap,rt1[i]|64);
2224       if(tl>=0) {
2225         s1l=get_reg(i_regs->regmap,rs1[i]);
2226         s2l=get_reg(i_regs->regmap,rs2[i]);
2227         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2228         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2229         if(rs1[i]&&rs2[i]) {
2230           assert(s1l>=0);
2231           assert(s2l>=0);
2232           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2233           else emit_adds(s1l,s2l,tl);
2234           if(th>=0) {
2235             #ifdef INVERTED_CARRY
2236             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2237             #else
2238             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2239             #endif
2240             else emit_adc(s1h,s2h,th); // use the carry from the low-word add
2241           }
2242         }
2243         else if(rs1[i]) {
2244           if(s1l>=0) emit_mov(s1l,tl);
2245           else emit_loadreg(rs1[i],tl);
2246           if(th>=0) {
2247             if(s1h>=0) emit_mov(s1h,th);
2248             else emit_loadreg(rs1[i]|64,th);
2249           }
2250         }
2251         else if(rs2[i]) {
2252           if(s2l>=0) {
2253             if(opcode2[i]&2) emit_negs(s2l,tl);
2254             else emit_mov(s2l,tl);
2255           }
2256           else {
2257             emit_loadreg(rs2[i],tl);
2258             if(opcode2[i]&2) emit_negs(tl,tl);
2259           }
2260           if(th>=0) {
2261             #ifdef INVERTED_CARRY
2262             if(s2h>=0) emit_mov(s2h,th);
2263             else emit_loadreg(rs2[i]|64,th);
2264             if(opcode2[i]&2) {
2265               emit_adcimm(-1,th); // x86 has inverted carry flag
2266               emit_not(th,th);
2267             }
2268             #else
2269             if(opcode2[i]&2) {
2270               if(s2h>=0) emit_rscimm(s2h,0,th);
2271               else {
2272                 emit_loadreg(rs2[i]|64,th);
2273                 emit_rscimm(th,0,th);
2274               }
2275             }else{
2276               if(s2h>=0) emit_mov(s2h,th);
2277               else emit_loadreg(rs2[i]|64,th);
2278             }
2279             #endif
2280           }
2281         }
2282         else {
2283           emit_zeroreg(tl);
2284           if(th>=0) emit_zeroreg(th);
2285         }
2286       }
2287     }
2288   }
2289   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2290     if(rt1[i]) {
2291       signed char s1l,s1h,s2l,s2h,t;
2292       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2293       {
2294         t=get_reg(i_regs->regmap,rt1[i]);
2295         //assert(t>=0);
2296         if(t>=0) {
2297           s1l=get_reg(i_regs->regmap,rs1[i]);
2298           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2299           s2l=get_reg(i_regs->regmap,rs2[i]);
2300           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2301           if(rs2[i]==0) // rx<r0
2302           {
2303             assert(s1h>=0);
2304             if(opcode2[i]==0x2a) // SLT
2305               emit_shrimm(s1h,31,t);
2306             else // SLTU (unsigned can not be less than zero)
2307               emit_zeroreg(t);
2308           }
2309           else if(rs1[i]==0) // r0<rx
2310           {
2311             assert(s2h>=0);
2312             if(opcode2[i]==0x2a) // SLT
2313               emit_set_gz64_32(s2h,s2l,t);
2314             else // SLTU (set if not zero)
2315               emit_set_nz64_32(s2h,s2l,t);
2316           }
2317           else {
2318             assert(s1l>=0);assert(s1h>=0);
2319             assert(s2l>=0);assert(s2h>=0);
2320             if(opcode2[i]==0x2a) // SLT
2321               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2322             else // SLTU
2323               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2324           }
2325         }
2326       } else {
2327         t=get_reg(i_regs->regmap,rt1[i]);
2328         //assert(t>=0);
2329         if(t>=0) {
2330           s1l=get_reg(i_regs->regmap,rs1[i]);
2331           s2l=get_reg(i_regs->regmap,rs2[i]);
2332           if(rs2[i]==0) // rx<r0
2333           {
2334             assert(s1l>=0);
2335             if(opcode2[i]==0x2a) // SLT
2336               emit_shrimm(s1l,31,t);
2337             else // SLTU (unsigned can not be less than zero)
2338               emit_zeroreg(t);
2339           }
2340           else if(rs1[i]==0) // r0<rx
2341           {
2342             assert(s2l>=0);
2343             if(opcode2[i]==0x2a) // SLT
2344               emit_set_gz32(s2l,t);
2345             else // SLTU (set if not zero)
2346               emit_set_nz32(s2l,t);
2347           }
2348           else{
2349             assert(s1l>=0);assert(s2l>=0);
2350             if(opcode2[i]==0x2a) // SLT
2351               emit_set_if_less32(s1l,s2l,t);
2352             else // SLTU
2353               emit_set_if_carry32(s1l,s2l,t);
2354           }
2355         }
2356       }
2357     }
2358   }
2359   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2360     if(rt1[i]) {
2361       signed char s1l,s1h,s2l,s2h,th,tl;
2362       tl=get_reg(i_regs->regmap,rt1[i]);
2363       th=get_reg(i_regs->regmap,rt1[i]|64);
2364       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2365       {
2366         assert(tl>=0);
2367         if(tl>=0) {
2368           s1l=get_reg(i_regs->regmap,rs1[i]);
2369           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2370           s2l=get_reg(i_regs->regmap,rs2[i]);
2371           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2372           if(rs1[i]&&rs2[i]) {
2373             assert(s1l>=0);assert(s1h>=0);
2374             assert(s2l>=0);assert(s2h>=0);
2375             if(opcode2[i]==0x24) { // AND
2376               emit_and(s1l,s2l,tl);
2377               emit_and(s1h,s2h,th);
2378             } else
2379             if(opcode2[i]==0x25) { // OR
2380               emit_or(s1l,s2l,tl);
2381               emit_or(s1h,s2h,th);
2382             } else
2383             if(opcode2[i]==0x26) { // XOR
2384               emit_xor(s1l,s2l,tl);
2385               emit_xor(s1h,s2h,th);
2386             } else
2387             if(opcode2[i]==0x27) { // NOR
2388               emit_or(s1l,s2l,tl);
2389               emit_or(s1h,s2h,th);
2390               emit_not(tl,tl);
2391               emit_not(th,th);
2392             }
2393           }
2394           else
2395           {
2396             if(opcode2[i]==0x24) { // AND
2397               emit_zeroreg(tl);
2398               emit_zeroreg(th);
2399             } else
2400             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2401               if(rs1[i]){
2402                 if(s1l>=0) emit_mov(s1l,tl);
2403                 else emit_loadreg(rs1[i],tl);
2404                 if(s1h>=0) emit_mov(s1h,th);
2405                 else emit_loadreg(rs1[i]|64,th);
2406               }
2407               else
2408               if(rs2[i]){
2409                 if(s2l>=0) emit_mov(s2l,tl);
2410                 else emit_loadreg(rs2[i],tl);
2411                 if(s2h>=0) emit_mov(s2h,th);
2412                 else emit_loadreg(rs2[i]|64,th);
2413               }
2414               else{
2415                 emit_zeroreg(tl);
2416                 emit_zeroreg(th);
2417               }
2418             } else
2419             if(opcode2[i]==0x27) { // NOR
2420               if(rs1[i]){
2421                 if(s1l>=0) emit_not(s1l,tl);
2422                 else{
2423                   emit_loadreg(rs1[i],tl);
2424                   emit_not(tl,tl);
2425                 }
2426                 if(s1h>=0) emit_not(s1h,th);
2427                 else{
2428                   emit_loadreg(rs1[i]|64,th);
2429                   emit_not(th,th);
2430                 }
2431               }
2432               else
2433               if(rs2[i]){
2434                 if(s2l>=0) emit_not(s2l,tl);
2435                 else{
2436                   emit_loadreg(rs2[i],tl);
2437                   emit_not(tl,tl);
2438                 }
2439                 if(s2h>=0) emit_not(s2h,th);
2440                 else{
2441                   emit_loadreg(rs2[i]|64,th);
2442                   emit_not(th,th);
2443                 }
2444               }
2445               else {
2446                 emit_movimm(-1,tl);
2447                 emit_movimm(-1,th);
2448               }
2449             }
2450           }
2451         }
2452       }
2453       else
2454       {
2455         // 32 bit
2456         if(tl>=0) {
2457           s1l=get_reg(i_regs->regmap,rs1[i]);
2458           s2l=get_reg(i_regs->regmap,rs2[i]);
2459           if(rs1[i]&&rs2[i]) {
2460             assert(s1l>=0);
2461             assert(s2l>=0);
2462             if(opcode2[i]==0x24) { // AND
2463               emit_and(s1l,s2l,tl);
2464             } else
2465             if(opcode2[i]==0x25) { // OR
2466               emit_or(s1l,s2l,tl);
2467             } else
2468             if(opcode2[i]==0x26) { // XOR
2469               emit_xor(s1l,s2l,tl);
2470             } else
2471             if(opcode2[i]==0x27) { // NOR
2472               emit_or(s1l,s2l,tl);
2473               emit_not(tl,tl);
2474             }
2475           }
2476           else
2477           {
2478             if(opcode2[i]==0x24) { // AND
2479               emit_zeroreg(tl);
2480             } else
2481             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2482               if(rs1[i]){
2483                 if(s1l>=0) emit_mov(s1l,tl);
2484                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2485               }
2486               else
2487               if(rs2[i]){
2488                 if(s2l>=0) emit_mov(s2l,tl);
2489                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2490               }
2491               else emit_zeroreg(tl);
2492             } else
2493             if(opcode2[i]==0x27) { // NOR
2494               if(rs1[i]){
2495                 if(s1l>=0) emit_not(s1l,tl);
2496                 else {
2497                   emit_loadreg(rs1[i],tl);
2498                   emit_not(tl,tl);
2499                 }
2500               }
2501               else
2502               if(rs2[i]){
2503                 if(s2l>=0) emit_not(s2l,tl);
2504                 else {
2505                   emit_loadreg(rs2[i],tl);
2506                   emit_not(tl,tl);
2507                 }
2508               }
2509               else emit_movimm(-1,tl);
2510             }
2511           }
2512         }
2513       }
2514     }
2515   }
2516 }
2517
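// Emit code for immediate-type instructions (LUI, ADDI(U), DADDI(U), SLTI(U),
// ANDI/ORI/XORI). Where the source is a known constant, the result is
// computed at translation time via constmap.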
2518 void imm16_assemble(int i,struct regstat *i_regs)
2519 {
2520   if (opcode[i]==0x0f) { // LUI
2521     if(rt1[i]) {
2522       signed char t;
2523       t=get_reg(i_regs->regmap,rt1[i]);
2524       //assert(t>=0);
2525       if(t>=0) {
2526         if(!((i_regs->isconst>>t)&1))
2527           emit_movimm(imm[i]<<16,t);
2528       }
2529     }
2530   }
2531   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2532     if(rt1[i]) {
2533       signed char s,t;
2534       t=get_reg(i_regs->regmap,rt1[i]);
2535       s=get_reg(i_regs->regmap,rs1[i]);
2536       if(rs1[i]) {
2537         //assert(t>=0);
2538         //assert(s>=0);
2539         if(t>=0) {
2540           if(!((i_regs->isconst>>t)&1)) {
2541             if(s<0) {
2542               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2543               emit_addimm(t,imm[i],t);
2544             }else{
2545               if(!((i_regs->wasconst>>s)&1))
2546                 emit_addimm(s,imm[i],t);
2547               else
2548                 emit_movimm(constmap[i][s]+imm[i],t);
2549             }
2550           }
2551         }
2552       } else {
2553         if(t>=0) {
2554           if(!((i_regs->isconst>>t)&1))
2555             emit_movimm(imm[i],t);
2556         }
2557       }
2558     }
2559   }
2560   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2561     if(rt1[i]) {
2562       signed char sh,sl,th,tl;
2563       th=get_reg(i_regs->regmap,rt1[i]|64);
2564       tl=get_reg(i_regs->regmap,rt1[i]);
2565       sh=get_reg(i_regs->regmap,rs1[i]|64);
2566       sl=get_reg(i_regs->regmap,rs1[i]);
2567       if(tl>=0) {
2568         if(rs1[i]) {
2569           assert(sh>=0);
2570           assert(sl>=0);
2571           if(th>=0) {
2572             emit_addimm64_32(sh,sl,imm[i],th,tl);
2573           }
2574           else {
2575             emit_addimm(sl,imm[i],tl);
2576           }
2577         } else {
2578           emit_movimm(imm[i],tl);
2579           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2580         }
2581       }
2582     }
2583   }
2584   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2585     if(rt1[i]) {
2586       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2587       signed char sh,sl,t;
2588       t=get_reg(i_regs->regmap,rt1[i]);
2589       sh=get_reg(i_regs->regmap,rs1[i]|64);
2590       sl=get_reg(i_regs->regmap,rs1[i]);
2591       //assert(t>=0);
2592       if(t>=0) {
2593         if(rs1[i]>0) {
2594           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2595           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2596             if(opcode[i]==0x0a) { // SLTI
2597               if(sl<0) {
2598                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2599                 emit_slti32(t,imm[i],t);
2600               }else{
2601                 emit_slti32(sl,imm[i],t);
2602               }
2603             }
2604             else { // SLTIU
2605               if(sl<0) {
2606                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2607                 emit_sltiu32(t,imm[i],t);
2608               }else{
2609                 emit_sltiu32(sl,imm[i],t);
2610               }
2611             }
2612           }else{ // 64-bit
2613             assert(sl>=0);
2614             if(opcode[i]==0x0a) // SLTI
2615               emit_slti64_32(sh,sl,imm[i],t);
2616             else // SLTIU
2617               emit_sltiu64_32(sh,sl,imm[i],t);
2618           }
2619         }else{
2620           // SLTI(U) with r0 is just stupid,
2621           // nonetheless examples can be found
2622           if(opcode[i]==0x0a) { // SLTI
2623             if(0<imm[i]) emit_movimm(1,t);
2624             else emit_zeroreg(t);
2625           } else // SLTIU
2626           {
2627             if(imm[i]) emit_movimm(1,t);
2628             else emit_zeroreg(t);
2629           }
2630         }
2631       }
2632     }
2633   }
2634   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2635     if(rt1[i]) {
2636       signed char sh,sl,th,tl;
2637       th=get_reg(i_regs->regmap,rt1[i]|64);
2638       tl=get_reg(i_regs->regmap,rt1[i]);
2639       sh=get_reg(i_regs->regmap,rs1[i]|64);
2640       sl=get_reg(i_regs->regmap,rs1[i]);
2641       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2642         if(opcode[i]==0x0c) //ANDI
2643         {
2644           if(rs1[i]) {
2645             if(sl<0) {
2646               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2647               emit_andimm(tl,imm[i],tl);
2648             }else{
2649               if(!((i_regs->wasconst>>sl)&1))
2650                 emit_andimm(sl,imm[i],tl);
2651               else
2652                 emit_movimm(constmap[i][sl]&imm[i],tl);
2653             }
2654           }
2655           else
2656             emit_zeroreg(tl);
2657           if(th>=0) emit_zeroreg(th);
2658         }
2659         else
2660         {
2661           if(rs1[i]) {
2662             if(sl<0) {
2663               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2664             }
2665             if(th>=0) {
2666               if(sh<0) {
2667                 emit_loadreg(rs1[i]|64,th);
2668               }else{
2669                 emit_mov(sh,th);
2670               }
2671             }
2672             if(opcode[i]==0x0d) //ORI
2673             if(sl<0) {
2674               emit_orimm(tl,imm[i],tl);
2675             }else{
2676               if(!((i_regs->wasconst>>sl)&1))
2677                 emit_orimm(sl,imm[i],tl);
2678               else
2679                 emit_movimm(constmap[i][sl]|imm[i],tl);
2680             }
2681             if(opcode[i]==0x0e) //XORI
2682             if(sl<0) {
2683               emit_xorimm(tl,imm[i],tl);
2684             }else{
2685               if(!((i_regs->wasconst>>sl)&1))
2686                 emit_xorimm(sl,imm[i],tl);
2687               else
2688                 emit_movimm(constmap[i][sl]^imm[i],tl);
2689             }
2690           }
2691           else {
2692             emit_movimm(imm[i],tl);
2693             if(th>=0) emit_zeroreg(th);
2694           }
2695         }
2696       }
2697     }
2698   }
2699 }
2700
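// Emit code for shift-by-immediate instructions: SLL/SRL/SRA plus the
// doubleword DSLL/DSRL/DSRA and their 32+ variants, which operate on a
// low/high host register pair.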
2701 void shiftimm_assemble(int i,struct regstat *i_regs)
2702 {
2703   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2704   {
2705     if(rt1[i]) {
2706       signed char s,t;
2707       t=get_reg(i_regs->regmap,rt1[i]);
2708       s=get_reg(i_regs->regmap,rs1[i]);
2709       //assert(t>=0);
2710       if(t>=0&&!((i_regs->isconst>>t)&1)){
2711         if(rs1[i]==0)
2712         {
2713           emit_zeroreg(t);
2714         }
2715         else
2716         {
2717           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2718           if(imm[i]) {
2719             if(opcode2[i]==0) // SLL
2720             {
2721               emit_shlimm(s<0?t:s,imm[i],t);
2722             }
2723             if(opcode2[i]==2) // SRL
2724             {
2725               emit_shrimm(s<0?t:s,imm[i],t);
2726             }
2727             if(opcode2[i]==3) // SRA
2728             {
2729               emit_sarimm(s<0?t:s,imm[i],t);
2730             }
2731           }else{
2732             // Shift by zero
2733             if(s>=0 && s!=t) emit_mov(s,t);
2734           }
2735         }
2736       }
2737       //emit_storereg(rt1[i],t); //DEBUG
2738     }
2739   }
2740   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2741   {
2742     if(rt1[i]) {
2743       signed char sh,sl,th,tl;
2744       th=get_reg(i_regs->regmap,rt1[i]|64);
2745       tl=get_reg(i_regs->regmap,rt1[i]);
2746       sh=get_reg(i_regs->regmap,rs1[i]|64);
2747       sl=get_reg(i_regs->regmap,rs1[i]);
2748       if(tl>=0) {
2749         if(rs1[i]==0)
2750         {
2751           emit_zeroreg(tl);
2752           if(th>=0) emit_zeroreg(th);
2753         }
2754         else
2755         {
2756           assert(sl>=0);
2757           assert(sh>=0);
2758           if(imm[i]) {
2759             if(opcode2[i]==0x38) // DSLL
2760             {
2761               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2762               emit_shlimm(sl,imm[i],tl);
2763             }
2764             if(opcode2[i]==0x3a) // DSRL
2765             {
2766               emit_shrdimm(sl,sh,imm[i],tl);
2767               if(th>=0) emit_shrimm(sh,imm[i],th);
2768             }
2769             if(opcode2[i]==0x3b) // DSRA
2770             {
2771               emit_shrdimm(sl,sh,imm[i],tl);
2772               if(th>=0) emit_sarimm(sh,imm[i],th);
2773             }
2774           }else{
2775             // Shift by zero
2776             if(sl!=tl) emit_mov(sl,tl);
2777             if(th>=0&&sh!=th) emit_mov(sh,th);
2778           }
2779         }
2780       }
2781     }
2782   }
2783   if(opcode2[i]==0x3c) // DSLL32
2784   {
2785     if(rt1[i]) {
2786       signed char sl,tl,th;
2787       tl=get_reg(i_regs->regmap,rt1[i]);
2788       th=get_reg(i_regs->regmap,rt1[i]|64);
2789       sl=get_reg(i_regs->regmap,rs1[i]);
2790       if(th>=0||tl>=0){
2791         assert(tl>=0);
2792         assert(th>=0);
2793         assert(sl>=0);
2794         emit_mov(sl,th);
2795         emit_zeroreg(tl);
2796         if(imm[i]>32)
2797         {
2798           emit_shlimm(th,imm[i]&31,th);
2799         }
2800       }
2801     }
2802   }
2803   if(opcode2[i]==0x3e) // DSRL32
2804   {
2805     if(rt1[i]) {
2806       signed char sh,tl,th;
2807       tl=get_reg(i_regs->regmap,rt1[i]);
2808       th=get_reg(i_regs->regmap,rt1[i]|64);
2809       sh=get_reg(i_regs->regmap,rs1[i]|64);
2810       if(tl>=0){
2811         assert(sh>=0);
2812         emit_mov(sh,tl);
2813         if(th>=0) emit_zeroreg(th);
2814         if(imm[i]>32)
2815         {
2816           emit_shrimm(tl,imm[i]&31,tl);
2817         }
2818       }
2819     }
2820   }
2821   if(opcode2[i]==0x3f) // DSRA32
2822   {
2823     if(rt1[i]) {
2824       signed char sh,tl;
2825       tl=get_reg(i_regs->regmap,rt1[i]);
2826       sh=get_reg(i_regs->regmap,rs1[i]|64);
2827       if(tl>=0){
2828         assert(sh>=0);
2829         emit_mov(sh,tl);
2830         if(imm[i]>32)
2831         {
2832           emit_sarimm(tl,imm[i]&31,tl);
2833         }
2834       }
2835     }
2836   }
2837 }
2838
2839 #ifndef shift_assemble
2840 void shift_assemble(int i,struct regstat *i_regs)
2841 {
2842   printf("Need shift_assemble for this architecture.\n");
2843   exit(1);
2844 }
2845 #endif
2846
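// Emit code for aligned loads (LB/LBU/LH/LHU/LW/LWU/LD). The fast path reads
// RAM directly (or through the TLB mapping register); accesses that may fall
// outside RAM branch to a stub, or use inline_readstub when the address is a
// known constant outside the RAM range.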
2847 void load_assemble(int i,struct regstat *i_regs)
2848 {
2849   int s,th,tl,addr,map=-1;
2850   int offset;
2851   int jaddr=0;
2852   int memtarget=0,c=0;
2853   int fastload_reg_override=0;
2854   u_int hr,reglist=0;
2855   th=get_reg(i_regs->regmap,rt1[i]|64);
2856   tl=get_reg(i_regs->regmap,rt1[i]);
2857   s=get_reg(i_regs->regmap,rs1[i]);
2858   offset=imm[i];
2859   for(hr=0;hr<HOST_REGS;hr++) {
2860     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2861   }
2862   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2863   if(s>=0) {
2864     c=(i_regs->wasconst>>s)&1;
2865     if (c) {
2866       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2867       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2868     }
2869   }
2870   //printf("load_assemble: c=%d\n",c);
2871   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2872   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2873 #ifdef PCSX
2874   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2875     ||rt1[i]==0) {
2876       // could be FIFO, must perform the read
2877       // ||dummy read
2878       assem_debug("(forced read)\n");
2879       tl=get_reg(i_regs->regmap,-1);
2880       assert(tl>=0);
2881   }
2882 #endif
2883   if(offset||s<0||c) addr=tl;
2884   else addr=s;
2885   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2886  if(tl>=0) {
2887   //printf("load_assemble: c=%d\n",c);
2888   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2889   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2890   reglist&=~(1<<tl);
2891   if(th>=0) reglist&=~(1<<th);
2892   if(!using_tlb) {
2893     if(!c) {
2894       #ifdef RAM_OFFSET
2895       map=get_reg(i_regs->regmap,ROREG);
2896       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2897       #endif
2898 //#define R29_HACK 1
2899       #ifdef R29_HACK
2900       // Strmnnrmn's speed hack
2901       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2902       #endif
2903       {
2904         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2905       }
2906     }
2907   }else{ // using tlb
2908     int x=0;
2909     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2910     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2911     map=get_reg(i_regs->regmap,TLREG);
2912     assert(map>=0);
2913     reglist&=~(1<<map);
2914     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2915     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2916   }
2917   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2918   if (opcode[i]==0x20) { // LB
2919     if(!c||memtarget) {
2920       if(!dummy) {
2921         #ifdef HOST_IMM_ADDR32
2922         if(c)
2923           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2924         else
2925         #endif
2926         {
2927           //emit_xorimm(addr,3,tl);
2928           //gen_tlb_addr_r(tl,map);
2929           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2930           int x=0,a=tl;
2931 #ifdef BIG_ENDIAN_MIPS
2932           if(!c) emit_xorimm(addr,3,tl);
2933           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2934 #else
2935           if(!c) a=addr;
2936 #endif
2937           if(fastload_reg_override) a=fastload_reg_override;
2938
2939           emit_movsbl_indexed_tlb(x,a,map,tl);
2940         }
2941       }
2942       if(jaddr)
2943         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2944     }
2945     else
2946       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2947   }
2948   if (opcode[i]==0x21) { // LH
2949     if(!c||memtarget) {
2950       if(!dummy) {
2951         #ifdef HOST_IMM_ADDR32
2952         if(c)
2953           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2954         else
2955         #endif
2956         {
2957           int x=0,a=tl;
2958 #ifdef BIG_ENDIAN_MIPS
2959           if(!c) emit_xorimm(addr,2,tl);
2960           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2961 #else
2962           if(!c) a=addr;
2963 #endif
2964           if(fastload_reg_override) a=fastload_reg_override;
2965           //#ifdef
2966           //emit_movswl_indexed_tlb(x,tl,map,tl);
2967           //else
2968           if(map>=0) {
2969             gen_tlb_addr_r(a,map);
2970             emit_movswl_indexed(x,a,tl);
2971           }else{
2972             #ifdef RAM_OFFSET
2973             emit_movswl_indexed(x,a,tl);
2974             #else
2975             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2976             #endif
2977           }
2978         }
2979       }
2980       if(jaddr)
2981         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2982     }
2983     else
2984       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2985   }
2986   if (opcode[i]==0x23) { // LW
2987     if(!c||memtarget) {
2988       if(!dummy) {
2989         int a=addr;
2990         if(fastload_reg_override) a=fastload_reg_override;
2991         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2992         #ifdef HOST_IMM_ADDR32
2993         if(c)
2994           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2995         else
2996         #endif
2997         emit_readword_indexed_tlb(0,a,map,tl);
2998       }
2999       if(jaddr)
3000         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3001     }
3002     else
3003       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3004   }
3005   if (opcode[i]==0x24) { // LBU
3006     if(!c||memtarget) {
3007       if(!dummy) {
3008         #ifdef HOST_IMM_ADDR32
3009         if(c)
3010           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
3011         else
3012         #endif
3013         {
3014           //emit_xorimm(addr,3,tl);
3015           //gen_tlb_addr_r(tl,map);
3016           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
3017           int x=0,a=tl;
3018 #ifdef BIG_ENDIAN_MIPS
3019           if(!c) emit_xorimm(addr,3,tl);
3020           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3021 #else
3022           if(!c) a=addr;
3023 #endif
3024           if(fastload_reg_override) a=fastload_reg_override;
3025
3026           emit_movzbl_indexed_tlb(x,a,map,tl);
3027         }
3028       }
3029       if(jaddr)
3030         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3031     }
3032     else
3033       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3034   }
3035   if (opcode[i]==0x25) { // LHU
3036     if(!c||memtarget) {
3037       if(!dummy) {
3038         #ifdef HOST_IMM_ADDR32
3039         if(c)
3040           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3041         else
3042         #endif
3043         {
3044           int x=0,a=tl;
3045 #ifdef BIG_ENDIAN_MIPS
3046           if(!c) emit_xorimm(addr,2,tl);
3047           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3048 #else
3049           if(!c) a=addr;
3050 #endif
3051           if(fastload_reg_override) a=fastload_reg_override;
3052           //#ifdef
3053           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3054           //#else
3055           if(map>=0) {
3056             gen_tlb_addr_r(a,map);
3057             emit_movzwl_indexed(x,a,tl);
3058           }else{
3059             #ifdef RAM_OFFSET
3060             emit_movzwl_indexed(x,a,tl);
3061             #else
3062             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3063             #endif
3064           }
3065         }
3066       }
3067       if(jaddr)
3068         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3069     }
3070     else
3071       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3072   }
3073   if (opcode[i]==0x27) { // LWU
3074     assert(th>=0);
3075     if(!c||memtarget) {
3076       if(!dummy) {
3077         int a=addr;
3078         if(fastload_reg_override) a=fastload_reg_override;
3079         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3080         #ifdef HOST_IMM_ADDR32
3081         if(c)
3082           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3083         else
3084         #endif
3085         emit_readword_indexed_tlb(0,a,map,tl);
3086       }
3087       if(jaddr)
3088         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3089     }
3090     else {
3091       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3092     }
3093     emit_zeroreg(th);
3094   }
3095   if (opcode[i]==0x37) { // LD
3096     if(!c||memtarget) {
3097       if(!dummy) {
3098         int a=addr;
3099         if(fastload_reg_override) a=fastload_reg_override;
3100         //gen_tlb_addr_r(tl,map);
3101         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3102         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3103         #ifdef HOST_IMM_ADDR32
3104         if(c)
3105           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3106         else
3107         #endif
3108         emit_readdword_indexed_tlb(0,a,map,th,tl);
3109       }
3110       if(jaddr)
3111         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3112     }
3113     else
3114       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3115   }
3116  }
3117   //emit_storereg(rt1[i],tl); // DEBUG
3118   //if(opcode[i]==0x23)
3119   //if(opcode[i]==0x24)
3120   //if(opcode[i]==0x23||opcode[i]==0x24)
3121   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3122   {
3123     //emit_pusha();
3124     save_regs(0x100f);
3125         emit_readword((int)&last_count,ECX);
3126         #ifdef __i386__
3127         if(get_reg(i_regs->regmap,CCREG)<0)
3128           emit_loadreg(CCREG,HOST_CCREG);
3129         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3130         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3131         emit_writeword(HOST_CCREG,(int)&Count);
3132         #endif
3133         #ifdef __arm__
3134         if(get_reg(i_regs->regmap,CCREG)<0)
3135           emit_loadreg(CCREG,0);
3136         else
3137           emit_mov(HOST_CCREG,0);
3138         emit_add(0,ECX,0);
3139         emit_addimm(0,2*ccadj[i],0);
3140         emit_writeword(0,(int)&Count);
3141         #endif
3142     emit_call((int)memdebug);
3143     //emit_popa();
3144     restore_regs(0x100f);
3145   }/**/
3146 }
3147
3148 #ifndef loadlr_assemble
3149 void loadlr_assemble(int i,struct regstat *i_regs)
3150 {
3151   printf("Need loadlr_assemble for this architecture.\n");
3152   exit(1);
3153 }
3154 #endif
3155
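// Emit code for aligned stores (SB/SH/SW/SD). Stores that hit RAM also check
// invalid_code so any previously compiled block covering the written address
// gets invalidated; other addresses go through a store stub.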
3156 void store_assemble(int i,struct regstat *i_regs)
3157 {
3158   int s,th,tl,map=-1;
3159   int addr,temp;
3160   int offset;
3161   int jaddr=0,jaddr2,type;
3162   int memtarget=0,c=0;
3163   int agr=AGEN1+(i&1);
3164   int faststore_reg_override=0;
3165   u_int hr,reglist=0;
3166   th=get_reg(i_regs->regmap,rs2[i]|64);
3167   tl=get_reg(i_regs->regmap,rs2[i]);
3168   s=get_reg(i_regs->regmap,rs1[i]);
3169   temp=get_reg(i_regs->regmap,agr);
3170   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3171   offset=imm[i];
3172   if(s>=0) {
3173     c=(i_regs->wasconst>>s)&1;
3174     if(c) {
3175       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3176       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3177     }
3178   }
3179   assert(tl>=0);
3180   assert(temp>=0);
3181   for(hr=0;hr<HOST_REGS;hr++) {
3182     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3183   }
3184   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3185   if(offset||s<0||c) addr=temp;
3186   else addr=s;
3187   if(!using_tlb) {
3188     if(!c) {
3189       #ifndef PCSX
3190       #ifdef R29_HACK
3191       // Strmnnrmn's speed hack
3192       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3193       #endif
3194       emit_cmpimm(addr,RAM_SIZE);
3195       #ifdef DESTRUCTIVE_SHIFT
3196       if(s==addr) emit_mov(s,temp);
3197       #endif
3198       #ifdef R29_HACK
3199       memtarget=1;
3200       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3201       #endif
3202       {
3203         jaddr=(int)out;
3204         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3205         // Hint to branch predictor that the branch is unlikely to be taken
3206         if(rs1[i]>=28)
3207           emit_jno_unlikely(0);
3208         else
3209         #endif
3210         emit_jno(0);
3211       }
3212       #else
3213         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3214       #endif
3215     }
3216   }else{ // using tlb
3217     int x=0;
3218     if (opcode[i]==0x28) x=3; // SB
3219     if (opcode[i]==0x29) x=2; // SH
3220     map=get_reg(i_regs->regmap,TLREG);
3221     assert(map>=0);
3222     reglist&=~(1<<map);
3223     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3224     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3225   }
3226
3227   if (opcode[i]==0x28) { // SB
3228     if(!c||memtarget) {
3229       int x=0,a=temp;
3230 #ifdef BIG_ENDIAN_MIPS
3231       if(!c) emit_xorimm(addr,3,temp);
3232       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3233 #else
3234       if(!c) a=addr;
3235 #endif
3236       if(faststore_reg_override) a=faststore_reg_override;
3237       //gen_tlb_addr_w(temp,map);
3238       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3239       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3240     }
3241     type=STOREB_STUB;
3242   }
3243   if (opcode[i]==0x29) { // SH
3244     if(!c||memtarget) {
3245       int x=0,a=temp;
3246 #ifdef BIG_ENDIAN_MIPS
3247       if(!c) emit_xorimm(addr,2,temp);
3248       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3249 #else
3250       if(!c) a=addr;
3251 #endif
3252       if(faststore_reg_override) a=faststore_reg_override;
3253       //#ifdef
3254       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3255       //#else
3256       if(map>=0) {
3257         gen_tlb_addr_w(a,map);
3258         emit_writehword_indexed(tl,x,a);
3259       }else
3260         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3261     }
3262     type=STOREH_STUB;
3263   }
3264   if (opcode[i]==0x2B) { // SW
3265     if(!c||memtarget) {
3266       int a=addr;
3267       if(faststore_reg_override) a=faststore_reg_override;
3268       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3269       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3270     }
3271     type=STOREW_STUB;
3272   }
3273   if (opcode[i]==0x3F) { // SD
3274     if(!c||memtarget) {
3275       int a=addr;
3276       if(faststore_reg_override) a=faststore_reg_override;
3277       if(rs2[i]) {
3278         assert(th>=0);
3279         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3280         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3281         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3282       }else{
3283         // Store zero
3284         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3285         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3286         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3287       }
3288     }
3289     type=STORED_STUB;
3290   }
3291 #ifdef PCSX
3292   if(jaddr) {
3293     // PCSX store handlers don't check invcode again
3294     reglist|=1<<addr;
3295     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3296     jaddr=0;
3297   }
3298 #endif
3299   if(!using_tlb) {
3300     if(!c||memtarget) {
3301       #ifdef DESTRUCTIVE_SHIFT
3302       // The x86 shift operation is 'destructive'; it overwrites the
3303       // source register, so we need to make a copy first and use that.
3304       addr=temp;
3305       #endif
3306       #if defined(HOST_IMM8)
3307       int ir=get_reg(i_regs->regmap,INVCP);
3308       assert(ir>=0);
3309       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3310       #else
3311       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3312       #endif
3313       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3314       emit_callne(invalidate_addr_reg[addr]);
3315       #else
3316       jaddr2=(int)out;
3317       emit_jne(0);
3318       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3319       #endif
3320     }
3321   }
3322   if(jaddr) {
3323     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3324   } else if(c&&!memtarget) {
3325     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3326   }
3327   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3328   //if(opcode[i]==0x2B || opcode[i]==0x28)
3329   //if(opcode[i]==0x2B || opcode[i]==0x29)
3330   //if(opcode[i]==0x2B)
3331   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3332   {
3333     #ifdef __i386__
3334     emit_pusha();
3335     #endif
3336     #ifdef __arm__
3337     save_regs(0x100f);
3338     #endif
3339         emit_readword((int)&last_count,ECX);
3340         #ifdef __i386__
3341         if(get_reg(i_regs->regmap,CCREG)<0)
3342           emit_loadreg(CCREG,HOST_CCREG);
3343         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3344         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3345         emit_writeword(HOST_CCREG,(int)&Count);
3346         #endif
3347         #ifdef __arm__
3348         if(get_reg(i_regs->regmap,CCREG)<0)
3349           emit_loadreg(CCREG,0);
3350         else
3351           emit_mov(HOST_CCREG,0);
3352         emit_add(0,ECX,0);
3353         emit_addimm(0,2*ccadj[i],0);
3354         emit_writeword(0,(int)&Count);
3355         #endif
3356     emit_call((int)memdebug);
3357     #ifdef __i386__
3358     emit_popa();
3359     #endif
3360     #ifdef __arm__
3361     restore_regs(0x100f);
3362     #endif
3363   }/**/
3364 }
3365
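// Emit code for unaligned stores (SWL/SWR/SDL/SDR). The two low address bits
// select one of four alignment cases; each case writes the appropriate
// partial word, rotating the source data into place as needed.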
3366 void storelr_assemble(int i,struct regstat *i_regs)
3367 {
3368   int s,th,tl;
3369   int temp;
3370   int temp2;
3371   int offset;
3372   int jaddr=0,jaddr2;
3373   int case1,case2,case3;
3374   int done0,done1,done2;
3375   int memtarget=0,c=0;
3376   int agr=AGEN1+(i&1);
3377   u_int hr,reglist=0;
3378   th=get_reg(i_regs->regmap,rs2[i]|64);
3379   tl=get_reg(i_regs->regmap,rs2[i]);
3380   s=get_reg(i_regs->regmap,rs1[i]);
3381   temp=get_reg(i_regs->regmap,agr);
3382   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3383   offset=imm[i];
3384   if(s>=0) {
3385     c=(i_regs->isconst>>s)&1;
3386     if(c) {
3387       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3388       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3389     }
3390   }
3391   assert(tl>=0);
3392   for(hr=0;hr<HOST_REGS;hr++) {
3393     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3394   }
3395   assert(temp>=0);
3396   if(!using_tlb) {
3397     if(!c) {
3398       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3399       if(!offset&&s!=temp) emit_mov(s,temp);
3400       jaddr=(int)out;
3401       emit_jno(0);
3402     }
3403     else
3404     {
3405       if(!memtarget||!rs1[i]) {
3406         jaddr=(int)out;
3407         emit_jmp(0);
3408       }
3409     }
3410     #ifdef RAM_OFFSET
3411     int map=get_reg(i_regs->regmap,ROREG);
3412     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3413     gen_tlb_addr_w(temp,map);
3414     #else
3415     if((u_int)rdram!=0x80000000) 
3416       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3417     #endif
3418   }else{ // using tlb
3419     int map=get_reg(i_regs->regmap,TLREG);
3420     assert(map>=0);
3421     reglist&=~(1<<map);
3422     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3423     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3424     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3425     if(!jaddr&&!memtarget) {
3426       jaddr=(int)out;
3427       emit_jmp(0);
3428     }
3429     gen_tlb_addr_w(temp,map);
3430   }
3431
3432   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3433     temp2=get_reg(i_regs->regmap,FTEMP);
3434     if(!rs2[i]) temp2=th=tl;
3435   }
3436
3437 #ifndef BIG_ENDIAN_MIPS
3438     emit_xorimm(temp,3,temp);
3439 #endif
3440   emit_testimm(temp,2);
3441   case2=(int)out;
3442   emit_jne(0);
3443   emit_testimm(temp,1);
3444   case1=(int)out;
3445   emit_jne(0);
3446   // 0
3447   if (opcode[i]==0x2A) { // SWL
3448     emit_writeword_indexed(tl,0,temp);
3449   }
3450   if (opcode[i]==0x2E) { // SWR
3451     emit_writebyte_indexed(tl,3,temp);
3452   }
3453   if (opcode[i]==0x2C) { // SDL
3454     emit_writeword_indexed(th,0,temp);
3455     if(rs2[i]) emit_mov(tl,temp2);
3456   }
3457   if (opcode[i]==0x2D) { // SDR
3458     emit_writebyte_indexed(tl,3,temp);
3459     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3460   }
3461   done0=(int)out;
3462   emit_jmp(0);
3463   // 1
3464   set_jump_target(case1,(int)out);
3465   if (opcode[i]==0x2A) { // SWL
3466     // Write 3 msb into three least significant bytes
3467     if(rs2[i]) emit_rorimm(tl,8,tl);
3468     emit_writehword_indexed(tl,-1,temp);
3469     if(rs2[i]) emit_rorimm(tl,16,tl);
3470     emit_writebyte_indexed(tl,1,temp);
3471     if(rs2[i]) emit_rorimm(tl,8,tl);
3472   }
3473   if (opcode[i]==0x2E) { // SWR
3474     // Write two lsb into two most significant bytes
3475     emit_writehword_indexed(tl,1,temp);
3476   }
3477   if (opcode[i]==0x2C) { // SDL
3478     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3479     // Write 3 msb into three least significant bytes
3480     if(rs2[i]) emit_rorimm(th,8,th);
3481     emit_writehword_indexed(th,-1,temp);
3482     if(rs2[i]) emit_rorimm(th,16,th);
3483     emit_writebyte_indexed(th,1,temp);
3484     if(rs2[i]) emit_rorimm(th,8,th);
3485   }
3486   if (opcode[i]==0x2D) { // SDR
3487     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3488     // Write two lsb into two most significant bytes
3489     emit_writehword_indexed(tl,1,temp);
3490   }
3491   done1=(int)out;
3492   emit_jmp(0);
3493   // 2
3494   set_jump_target(case2,(int)out);
3495   emit_testimm(temp,1);
3496   case3=(int)out;
3497   emit_jne(0);
3498   if (opcode[i]==0x2A) { // SWL
3499     // Write two msb into two least significant bytes
3500     if(rs2[i]) emit_rorimm(tl,16,tl);
3501     emit_writehword_indexed(tl,-2,temp);
3502     if(rs2[i]) emit_rorimm(tl,16,tl);
3503   }
3504   if (opcode[i]==0x2E) { // SWR
3505     // Write 3 lsb into three most significant bytes
3506     emit_writebyte_indexed(tl,-1,temp);
3507     if(rs2[i]) emit_rorimm(tl,8,tl);
3508     emit_writehword_indexed(tl,0,temp);
3509     if(rs2[i]) emit_rorimm(tl,24,tl);
3510   }
3511   if (opcode[i]==0x2C) { // SDL
3512     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3513     // Write two msb into two least significant bytes
3514     if(rs2[i]) emit_rorimm(th,16,th);
3515     emit_writehword_indexed(th,-2,temp);
3516     if(rs2[i]) emit_rorimm(th,16,th);
3517   }
3518   if (opcode[i]==0x2D) { // SDR
3519     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3520     // Write 3 lsb into three most significant bytes
3521     emit_writebyte_indexed(tl,-1,temp);
3522     if(rs2[i]) emit_rorimm(tl,8,tl);
3523     emit_writehword_indexed(tl,0,temp);
3524     if(rs2[i]) emit_rorimm(tl,24,tl);
3525   }
3526   done2=(int)out;
3527   emit_jmp(0);
3528   // 3
3529   set_jump_target(case3,(int)out);
3530   if (opcode[i]==0x2A) { // SWL
3531     // Write msb into least significant byte
3532     if(rs2[i]) emit_rorimm(tl,24,tl);
3533     emit_writebyte_indexed(tl,-3,temp);
3534     if(rs2[i]) emit_rorimm(tl,8,tl);
3535   }
3536   if (opcode[i]==0x2E) { // SWR
3537     // Write entire word
3538     emit_writeword_indexed(tl,-3,temp);
3539   }
3540   if (opcode[i]==0x2C) { // SDL
3541     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3542     // Write msb into least significant byte
3543     if(rs2[i]) emit_rorimm(th,24,th);
3544     emit_writebyte_indexed(th,-3,temp);
3545     if(rs2[i]) emit_rorimm(th,8,th);
3546   }
3547   if (opcode[i]==0x2D) { // SDR
3548     if(rs2[i]) emit_mov(th,temp2);
3549     // Write entire word
3550     emit_writeword_indexed(tl,-3,temp);
3551   }
3552   set_jump_target(done0,(int)out);
3553   set_jump_target(done1,(int)out);
3554   set_jump_target(done2,(int)out);
3555   if (opcode[i]==0x2C) { // SDL
3556     emit_testimm(temp,4);
3557     done0=(int)out;
3558     emit_jne(0);
3559     emit_andimm(temp,~3,temp);
3560     emit_writeword_indexed(temp2,4,temp);
3561     set_jump_target(done0,(int)out);
3562   }
3563   if (opcode[i]==0x2D) { // SDR
3564     emit_testimm(temp,4);
3565     done0=(int)out;
3566     emit_jeq(0);
3567     emit_andimm(temp,~3,temp);
3568     emit_writeword_indexed(temp2,-4,temp);
3569     set_jump_target(done0,(int)out);
3570   }
3571   if(!c||!memtarget)
3572     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3573   if(!using_tlb) {
3574     #ifdef RAM_OFFSET
3575     int map=get_reg(i_regs->regmap,ROREG);
3576     if(map<0) map=HOST_TEMPREG;
3577     gen_orig_addr_w(temp,map);
3578     #else
3579     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3580     #endif
3581     #if defined(HOST_IMM8)
3582     int ir=get_reg(i_regs->regmap,INVCP);
3583     assert(ir>=0);
3584     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3585     #else
3586     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3587     #endif
3588     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3589     emit_callne(invalidate_addr_reg[temp]);
3590     #else
3591     jaddr2=(int)out;
3592     emit_jne(0);
3593     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3594     #endif
3595   }
3596   /*
3597     emit_pusha();
3598     //save_regs(0x100f);
3599         emit_readword((int)&last_count,ECX);
3600         if(get_reg(i_regs->regmap,CCREG)<0)
3601           emit_loadreg(CCREG,HOST_CCREG);
3602         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3603         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3604         emit_writeword(HOST_CCREG,(int)&Count);
3605     emit_call((int)memdebug);
3606     emit_popa();
3607     //restore_regs(0x100f);
3608   /**/
3609 }
3610
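// Assemble FPU loads and stores (LWC1/LDC1/SWC1/SDC1).
// The address of the cop1 register is fetched from reg_cop1_simple /
// reg_cop1_double; stores read the FPR value through it before the memory
// access, loads write the result back through it afterwards.  The COP1
// usable bit (0x20000000 in the status register) is tested first and an
// FP_STUB handles the unusable case.  Addresses outside RAM fall back to
// LOADW/LOADD/STOREW/STORED stubs.  With DISABLE_COP1 defined this all
// reduces to cop1_unusable().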
3611 void c1ls_assemble(int i,struct regstat *i_regs)
3612 {
3613 #ifndef DISABLE_COP1
3614   int s,th,tl;
3615   int temp,ar;
3616   int map=-1;
3617   int offset;
3618   int c=0;
3619   int jaddr,jaddr2=0,jaddr3,type;
3620   int agr=AGEN1+(i&1);
3621   u_int hr,reglist=0;
3622   th=get_reg(i_regs->regmap,FTEMP|64);
3623   tl=get_reg(i_regs->regmap,FTEMP);
3624   s=get_reg(i_regs->regmap,rs1[i]);
3625   temp=get_reg(i_regs->regmap,agr);
3626   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3627   offset=imm[i];
3628   assert(tl>=0);
3629   assert(rs1[i]>0);
3630   assert(temp>=0);
3631   for(hr=0;hr<HOST_REGS;hr++) {
3632     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3633   }
3634   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3635   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3636   {
3637     // Loads use a temporary register which we need to save
3638     reglist|=1<<temp;
3639   }
3640   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3641     ar=temp;
3642   else // LWC1/LDC1
3643     ar=tl;
3644   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3645   //else c=(i_regs->wasconst>>s)&1;
3646   if(s>=0) c=(i_regs->wasconst>>s)&1;
3647   // Check cop1 unusable
3648   if(!cop1_usable) {
3649     signed char rs=get_reg(i_regs->regmap,CSREG);
3650     assert(rs>=0);
3651     emit_testimm(rs,0x20000000);
3652     jaddr=(int)out;
3653     emit_jeq(0);
3654     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3655     cop1_usable=1;
3656   }
3657   if (opcode[i]==0x39) { // SWC1 (get float address)
3658     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3659   }
3660   if (opcode[i]==0x3D) { // SDC1 (get double address)
3661     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3662   }
3663   // Generate address + offset
3664   if(!using_tlb) {
3665     if(!c)
3666       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3667   }
3668   else
3669   {
3670     map=get_reg(i_regs->regmap,TLREG);
3671     assert(map>=0);
3672     reglist&=~(1<<map);
3673     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3674       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3675     }
3676     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3677       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3678     }
3679   }
3680   if (opcode[i]==0x39) { // SWC1 (read float)
3681     emit_readword_indexed(0,tl,tl);
3682   }
3683   if (opcode[i]==0x3D) { // SDC1 (read double)
3684     emit_readword_indexed(4,tl,th);
3685     emit_readword_indexed(0,tl,tl);
3686   }
3687   if (opcode[i]==0x31) { // LWC1 (get target address)
3688     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3689   }
3690   if (opcode[i]==0x35) { // LDC1 (get target address)
3691     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3692   }
3693   if(!using_tlb) {
3694     if(!c) {
3695       jaddr2=(int)out;
3696       emit_jno(0);
3697     }
3698     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3699       jaddr2=(int)out;
3700       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3701     }
3702     #ifdef DESTRUCTIVE_SHIFT
3703     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3704       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3705     }
3706     #endif
3707   }else{
3708     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3709       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3710     }
3711     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3712       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3713     }
3714   }
3715   if (opcode[i]==0x31) { // LWC1
3716     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3717     //gen_tlb_addr_r(ar,map);
3718     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3719     #ifdef HOST_IMM_ADDR32
3720     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3721     else
3722     #endif
3723     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3724     type=LOADW_STUB;
3725   }
3726   if (opcode[i]==0x35) { // LDC1
3727     assert(th>=0);
3728     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3729     //gen_tlb_addr_r(ar,map);
3730     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3731     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3732     #ifdef HOST_IMM_ADDR32
3733     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3734     else
3735     #endif
3736     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3737     type=LOADD_STUB;
3738   }
3739   if (opcode[i]==0x39) { // SWC1
3740     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3741     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3742     type=STOREW_STUB;
3743   }
3744   if (opcode[i]==0x3D) { // SDC1
3745     assert(th>=0);
3746     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3747     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3748     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3749     type=STORED_STUB;
3750   }
3751   if(!using_tlb) {
3752     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3753       #ifndef DESTRUCTIVE_SHIFT
3754       temp=offset||c||s<0?ar:s;
3755       #endif
3756       #if defined(HOST_IMM8)
3757       int ir=get_reg(i_regs->regmap,INVCP);
3758       assert(ir>=0);
3759       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3760       #else
3761       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3762       #endif
3763       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3764       emit_callne(invalidate_addr_reg[temp]);
3765       #else
3766       jaddr3=(int)out;
3767       emit_jne(0);
3768       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3769       #endif
3770     }
3771   }
3772   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3773   if (opcode[i]==0x31) { // LWC1 (write float)
3774     emit_writeword_indexed(tl,0,temp);
3775   }
3776   if (opcode[i]==0x35) { // LDC1 (write double)
3777     emit_writeword_indexed(th,4,temp);
3778     emit_writeword_indexed(tl,0,temp);
3779   }
3780   //if(opcode[i]==0x39)
3781   /*if(opcode[i]==0x39||opcode[i]==0x31)
3782   {
3783     emit_pusha();
3784         emit_readword((int)&last_count,ECX);
3785         if(get_reg(i_regs->regmap,CCREG)<0)
3786           emit_loadreg(CCREG,HOST_CCREG);
3787         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3788         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3789         emit_writeword(HOST_CCREG,(int)&Count);
3790     emit_call((int)memdebug);
3791     emit_popa();
3792   }/**/
3793 #else
3794   cop1_unusable(i, i_regs);
3795 #endif
3796 }
3797
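// Assemble GTE (cop2) loads and stores (LWC2/SWC2).
// SWC2 copies the GTE data register into a temporary with cop2_get_dreg
// before the store; LWC2 loads into FTEMP and writes it back with
// cop2_put_dreg afterwards.  Out-of-range addresses go through the
// LOADW/STOREW stubs, and SWC2 also gets the usual invalid_code check
// for self-modifying code.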
3798 void c2ls_assemble(int i,struct regstat *i_regs)
3799 {
3800   int s,tl;
3801   int ar;
3802   int offset;
3803   int memtarget=0,c=0;
3804   int jaddr2=0,jaddr3,type;
3805   int agr=AGEN1+(i&1);
3806   int fastio_reg_override=0;
3807   u_int hr,reglist=0;
3808   u_int copr=(source[i]>>16)&0x1f;
3809   s=get_reg(i_regs->regmap,rs1[i]);
3810   tl=get_reg(i_regs->regmap,FTEMP);
3811   offset=imm[i];
3812   assert(rs1[i]>0);
3813   assert(tl>=0);
3814   assert(!using_tlb);
3815
3816   for(hr=0;hr<HOST_REGS;hr++) {
3817     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3818   }
3819   if(i_regs->regmap[HOST_CCREG]==CCREG)
3820     reglist&=~(1<<HOST_CCREG);
3821
3822   // get the address
3823   if (opcode[i]==0x3a) { // SWC2
3824     ar=get_reg(i_regs->regmap,agr);
3825     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3826     reglist|=1<<ar;
3827   } else { // LWC2
3828     ar=tl;
3829   }
3830   if(s>=0) c=(i_regs->wasconst>>s)&1;
3831   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3832   if (!offset&&!c&&s>=0) ar=s;
3833   assert(ar>=0);
3834
3835   if (opcode[i]==0x3a) { // SWC2
3836     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3837     type=STOREW_STUB;
3838   }
3839   else
3840     type=LOADW_STUB;
3841
3842   if(c&&!memtarget) {
3843     jaddr2=(int)out;
3844     emit_jmp(0); // inline_readstub/inline_writestub?
3845   }
3846   else {
3847     if(!c) {
3848       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3849     }
3850     if (opcode[i]==0x32) { // LWC2
3851       #ifdef HOST_IMM_ADDR32
3852       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3853       else
3854       #endif
3855       int a=ar;
3856       if(fastio_reg_override) a=fastio_reg_override;
3857       emit_readword_indexed(0,a,tl);
3858     }
3859     if (opcode[i]==0x3a) { // SWC2
3860       #ifdef DESTRUCTIVE_SHIFT
3861       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3862       #endif
3863       int a=ar;
3864       if(fastio_reg_override) a=fastio_reg_override;
3865       emit_writeword_indexed(tl,0,a);
3866     }
3867   }
3868   if(jaddr2)
3869     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3870   if (opcode[i]==0x3a) { // SWC2
3871 #if defined(HOST_IMM8)
3872     int ir=get_reg(i_regs->regmap,INVCP);
3873     assert(ir>=0);
3874     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3875 #else
3876     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3877 #endif
3878     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3879     emit_callne(invalidate_addr_reg[ar]);
3880     #else
3881     jaddr3=(int)out;
3882     emit_jne(0);
3883     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3884     #endif
3885   }
3886   if (opcode[i]==0x32) { // LWC2
3887     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3888   }
3889 }
3890
3891 #ifndef multdiv_assemble
3892 void multdiv_assemble(int i,struct regstat *i_regs)
3893 {
3894   printf("Need multdiv_assemble for this architecture.\n");
3895   exit(1);
3896 }
3897 #endif
3898
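// Assemble MFHI/MFLO/MTHI/MTLO.  HI and LO are allocated like ordinary
// registers here, so these become plain register-to-register moves
// (both halves when the value is 64 bit).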
3899 void mov_assemble(int i,struct regstat *i_regs)
3900 {
3901   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3902   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3903   if(rt1[i]) {
3904     signed char sh,sl,th,tl;
3905     th=get_reg(i_regs->regmap,rt1[i]|64);
3906     tl=get_reg(i_regs->regmap,rt1[i]);
3907     //assert(tl>=0);
3908     if(tl>=0) {
3909       sh=get_reg(i_regs->regmap,rs1[i]|64);
3910       sl=get_reg(i_regs->regmap,rs1[i]);
3911       if(sl>=0) emit_mov(sl,tl);
3912       else emit_loadreg(rs1[i],tl);
3913       if(th>=0) {
3914         if(sh>=0) emit_mov(sh,th);
3915         else emit_loadreg(rs1[i]|64,th);
3916       }
3917     }
3918   }
3919 }
3920
3921 #ifndef fconv_assemble
3922 void fconv_assemble(int i,struct regstat *i_regs)
3923 {
3924   printf("Need fconv_assemble for this architecture.\n");
3925   exit(1);
3926 }
3927 #endif
3928
3929 #if 0
3930 void float_assemble(int i,struct regstat *i_regs)
3931 {
3932   printf("Need float_assemble for this architecture.\n");
3933   exit(1);
3934 }
3935 #endif
3936
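// SYSCALL, HLECALL and INTCALL blocks all leave the recompiled code the
// same way: load the PC into a host register, flush the cycle count, and
// jump to the corresponding C handler (jump_syscall_hle, jump_hlecall,
// jump_intcall).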
3937 void syscall_assemble(int i,struct regstat *i_regs)
3938 {
3939   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3940   assert(ccreg==HOST_CCREG);
3941   assert(!is_delayslot);
3942   emit_movimm(start+i*4,EAX); // Get PC
3943   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3944   emit_jmp((int)jump_syscall_hle); // XXX
3945 }
3946
3947 void hlecall_assemble(int i,struct regstat *i_regs)
3948 {
3949   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3950   assert(ccreg==HOST_CCREG);
3951   assert(!is_delayslot);
3952   emit_movimm(start+i*4+4,0); // Get PC
3953   emit_movimm((int)psxHLEt[source[i]&7],1);
3954   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3955   emit_jmp((int)jump_hlecall);
3956 }
3957
3958 void intcall_assemble(int i,struct regstat *i_regs)
3959 {
3960   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3961   assert(ccreg==HOST_CCREG);
3962   assert(!is_delayslot);
3963   emit_movimm(start+i*4,0); // Get PC
3964   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
3965   emit_jmp((int)jump_intcall);
3966 }
3967
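// Assemble the instruction in a branch delay slot.  This is the same
// per-itype dispatch as the main assembly loop, with is_delayslot set so
// that stubs which can raise exceptions (e.g. the FP_STUB above) know
// they were generated for a delay slot.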
3968 void ds_assemble(int i,struct regstat *i_regs)
3969 {
3970   speculate_register_values(i);
3971   is_delayslot=1;
3972   switch(itype[i]) {
3973     case ALU:
3974       alu_assemble(i,i_regs);break;
3975     case IMM16:
3976       imm16_assemble(i,i_regs);break;
3977     case SHIFT:
3978       shift_assemble(i,i_regs);break;
3979     case SHIFTIMM:
3980       shiftimm_assemble(i,i_regs);break;
3981     case LOAD:
3982       load_assemble(i,i_regs);break;
3983     case LOADLR:
3984       loadlr_assemble(i,i_regs);break;
3985     case STORE:
3986       store_assemble(i,i_regs);break;
3987     case STORELR:
3988       storelr_assemble(i,i_regs);break;
3989     case COP0:
3990       cop0_assemble(i,i_regs);break;
3991     case COP1:
3992       cop1_assemble(i,i_regs);break;
3993     case C1LS:
3994       c1ls_assemble(i,i_regs);break;
3995     case COP2:
3996       cop2_assemble(i,i_regs);break;
3997     case C2LS:
3998       c2ls_assemble(i,i_regs);break;
3999     case C2OP:
4000       c2op_assemble(i,i_regs);break;
4001     case FCONV:
4002       fconv_assemble(i,i_regs);break;
4003     case FLOAT:
4004       float_assemble(i,i_regs);break;
4005     case FCOMP:
4006       fcomp_assemble(i,i_regs);break;
4007     case MULTDIV:
4008       multdiv_assemble(i,i_regs);break;
4009     case MOV:
4010       mov_assemble(i,i_regs);break;
4011     case SYSCALL:
4012     case HLECALL:
4013     case INTCALL:
4014     case SPAN:
4015     case UJUMP:
4016     case RJUMP:
4017     case CJUMP:
4018     case SJUMP:
4019     case FJUMP:
4020       printf("Jump in the delay slot.  This is probably a bug.\n");
4021   }
4022   is_delayslot=0;
4023 }
4024
4025 // Is the branch target a valid internal jump?
4026 int internal_branch(uint64_t i_is32,int addr)
4027 {
4028   if(addr&1) return 0; // Indirect (register) jump
4029   if(addr>=start && addr<start+slen*4-4)
4030   {
4031     int t=(addr-start)>>2;
4032     // Delay slots are not valid branch targets
4033     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4034     // 64 -> 32 bit transition requires a recompile
4035     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4036     {
4037       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4038       else printf("optimizable: yes\n");
4039     }*/
4040     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4041 #ifndef FORCE32
4042     if(requires_32bit[t]&~i_is32) return 0;
4043     else
4044 #endif
4045       return 1;
4046   }
4047   return 0;
4048 }
4049
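// Write back dirty registers whose mapping is about to change, then move
// registers that stay cached but switch host register (no writeback
// needed).  'pre' is the mapping before the transition and 'entry' the
// mapping after; registers marked unneeded (u/uu) are simply dropped.
// Architectures may provide their own version, this is the generic one.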
4050 #ifndef wb_invalidate
4051 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4052   uint64_t u,uint64_t uu)
4053 {
4054   int hr;
4055   for(hr=0;hr<HOST_REGS;hr++) {
4056     if(hr!=EXCLUDE_REG) {
4057       if(pre[hr]!=entry[hr]) {
4058         if(pre[hr]>=0) {
4059           if((dirty>>hr)&1) {
4060             if(get_reg(entry,pre[hr])<0) {
4061               if(pre[hr]<64) {
4062                 if(!((u>>pre[hr])&1)) {
4063                   emit_storereg(pre[hr],hr);
4064                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4065                     emit_sarimm(hr,31,hr);
4066                     emit_storereg(pre[hr]|64,hr);
4067                   }
4068                 }
4069               }else{
4070                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4071                   emit_storereg(pre[hr],hr);
4072                 }
4073               }
4074             }
4075           }
4076         }
4077       }
4078     }
4079   }
4080   // Move from one register to another (no writeback)
4081   for(hr=0;hr<HOST_REGS;hr++) {
4082     if(hr!=EXCLUDE_REG) {
4083       if(pre[hr]!=entry[hr]) {
4084         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4085           int nr;
4086           if((nr=get_reg(entry,pre[hr]))>=0) {
4087             emit_mov(hr,nr);
4088           }
4089         }
4090       }
4091     }
4092   }
4093 }
4094 #endif
4095
4096 // Load the specified registers
4097 // This only loads the registers given as arguments because
4098 // we don't want to load things that will be overwritten
4099 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4100 {
4101   int hr;
4102   // Load 32-bit regs
4103   for(hr=0;hr<HOST_REGS;hr++) {
4104     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4105       if(entry[hr]!=regmap[hr]) {
4106         if(regmap[hr]==rs1||regmap[hr]==rs2)
4107         {
4108           if(regmap[hr]==0) {
4109             emit_zeroreg(hr);
4110           }
4111           else
4112           {
4113             emit_loadreg(regmap[hr],hr);
4114           }
4115         }
4116       }
4117     }
4118   }
4119   // Load 64-bit regs
4120   for(hr=0;hr<HOST_REGS;hr++) {
4121     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4122       if(entry[hr]!=regmap[hr]) {
4123         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4124         {
4125           assert(regmap[hr]!=64);
4126           if((is32>>(regmap[hr]&63))&1) {
4127             int lr=get_reg(regmap,regmap[hr]-64);
4128             if(lr>=0)
4129               emit_sarimm(lr,31,hr);
4130             else
4131               emit_loadreg(regmap[hr],hr);
4132           }
4133           else
4134           {
4135             emit_loadreg(regmap[hr],hr);
4136           }
4137         }
4138       }
4139     }
4140   }
4141 }
4142
4143 // Load registers prior to the start of a loop
4144 // so that they are not loaded within the loop
4145 static void loop_preload(signed char pre[],signed char entry[])
4146 {
4147   int hr;
4148   for(hr=0;hr<HOST_REGS;hr++) {
4149     if(hr!=EXCLUDE_REG) {
4150       if(pre[hr]!=entry[hr]) {
4151         if(entry[hr]>=0) {
4152           if(get_reg(pre,entry[hr])<0) {
4153             assem_debug("loop preload:\n");
4154             //printf("loop preload: %d\n",hr);
4155             if(entry[hr]==0) {
4156               emit_zeroreg(hr);
4157             }
4158             else if(entry[hr]<TEMPREG)
4159             {
4160               emit_loadreg(entry[hr],hr);
4161             }
4162             else if(entry[hr]-64<TEMPREG)
4163             {
4164               emit_loadreg(entry[hr],hr);
4165             }
4166           }
4167         }
4168       }
4169     }
4170   }
4171 }
4172
4173 // Generate address for load/store instruction
4174 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
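// For constant base addresses the final address is materialized with
// emit_movimm (masked to the aligned word/dword for LWL/LWR and LDL/LDR);
// otherwise base+offset is computed with emit_addimm.  The address (and,
// with TLB enabled, the mapper entry) for the *next* load/store is also
// preloaded here so the work overlaps with the current instruction.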
4175 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4176 {
4177   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4178     int ra=-1;
4179     int agr=AGEN1+(i&1);
4180     int mgr=MGEN1+(i&1);
4181     if(itype[i]==LOAD) {
4182       ra=get_reg(i_regs->regmap,rt1[i]);
4183       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4184       assert(ra>=0);
4185     }
4186     if(itype[i]==LOADLR) {
4187       ra=get_reg(i_regs->regmap,FTEMP);
4188     }
4189     if(itype[i]==STORE||itype[i]==STORELR) {
4190       ra=get_reg(i_regs->regmap,agr);
4191       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4192     }
4193     if(itype[i]==C1LS||itype[i]==C2LS) {
4194       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4195         ra=get_reg(i_regs->regmap,FTEMP);
4196       else { // SWC1/SDC1/SWC2/SDC2
4197         ra=get_reg(i_regs->regmap,agr);
4198         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4199       }
4200     }
4201     int rs=get_reg(i_regs->regmap,rs1[i]);
4202     int rm=get_reg(i_regs->regmap,TLREG);
4203     if(ra>=0) {
4204       int offset=imm[i];
4205       int c=(i_regs->wasconst>>rs)&1;
4206       if(rs1[i]==0) {
4207         // Using r0 as a base address
4208         /*if(rm>=0) {
4209           if(!entry||entry[rm]!=mgr) {
4210             generate_map_const(offset,rm);
4211           } // else did it in the previous cycle
4212         }*/
4213         if(!entry||entry[ra]!=agr) {
4214           if (opcode[i]==0x22||opcode[i]==0x26) {
4215             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4216           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4217             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4218           }else{
4219             emit_movimm(offset,ra);
4220           }
4221         } // else did it in the previous cycle
4222       }
4223       else if(rs<0) {
4224         if(!entry||entry[ra]!=rs1[i])
4225           emit_loadreg(rs1[i],ra);
4226         //if(!entry||entry[ra]!=rs1[i])
4227         //  printf("poor load scheduling!\n");
4228       }
4229       else if(c) {
4230 #ifndef DISABLE_TLB
4231         if(rm>=0) {
4232           if(!entry||entry[rm]!=mgr) {
4233             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4234               // Stores to memory go thru the mapper to detect self-modifying
4235               // code, loads don't.
4236               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4237                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4238                 generate_map_const(constmap[i][rs]+offset,rm);
4239             }else{
4240               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4241                 generate_map_const(constmap[i][rs]+offset,rm);
4242             }
4243           }
4244         }
4245 #endif
4246         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4247           if(!entry||entry[ra]!=agr) {
4248             if (opcode[i]==0x22||opcode[i]==0x26) {
4249               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4250             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4251               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4252             }else{
4253               #ifdef HOST_IMM_ADDR32
4254               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4255                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4256               #endif
4257               emit_movimm(constmap[i][rs]+offset,ra);
4258             }
4259           } // else did it in the previous cycle
4260         } // else load_consts already did it
4261       }
4262       if(offset&&!c&&rs1[i]) {
4263         if(rs>=0) {
4264           emit_addimm(rs,offset,ra);
4265         }else{
4266           emit_addimm(ra,offset,ra);
4267         }
4268       }
4269     }
4270   }
4271   // Preload constants for next instruction
4272   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4273     int agr,ra;
4274     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4275     // Mapper entry
4276     agr=MGEN1+((i+1)&1);
4277     ra=get_reg(i_regs->regmap,agr);
4278     if(ra>=0) {
4279       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4280       int offset=imm[i+1];
4281       int c=(regs[i+1].wasconst>>rs)&1;
4282       if(c) {
4283         if(itype[i+1]==STORE||itype[i+1]==STORELR
4284            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4285           // Stores to memory go thru the mapper to detect self-modifying
4286           // code, loads don't.
4287           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4288              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4289             generate_map_const(constmap[i+1][rs]+offset,ra);
4290         }else{
4291           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4292             generate_map_const(constmap[i+1][rs]+offset,ra);
4293         }
4294       }
4295       /*else if(rs1[i]==0) {
4296         generate_map_const(offset,ra);
4297       }*/
4298     }
4299     #endif
4300     // Actual address
4301     agr=AGEN1+((i+1)&1);
4302     ra=get_reg(i_regs->regmap,agr);
4303     if(ra>=0) {
4304       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4305       int offset=imm[i+1];
4306       int c=(regs[i+1].wasconst>>rs)&1;
4307       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4308         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4309           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4310         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4311           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4312         }else{
4313           #ifdef HOST_IMM_ADDR32
4314           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4315              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4316           #endif
4317           emit_movimm(constmap[i+1][rs]+offset,ra);
4318         }
4319       }
4320       else if(rs1[i+1]==0) {
4321         // Using r0 as a base address
4322         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4323           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4324         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4325           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4326         }else{
4327           emit_movimm(offset,ra);
4328         }
4329       }
4330     }
4331   }
4332 }
4333
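// Scan forward from instruction i to find the value the constant in host
// register hr should finally hold, so one emit_movimm can serve several
// instructions.  Returns 0 when the constant does not need to be loaded
// (a following load folds the address into an immediate under
// HOST_IMM_ADDR32, or the register is unneeded afterwards); otherwise
// returns 1 with the value, which may be a precomputed load address.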
4334 int get_final_value(int hr, int i, int *value)
4335 {
4336   int reg=regs[i].regmap[hr];
4337   while(i<slen-1) {
4338     if(regs[i+1].regmap[hr]!=reg) break;
4339     if(!((regs[i+1].isconst>>hr)&1)) break;
4340     if(bt[i+1]) break;
4341     i++;
4342   }
4343   if(i<slen-1) {
4344     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4345       *value=constmap[i][hr];
4346       return 1;
4347     }
4348     if(!bt[i+1]) {
4349       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4350         // Load in delay slot, out-of-order execution
4351         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4352         {
4353           #ifdef HOST_IMM_ADDR32
4354           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4355           #endif
4356           // Precompute load address
4357           *value=constmap[i][hr]+imm[i+2];
4358           return 1;
4359         }
4360       }
4361       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4362       {
4363         #ifdef HOST_IMM_ADDR32
4364         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4365         #endif
4366         // Precompute load address
4367         *value=constmap[i][hr]+imm[i+1];
4368         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4369         return 1;
4370       }
4371     }
4372   }
4373   *value=constmap[i][hr];
4374   //printf("c=%x\n",(int)constmap[i][hr]);
4375   if(i==slen-1) return 1;
4376   if(reg<64) {
4377     return !((unneeded_reg[i+1]>>reg)&1);
4378   }else{
4379     return !((unneeded_reg_upper[i+1]>>reg)&1);
4380   }
4381 }
4382
4383 // Load registers with known constants
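// Only registers that just became constant, changed mapping, or sit at a
// branch target are loaded; get_final_value() picks the value that stays
// valid to the end of its run.  Upper halves that are sign extensions of
// a 32-bit value are produced with emit_sarimm instead of a reload.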
4384 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4385 {
4386   int hr;
4387   // Load 32-bit regs
4388   for(hr=0;hr<HOST_REGS;hr++) {
4389     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4390       //if(entry[hr]!=regmap[hr]) {
4391       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4392         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4393           int value;
4394           if(get_final_value(hr,i,&value)) {
4395             if(value==0) {
4396               emit_zeroreg(hr);
4397             }
4398             else {
4399               emit_movimm(value,hr);
4400             }
4401           }
4402         }
4403       }
4404     }
4405   }
4406   // Load 64-bit regs
4407   for(hr=0;hr<HOST_REGS;hr++) {
4408     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4409       //if(entry[hr]!=regmap[hr]) {
4410       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4411         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4412           if((is32>>(regmap[hr]&63))&1) {
4413             int lr=get_reg(regmap,regmap[hr]-64);
4414             assert(lr>=0);
4415             emit_sarimm(lr,31,hr);
4416           }
4417           else
4418           {
4419             int value;
4420             if(get_final_value(hr,i,&value)) {
4421               if(value==0) {
4422                 emit_zeroreg(hr);
4423               }
4424               else {
4425                 emit_movimm(value,hr);
4426               }
4427             }
4428           }
4429         }
4430       }
4431     }
4432   }
4433 }
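// Like load_consts, but unconditionally reloads every constant register
// that is marked dirty for instruction i, regardless of what the previous
// instruction left in the host registers.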
4434 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4435 {
4436   int hr;
4437   // Load 32-bit regs
4438   for(hr=0;hr<HOST_REGS;hr++) {
4439     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4440       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4441         int value=constmap[i][hr];
4442         if(value==0) {
4443           emit_zeroreg(hr);
4444         }
4445         else {
4446           emit_movimm(value,hr);
4447         }
4448       }
4449     }
4450   }
4451   // Load 64-bit regs
4452   for(hr=0;hr<HOST_REGS;hr++) {
4453     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4454       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4455         if((is32>>(regmap[hr]&63))&1) {
4456           int lr=get_reg(regmap,regmap[hr]-64);
4457           assert(lr>=0);
4458           emit_sarimm(lr,31,hr);
4459         }
4460         else
4461         {
4462           int value=constmap[i][hr];
4463           if(value==0) {
4464             emit_zeroreg(hr);
4465           }
4466           else {
4467             emit_movimm(value,hr);
4468           }
4469         }
4470       }
4471     }
4472   }
4473 }
4474
4475 // Write out all dirty registers (except cycle count)
4476 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4477 {
4478   int hr;
4479   for(hr=0;hr<HOST_REGS;hr++) {
4480     if(hr!=EXCLUDE_REG) {
4481       if(i_regmap[hr]>0) {
4482         if(i_regmap[hr]!=CCREG) {
4483           if((i_dirty>>hr)&1) {
4484             if(i_regmap[hr]<64) {
4485               emit_storereg(i_regmap[hr],hr);
4486 #ifndef FORCE32
4487               if( ((i_is32>>i_regmap[hr])&1) ) {
4488                 #ifdef DESTRUCTIVE_WRITEBACK
4489                 emit_sarimm(hr,31,hr);
4490                 emit_storereg(i_regmap[hr]|64,hr);
4491                 #else
4492                 emit_sarimm(hr,31,HOST_TEMPREG);
4493                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4494                 #endif
4495               }
4496 #endif
4497             }else{
4498               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4499                 emit_storereg(i_regmap[hr],hr);
4500               }
4501             }
4502           }
4503         }
4504       }
4505     }
4506   }
4507 }
4508 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4509 // This writes the registers not written by store_regs_bt
4510 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4511 {
4512   int hr;
4513   int t=(addr-start)>>2;
4514   for(hr=0;hr<HOST_REGS;hr++) {
4515     if(hr!=EXCLUDE_REG) {
4516       if(i_regmap[hr]>0) {
4517         if(i_regmap[hr]!=CCREG) {
4518           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4519             if((i_dirty>>hr)&1) {
4520               if(i_regmap[hr]<64) {
4521                 emit_storereg(i_regmap[hr],hr);
4522 #ifndef FORCE32
4523                 if( ((i_is32>>i_regmap[hr])&1) ) {
4524                   #ifdef DESTRUCTIVE_WRITEBACK
4525                   emit_sarimm(hr,31,hr);
4526                   emit_storereg(i_regmap[hr]|64,hr);
4527                   #else
4528                   emit_sarimm(hr,31,HOST_TEMPREG);
4529                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4530                   #endif
4531                 }
4532 #endif
4533               }else{
4534                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4535                   emit_storereg(i_regmap[hr],hr);
4536                 }
4537               }
4538             }
4539           }
4540         }
4541       }
4542     }
4543   }
4544 }
4545
4546 // Load all registers (except cycle count)
4547 void load_all_regs(signed char i_regmap[])
4548 {
4549   int hr;
4550   for(hr=0;hr<HOST_REGS;hr++) {
4551     if(hr!=EXCLUDE_REG) {
4552       if(i_regmap[hr]==0) {
4553         emit_zeroreg(hr);
4554       }
4555       else
4556       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4557       {
4558         emit_loadreg(i_regmap[hr],hr);
4559       }
4560     }
4561   }
4562 }
4563
4564 // Load all current registers also needed by next instruction
4565 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4566 {
4567   int hr;
4568   for(hr=0;hr<HOST_REGS;hr++) {
4569     if(hr!=EXCLUDE_REG) {
4570       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4571         if(i_regmap[hr]==0) {
4572           emit_zeroreg(hr);
4573         }
4574         else
4575         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4576         {
4577           emit_loadreg(i_regmap[hr],hr);
4578         }
4579       }
4580     }
4581   }
4582 }
4583
4584 // Load all regs, storing cycle count if necessary
4585 void load_regs_entry(int t)
4586 {
4587   int hr;
4588   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4589   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4590   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4591     emit_storereg(CCREG,HOST_CCREG);
4592   }
4593   // Load 32-bit regs
4594   for(hr=0;hr<HOST_REGS;hr++) {
4595     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4596       if(regs[t].regmap_entry[hr]==0) {
4597         emit_zeroreg(hr);
4598       }
4599       else if(regs[t].regmap_entry[hr]!=CCREG)
4600       {
4601         emit_loadreg(regs[t].regmap_entry[hr],hr);
4602       }
4603     }
4604   }
4605   // Load 64-bit regs
4606   for(hr=0;hr<HOST_REGS;hr++) {
4607     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4608       assert(regs[t].regmap_entry[hr]!=64);
4609       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4610         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4611         if(lr<0) {
4612           emit_loadreg(regs[t].regmap_entry[hr],hr);
4613         }
4614         else
4615         {
4616           emit_sarimm(lr,31,hr);
4617         }
4618       }
4619       else
4620       {
4621         emit_loadreg(regs[t].regmap_entry[hr],hr);
4622       }
4623     }
4624   }
4625 }
4626
4627 // Store dirty registers prior to branch
4628 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4629 {
4630   if(internal_branch(i_is32,addr))
4631   {
4632     int t=(addr-start)>>2;
4633     int hr;
4634     for(hr=0;hr<HOST_REGS;hr++) {
4635       if(hr!=EXCLUDE_REG) {
4636         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4637           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4638             if((i_dirty>>hr)&1) {
4639               if(i_regmap[hr]<64) {
4640                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4641                   emit_storereg(i_regmap[hr],hr);
4642                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4643                     #ifdef DESTRUCTIVE_WRITEBACK
4644                     emit_sarimm(hr,31,hr);
4645                     emit_storereg(i_regmap[hr]|64,hr);
4646                     #else
4647                     emit_sarimm(hr,31,HOST_TEMPREG);
4648                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4649                     #endif
4650                   }
4651                 }
4652               }else{
4653                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4654                   emit_storereg(i_regmap[hr],hr);
4655                 }
4656               }
4657             }
4658           }
4659         }
4660       }
4661     }
4662   }
4663   else
4664   {
4665     // Branch out of this block, write out all dirty regs
4666     wb_dirtys(i_regmap,i_is32,i_dirty);
4667   }
4668 }
4669
4670 // Load all needed registers for branch target
4671 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4672 {
4673   //if(addr>=start && addr<(start+slen*4))
4674   if(internal_branch(i_is32,addr))
4675   {
4676     int t=(addr-start)>>2;
4677     int hr;
4678     // Store the cycle count before loading something else
4679     if(i_regmap[HOST_CCREG]!=CCREG) {
4680       assert(i_regmap[HOST_CCREG]==-1);
4681     }
4682     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4683       emit_storereg(CCREG,HOST_CCREG);
4684     }
4685     // Load 32-bit regs
4686     for(hr=0;hr<HOST_REGS;hr++) {
4687       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4688         #ifdef DESTRUCTIVE_WRITEBACK
4689         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4690         #else
4691         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4692         #endif
4693           if(regs[t].regmap_entry[hr]==0) {
4694             emit_zeroreg(hr);
4695           }
4696           else if(regs[t].regmap_entry[hr]!=CCREG)
4697           {
4698             emit_loadreg(regs[t].regmap_entry[hr],hr);
4699           }
4700         }
4701       }
4702     }
4703     // Load 64-bit regs
4704     for(hr=0;hr<HOST_REGS;hr++) {
4705       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4706         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4707           assert(regs[t].regmap_entry[hr]!=64);
4708           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4709             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4710             if(lr<0) {
4711               emit_loadreg(regs[t].regmap_entry[hr],hr);
4712             }
4713             else
4714             {
4715               emit_sarimm(lr,31,hr);
4716             }
4717           }
4718           else
4719           {
4720             emit_loadreg(regs[t].regmap_entry[hr],hr);
4721           }
4722         }
4723         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4724           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4725           assert(lr>=0);
4726           emit_sarimm(lr,31,hr);
4727         }
4728       }
4729     }
4730   }
4731 }
4732
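// Check whether the current register state already satisfies the entry
// requirements of the branch target at addr, so the branch can jump there
// without extra writeback or reload code.  Returns 0 if any cached
// register differs from the target's entry map, is dirty where the target
// needs it clean, or has a mismatched 32/64-bit state; delay-slot targets
// never match.  For external targets, any dirty register (other than the
// cycle count) prevents a match.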
4733 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4734 {
4735   if(addr>=start && addr<start+slen*4-4)
4736   {
4737     int t=(addr-start)>>2;
4738     int hr;
4739     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4740     for(hr=0;hr<HOST_REGS;hr++)
4741     {
4742       if(hr!=EXCLUDE_REG)
4743       {
4744         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4745         {
4746           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4747           {
4748             return 0;
4749           }
4750           else 
4751           if((i_dirty>>hr)&1)
4752           {
4753             if(i_regmap[hr]<TEMPREG)
4754             {
4755               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4756                 return 0;
4757             }
4758             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4759             {
4760               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4761                 return 0;
4762             }
4763           }
4764         }
4765         else // Same register but is it 32-bit or dirty?
4766         if(i_regmap[hr]>=0)
4767         {
4768           if(!((regs[t].dirty>>hr)&1))
4769           {
4770             if((i_dirty>>hr)&1)
4771             {
4772               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4773               {
4774                 //printf("%x: dirty no match\n",addr);
4775                 return 0;
4776               }
4777             }
4778           }
4779           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4780           {
4781             //printf("%x: is32 no match\n",addr);
4782             return 0;
4783           }
4784         }
4785       }
4786     }
4787     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4788 #ifndef FORCE32
4789     if(requires_32bit[t]&~i_is32) return 0;
4790 #endif
4791     // Delay slots are not valid branch targets
4792     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4793     // Delay slots require additional processing, so do not match
4794     if(is_ds[t]) return 0;
4795   }
4796   else
4797   {
4798     int hr;
4799     for(hr=0;hr<HOST_REGS;hr++)
4800     {
4801       if(hr!=EXCLUDE_REG)
4802       {
4803         if(i_regmap[hr]>=0)
4804         {
4805           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4806           {
4807             if((i_dirty>>hr)&1)
4808             {
4809               return 0;
4810             }
4811           }
4812         }
4813       }
4814     }
4815   }
4816   return 1;
4817 }
4818
4819 // Used when a branch jumps into the delay slot of another branch
4820 void ds_assemble_entry(int i)
4821 {
4822   int t=(ba[i]-start)>>2;
4823   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4824   assem_debug("Assemble delay slot at %x\n",ba[i]);
4825   assem_debug("<->\n");
4826   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4827     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4828   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4829   address_generation(t,&regs[t],regs[t].regmap_entry);
4830   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4831     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4832   cop1_usable=0;
4833   is_delayslot=0;
4834   switch(itype[t]) {
4835     case ALU:
4836       alu_assemble(t,&regs[t]);break;
4837     case IMM16:
4838       imm16_assemble(t,&regs[t]);break;
4839     case SHIFT:
4840       shift_assemble(t,&regs[t]);break;
4841     case SHIFTIMM:
4842       shiftimm_assemble(t,&regs[t]);break;
4843     case LOAD:
4844       load_assemble(t,&regs[t]);break;
4845     case LOADLR:
4846       loadlr_assemble(t,&regs[t]);break;
4847     case STORE:
4848       store_assemble(t,&regs[t]);break;
4849     case STORELR:
4850       storelr_assemble(t,&regs[t]);break;
4851     case COP0:
4852       cop0_assemble(t,&regs[t]);break;
4853     case COP1:
4854       cop1_assemble(t,&regs[t]);break;
4855     case C1LS:
4856       c1ls_assemble(t,&regs[t]);break;
4857     case COP2:
4858       cop2_assemble(t,&regs[t]);break;
4859     case C2LS:
4860       c2ls_assemble(t,&regs[t]);break;
4861     case C2OP:
4862       c2op_assemble(t,&regs[t]);break;
4863     case FCONV:
4864       fconv_assemble(t,&regs[t]);break;
4865     case FLOAT:
4866       float_assemble(t,&regs[t]);break;
4867     case FCOMP:
4868       fcomp_assemble(t,&regs[t]);break;
4869     case MULTDIV:
4870       multdiv_assemble(t,&regs[t]);break;
4871     case MOV:
4872       mov_assemble(t,&regs[t]);break;
4873     case SYSCALL:
4874     case HLECALL:
4875     case INTCALL:
4876     case SPAN:
4877     case UJUMP:
4878     case RJUMP:
4879     case CJUMP:
4880     case SJUMP:
4881     case FJUMP:
4882       printf("Jump in the delay slot.  This is probably a bug.\n");
4883   }
4884   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4885   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4886   if(internal_branch(regs[t].is32,ba[i]+4))
4887     assem_debug("branch: internal\n");
4888   else
4889     assem_debug("branch: external\n");
4890   assert(internal_branch(regs[t].is32,ba[i]+4));
4891   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4892   emit_jmp(0);
4893 }
4894
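// Emit the cycle count update and "out of cycles" test for a branch.
// A taken branch to itself with a nop in the delay slot is treated as an
// idle loop and special-cased so the remaining cycles are consumed at
// once instead of iterating.  *adj returns the cycle adjustment already
// accounted for at an internal target (0 for register jumps and external
// targets).  The slow path is handled by a CC_STUB.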
4895 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4896 {
4897   int count;
4898   int jaddr;
4899   int idle=0;
4900   if(itype[i]==RJUMP)
4901   {
4902     *adj=0;
4903   }
4904   //if(ba[i]>=start && ba[i]<(start+slen*4))
4905   if(internal_branch(branch_regs[i].is32,ba[i]))
4906   {
4907     int t=(ba[i]-start)>>2;
4908     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4909     else *adj=ccadj[t];
4910   }
4911   else
4912   {
4913     *adj=0;
4914   }
4915   count=ccadj[i];
4916   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4917     // Idle loop
4918     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4919     idle=(int)out;
4920     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4921     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4922     jaddr=(int)out;
4923     emit_jmp(0);
4924   }
4925   else if(*adj==0||invert) {
4926     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4927     jaddr=(int)out;
4928     emit_jns(0);
4929   }
4930   else
4931   {
4932     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*(count+2));
4933     jaddr=(int)out;
4934     emit_jns(0);
4935   }
4936   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4937 }
4938
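// Out-of-line code for a CC_STUB (cycle count expired at a branch):
// write back the dirty registers for the interrupted state, then store
// the address execution should resume at into pcaddr -- either a fixed
// PC, or one computed with compare/cmov sequences when it depends on
// which way the conditional branch goes.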
4939 void do_ccstub(int n)
4940 {
4941   literal_pool(256);
4942   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4943   set_jump_target(stubs[n][1],(int)out);
4944   int i=stubs[n][4];
4945   if(stubs[n][6]==NULLDS) {
4946     // Delay slot instruction is nullified ("likely" branch)
4947     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4948   }
4949   else if(stubs[n][6]!=TAKEN) {
4950     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4951   }
4952   else {
4953     if(internal_branch(branch_regs[i].is32,ba[i]))
4954       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4955   }
4956   if(stubs[n][5]!=-1)
4957   {
4958     // Save PC as return address
4959     emit_movimm(stubs[n][5],EAX);
4960     emit_writeword(EAX,(int)&pcaddr);
4961   }
4962   else
4963   {
4964     // Return address depends on which way the branch goes
4965     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4966     {
4967       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4968       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4969       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4970       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4971       if(rs1[i]==0)
4972       {
4973         s1l=s2l;s1h=s2h;
4974         s2l=s2h=-1;
4975       }
4976       else if(rs2[i]==0)
4977       {
4978         s2l=s2h=-1;
4979       }
4980       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4981         s1h=s2h=-1;
4982       }
4983       assert(s1l>=0);
4984       #ifdef DESTRUCTIVE_WRITEBACK
4985       if(rs1[i]) {
4986         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4987           emit_loadreg(rs1[i],s1l);
4988       } 
4989       else {
4990         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4991           emit_loadreg(rs2[i],s1l);
4992       }
4993       if(s2l>=0)
4994         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4995           emit_loadreg(rs2[i],s2l);
4996       #endif
4997       int hr=0;
4998       int addr=-1,alt=-1,ntaddr=-1;
4999       while(hr<HOST_REGS)
5000       {
5001         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5002            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5003            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5004         {
5005           addr=hr++;break;
5006         }
5007         hr++;
5008       }
5009       while(hr<HOST_REGS)
5010       {
5011         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5012            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5013            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5014         {
5015           alt=hr++;break;
5016         }
5017         hr++;
5018       }
5019       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5020       {
5021         while(hr<HOST_REGS)
5022         {
5023           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5024              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5025              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5026           {
5027             ntaddr=hr;break;
5028           }
5029           hr++;
5030         }
5031         assert(hr<HOST_REGS);
5032       }
5033       if((opcode[i]&0x2f)==4) // BEQ
5034       {
5035         #ifdef HAVE_CMOV_IMM
5036         if(s1h<0) {
5037           if(s2l>=0) emit_cmp(s1l,s2l);
5038           else emit_test(s1l,s1l);
5039           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5040         }
5041         else
5042         #endif
5043         {
5044           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5045           if(s1h>=0) {
5046             if(s2h>=0) emit_cmp(s1h,s2h);
5047             else emit_test(s1h,s1h);
5048             emit_cmovne_reg(alt,addr);
5049           }
5050           if(s2l>=0) emit_cmp(s1l,s2l);
5051           else emit_test(s1l,s1l);
5052           emit_cmovne_reg(alt,addr);
5053         }
5054       }
5055       if((opcode[i]&0x2f)==5) // BNE
5056       {
5057         #ifdef HAVE_CMOV_IMM
5058         if(s1h<0) {
5059           if(s2l>=0) emit_cmp(s1l,s2l);
5060           else emit_test(s1l,s1l);
5061           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5062         }
5063         else
5064         #endif
5065         {
5066           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5067           if(s1h>=0) {
5068             if(s2h>=0) emit_cmp(s1h,s2h);
5069             else emit_test(s1h,s1h);
5070             emit_cmovne_reg(alt,addr);
5071           }
5072           if(s2l>=0) emit_cmp(s1l,s2l);
5073           else emit_test(s1l,s1l);
5074           emit_cmovne_reg(alt,addr);
5075         }
5076       }
5077       if((opcode[i]&0x2f)==6) // BLEZ
5078       {
5079         //emit_movimm(ba[i],alt);
5080         //emit_movimm(start+i*4+8,addr);
5081         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5082         emit_cmpimm(s1l,1);
5083         if(s1h>=0) emit_mov(addr,ntaddr);
5084         emit_cmovl_reg(alt,addr);
5085         if(s1h>=0) {
5086           emit_test(s1h,s1h);
5087           emit_cmovne_reg(ntaddr,addr);
5088           emit_cmovs_reg(alt,addr);
5089         }
5090       }
5091       if((opcode[i]&0x2f)==7) // BGTZ
5092       {
5093         //emit_movimm(ba[i],addr);
5094         //emit_movimm(start+i*4+8,ntaddr);
5095         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5096         emit_cmpimm(s1l,1);
5097         if(s1h>=0) emit_mov(addr,alt);
5098         emit_cmovl_reg(ntaddr,addr);
5099         if(s1h>=0) {
5100           emit_test(s1h,s1h);
5101           emit_cmovne_reg(alt,addr);
5102           emit_cmovs_reg(ntaddr,addr);
5103         }
5104       }
5105       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5106       {
5107         //emit_movimm(ba[i],alt);
5108         //emit_movimm(start+i*4+8,addr);
5109         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5110         if(s1h>=0) emit_test(s1h,s1h);
5111         else emit_test(s1l,s1l);
5112         emit_cmovs_reg(alt,addr);
5113       }
5114       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5115       {
5116         //emit_movimm(ba[i],addr);
5117         //emit_movimm(start+i*4+8,alt);
5118         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5119         if(s1h>=0) emit_test(s1h,s1h);
5120         else emit_test(s1l,s1l);
5121         emit_cmovs_reg(alt,addr);
5122       }
5123       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5124         if(source[i]&0x10000) // BC1T
5125         {
5126           //emit_movimm(ba[i],alt);
5127           //emit_movimm(start+i*4+8,addr);
5128           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5129           emit_testimm(s1l,0x800000);
5130           emit_cmovne_reg(alt,addr);
5131         }
5132         else // BC1F
5133         {
5134           //emit_movimm(ba[i],addr);
5135           //emit_movimm(start+i*4+8,alt);
5136           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5137           emit_testimm(s1l,0x800000);
5138           emit_cmovne_reg(alt,addr);
5139         }
5140       }
5141       emit_writeword(addr,(int)&pcaddr);
5142     }
5143     else
5144     if(itype[i]==RJUMP)
5145     {
5146       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5147       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5148         r=get_reg(branch_regs[i].regmap,RTEMP);
5149       }
5150       emit_writeword(r,(int)&pcaddr);
5151     }
5152     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5153   }
5154   // Update cycle count
5155   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5156   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5157   emit_call((int)cc_interrupt);
5158   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
5159   if(stubs[n][6]==TAKEN) {
5160     if(internal_branch(branch_regs[i].is32,ba[i]))
5161       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5162     else if(itype[i]==RJUMP) {
5163       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5164         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5165       else
5166         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5167     }
5168   }else if(stubs[n][6]==NOTTAKEN) {
5169     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5170     else load_all_regs(branch_regs[i].regmap);
5171   }else if(stubs[n][6]==NULLDS) {
5172     // Delay slot instruction is nullified ("likely" branch)
5173     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5174     else load_all_regs(regs[i].regmap);
5175   }else{
5176     load_all_regs(branch_regs[i].regmap);
5177   }
5178   emit_jmp(stubs[n][2]); // return address
5179   
5180   /* This works but uses a lot of memory...
5181   emit_readword((int)&last_count,ECX);
5182   emit_add(HOST_CCREG,ECX,EAX);
5183   emit_writeword(EAX,(int)&Count);
5184   emit_call((int)gen_interupt);
5185   emit_readword((int)&Count,HOST_CCREG);
5186   emit_readword((int)&next_interupt,EAX);
5187   emit_readword((int)&pending_exception,EBX);
5188   emit_writeword(EAX,(int)&last_count);
5189   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5190   emit_test(EBX,EBX);
5191   int jne_instr=(int)out;
5192   emit_jne(0);
5193   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5194   load_all_regs(branch_regs[i].regmap);
5195   emit_jmp(stubs[n][2]); // return address
5196   set_jump_target(jne_instr,(int)out);
5197   emit_readword((int)&pcaddr,EAX);
5198   // Call get_addr_ht instead of doing the hash table here.
5199   // This code is executed infrequently and takes up a lot of space
5200   // so smaller is better.
5201   emit_storereg(CCREG,HOST_CCREG);
5202   emit_pushreg(EAX);
5203   emit_call((int)get_addr_ht);
5204   emit_loadreg(CCREG,HOST_CCREG);
5205   emit_addimm(ESP,4,ESP);
5206   emit_jmpreg(EAX);*/
5207 }
5208
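/* Record a direct-jump site: link_addr[] stores the address of the
   emit_jmp(0)/emit_jne(0) placeholder, the target virtual address, and a flag
   (callers pass the internal_branch() result, doubled by the Cortex-A8 hack)
   so a later linking pass can patch the placeholder to the compiled target or
   to an external-jump stub.  This is how it appears to be used by the
   *jump_assemble functions below. */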
5209 void add_to_linker(int addr,int target,int ext)
5210 {
5211   link_addr[linkcount][0]=addr;
5212   link_addr[linkcount][1]=target;
5213   link_addr[linkcount][2]=ext;  
5214   linkcount++;
5215 }
5216
5217 static void ujump_assemble_write_ra(int i)
5218 {
5219   int rt;
5220   unsigned int return_address;
5221   rt=get_reg(branch_regs[i].regmap,31);
5222   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5223   //assert(rt>=0);
5224   return_address=start+i*4+8;
5225   if(rt>=0) {
5226     #ifdef USE_MINI_HT
5227     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5228       int temp=-1; // note: must be ds-safe
5229       #ifdef HOST_TEMPREG
5230       temp=HOST_TEMPREG;
5231       #endif
5232       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5233       else emit_movimm(return_address,rt);
5234     }
5235     else
5236     #endif
5237     {
5238       #ifdef REG_PREFETCH
5239       if(temp>=0) 
5240       {
5241         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5242       }
5243       #endif
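      // FIXME: the REG_PREFETCH block above refers to temp/i_regmap, which are
      // not defined in this helper (they apparently come from the pre-split
      // code in ujump_assemble); it will not compile if REG_PREFETCH is
      // defined.  The same applies to rjump_assemble_write_ra() below.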
5244       emit_movimm(return_address,rt); // PC into link register
5245       #ifdef IMM_PREFETCH
5246       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5247       #endif
5248     }
5249   }
5250 }
5251
5252 void ujump_assemble(int i,struct regstat *i_regs)
5253 {
5254   signed char *i_regmap=i_regs->regmap;
5255   int ra_done=0;
5256   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5257   address_generation(i+1,i_regs,regs[i].regmap_entry);
5258   #ifdef REG_PREFETCH
5259   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5260   if(rt1[i]==31&&temp>=0) 
5261   {
5262     int return_address=start+i*4+8;
5263     if(get_reg(branch_regs[i].regmap,31)>0) 
5264     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5265   }
5266   #endif
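  // If the delay slot reads $31, the return address must be visible to it,
  // so it is written back before ds_assemble(); ra_done records that the
  // write already happened so it is not repeated after the delay slot.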
5267   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5268     ujump_assemble_write_ra(i); // writeback ra for DS
5269     ra_done=1;
5270   }
5271   ds_assemble(i+1,i_regs);
5272   uint64_t bc_unneeded=branch_regs[i].u;
5273   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5274   bc_unneeded|=1|(1LL<<rt1[i]);
5275   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5276   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5277                 bc_unneeded,bc_unneeded_upper);
5278   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5279   if(!ra_done&&rt1[i]==31)
5280     ujump_assemble_write_ra(i);
5281   int cc,adj;
5282   cc=get_reg(branch_regs[i].regmap,CCREG);
5283   assert(cc==HOST_CCREG);
5284   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5285   #ifdef REG_PREFETCH
5286   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5287   #endif
5288   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5289   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5290   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5291   if(internal_branch(branch_regs[i].is32,ba[i]))
5292     assem_debug("branch: internal\n");
5293   else
5294     assem_debug("branch: external\n");
5295   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5296     ds_assemble_entry(i);
5297   }
5298   else {
5299     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5300     emit_jmp(0);
5301   }
5302 }
5303
5304 static void rjump_assemble_write_ra(int i)
5305 {
5306   int rt,return_address;
5307   assert(rt1[i+1]!=rt1[i]);
5308   assert(rt2[i+1]!=rt1[i]);
5309   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5310   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5311   assert(rt>=0);
5312   return_address=start+i*4+8;
5313   #ifdef REG_PREFETCH
5314   if(temp>=0) 
5315   {
5316     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5317   }
5318   #endif
5319   emit_movimm(return_address,rt); // PC into link register
5320   #ifdef IMM_PREFETCH
5321   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5322   #endif
5323 }
5324
5325 void rjump_assemble(int i,struct regstat *i_regs)
5326 {
5327   signed char *i_regmap=i_regs->regmap;
5328   int temp;
5329   int rs,cc,adj;
5330   int ra_done=0;
5331   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5332   assert(rs>=0);
5333   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5334     // Delay slot abuse, make a copy of the branch address register
5335     temp=get_reg(branch_regs[i].regmap,RTEMP);
5336     assert(temp>=0);
5337     assert(regs[i].regmap[temp]==RTEMP);
5338     emit_mov(rs,temp);
5339     rs=temp;
5340   }
5341   address_generation(i+1,i_regs,regs[i].regmap_entry);
5342   #ifdef REG_PREFETCH
5343   if(rt1[i]==31) 
5344   {
5345     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5346       int return_address=start+i*4+8;
5347       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5348     }
5349   }
5350   #endif
5351   #ifdef USE_MINI_HT
5352   if(rs1[i]==31) {
5353     int rh=get_reg(regs[i].regmap,RHASH);
5354     if(rh>=0) do_preload_rhash(rh);
5355   }
5356   #endif
5357   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5358     rjump_assemble_write_ra(i);
5359     ra_done=1;
5360   }
5361   ds_assemble(i+1,i_regs);
5362   uint64_t bc_unneeded=branch_regs[i].u;
5363   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5364   bc_unneeded|=1|(1LL<<rt1[i]);
5365   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5366   bc_unneeded&=~(1LL<<rs1[i]);
5367   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5368                 bc_unneeded,bc_unneeded_upper);
5369   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5370   if(!ra_done&&rt1[i]!=0)
5371     rjump_assemble_write_ra(i);
5372   cc=get_reg(branch_regs[i].regmap,CCREG);
5373   assert(cc==HOST_CCREG);
5374   #ifdef USE_MINI_HT
5375   int rh=get_reg(branch_regs[i].regmap,RHASH);
5376   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5377   if(rs1[i]==31) {
5378     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5379     do_preload_rhtbl(ht);
5380     do_rhash(rs,rh);
5381   }
5382   #endif
5383   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5384   #ifdef DESTRUCTIVE_WRITEBACK
5385   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5386     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5387       emit_loadreg(rs1[i],rs);
5388     }
5389   }
5390   #endif
5391   #ifdef REG_PREFETCH
5392   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5393   #endif
5394   #ifdef USE_MINI_HT
5395   if(rs1[i]==31) {
5396     do_miniht_load(ht,rh);
5397   }
5398   #endif
5399   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5400   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5401   //assert(adj==0);
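  // The cycle counter in HOST_CCREG appears to run as a negative value
  // counting up towards zero: once adding this block's cost makes it
  // non-negative, the jns below is taken into the CC_STUB, which calls
  // cc_interrupt and then resumes at jump_vaddr_reg[rs] (see do_ccstub above).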
5402   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5403   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5404   emit_jns(0);
5405   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5406   #ifdef USE_MINI_HT
5407   if(rs1[i]==31) {
5408     do_miniht_jump(rs,rh,ht);
5409   }
5410   else
5411   #endif
5412   {
5413     //if(rs!=EAX) emit_mov(rs,EAX);
5414     //emit_jmp((int)jump_vaddr_eax);
5415     emit_jmp(jump_vaddr_reg[rs]);
5416   }
5417   /* Check hash table
5418   temp=!rs;
5419   emit_mov(rs,temp);
5420   emit_shrimm(rs,16,rs);
5421   emit_xor(temp,rs,rs);
5422   emit_movzwl_reg(rs,rs);
5423   emit_shlimm(rs,4,rs);
5424   emit_cmpmem_indexed((int)hash_table,rs,temp);
5425   emit_jne((int)out+14);
5426   emit_readword_indexed((int)hash_table+4,rs,rs);
5427   emit_jmpreg(rs);
5428   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5429   emit_addimm_no_flags(8,rs);
5430   emit_jeq((int)out-17);
5431   // No hit on hash table, call compiler
5432   emit_pushreg(temp);
5433 //DEBUG >
5434 #ifdef DEBUG_CYCLE_COUNT
5435   emit_readword((int)&last_count,ECX);
5436   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5437   emit_readword((int)&next_interupt,ECX);
5438   emit_writeword(HOST_CCREG,(int)&Count);
5439   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5440   emit_writeword(ECX,(int)&last_count);
5441 #endif
5442 //DEBUG <
5443   emit_storereg(CCREG,HOST_CCREG);
5444   emit_call((int)get_addr);
5445   emit_loadreg(CCREG,HOST_CCREG);
5446   emit_addimm(ESP,4,ESP);
5447   emit_jmpreg(EAX);*/
5448   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5449   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5450   #endif
5451 }
5452
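/* Conditional branches on two GPRs (BEQ/BNE/BLEZ/BGTZ and their "likely"
   forms).  When ooo[i] is set the delay slot is assembled first, so only one
   copy of it is needed and the compare can jump straight to the target;
   otherwise the compare is emitted first and the delay slot is assembled
   separately on the taken and (unless "likely") the not-taken path. */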
5453 void cjump_assemble(int i,struct regstat *i_regs)
5454 {
5455   signed char *i_regmap=i_regs->regmap;
5456   int cc;
5457   int match;
5458   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5459   assem_debug("match=%d\n",match);
5460   int s1h,s1l,s2h,s2l;
5461   int prev_cop1_usable=cop1_usable;
5462   int unconditional=0,nop=0;
5463   int only32=0;
5464   int invert=0;
5465   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5466   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5467   if(!match) invert=1;
5468   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5469   if(i>(ba[i]-start)>>2) invert=1;
5470   #endif
5471   
5472   if(ooo[i]) {
5473     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5474     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5475     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5476     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5477   }
5478   else {
5479     s1l=get_reg(i_regmap,rs1[i]);
5480     s1h=get_reg(i_regmap,rs1[i]|64);
5481     s2l=get_reg(i_regmap,rs2[i]);
5482     s2h=get_reg(i_regmap,rs2[i]|64);
5483   }
5484   if(rs1[i]==0&&rs2[i]==0)
5485   {
5486     if(opcode[i]&1) nop=1;
5487     else unconditional=1;
5488     //assert(opcode[i]!=5);
5489     //assert(opcode[i]!=7);
5490     //assert(opcode[i]!=0x15);
5491     //assert(opcode[i]!=0x17);
5492   }
5493   else if(rs1[i]==0)
5494   {
5495     s1l=s2l;s1h=s2h;
5496     s2l=s2h=-1;
5497     only32=(regs[i].was32>>rs2[i])&1;
5498   }
5499   else if(rs2[i]==0)
5500   {
5501     s2l=s2h=-1;
5502     only32=(regs[i].was32>>rs1[i])&1;
5503   }
5504   else {
5505     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5506   }
5507
5508   if(ooo[i]) {
5509     // Out of order execution (delay slot first)
5510     //printf("OOOE\n");
5511     address_generation(i+1,i_regs,regs[i].regmap_entry);
5512     ds_assemble(i+1,i_regs);
5513     int adj;
5514     uint64_t bc_unneeded=branch_regs[i].u;
5515     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5516     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5517     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5518     bc_unneeded|=1;
5519     bc_unneeded_upper|=1;
5520     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5521                   bc_unneeded,bc_unneeded_upper);
5522     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5523     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5524     cc=get_reg(branch_regs[i].regmap,CCREG);
5525     assert(cc==HOST_CCREG);
5526     if(unconditional) 
5527       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5528     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5529     //assem_debug("cycle count (adj)\n");
5530     if(unconditional) {
5531       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5532       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { // skipped for an idle loop (branch to self with a NOP delay slot)
5533         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5534         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5535         if(internal)
5536           assem_debug("branch: internal\n");
5537         else
5538           assem_debug("branch: external\n");
5539         if(internal&&is_ds[(ba[i]-start)>>2]) {
5540           ds_assemble_entry(i);
5541         }
5542         else {
5543           add_to_linker((int)out,ba[i],internal);
5544           emit_jmp(0);
5545         }
5546         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5547         if(((u_int)out)&7) emit_addnop(0);
5548         #endif
5549       }
5550     }
5551     else if(nop) {
5552       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5553       int jaddr=(int)out;
5554       emit_jns(0);
5555       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5556     }
5557     else {
5558       int taken=0,nottaken=0,nottaken1=0;
5559       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5560       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5561       if(!only32)
5562       {
5563         assert(s1h>=0);
5564         if(opcode[i]==4) // BEQ
5565         {
5566           if(s2h>=0) emit_cmp(s1h,s2h);
5567           else emit_test(s1h,s1h);
5568           nottaken1=(int)out;
5569           emit_jne(1);
5570         }
5571         if(opcode[i]==5) // BNE
5572         {
5573           if(s2h>=0) emit_cmp(s1h,s2h);
5574           else emit_test(s1h,s1h);
5575           if(invert) taken=(int)out;
5576           else add_to_linker((int)out,ba[i],internal);
5577           emit_jne(0);
5578         }
5579         if(opcode[i]==6) // BLEZ
5580         {
5581           emit_test(s1h,s1h);
5582           if(invert) taken=(int)out;
5583           else add_to_linker((int)out,ba[i],internal);
5584           emit_js(0);
5585           nottaken1=(int)out;
5586           emit_jne(1);
5587         }
5588         if(opcode[i]==7) // BGTZ
5589         {
5590           emit_test(s1h,s1h);
5591           nottaken1=(int)out;
5592           emit_js(1);
5593           if(invert) taken=(int)out;
5594           else add_to_linker((int)out,ba[i],internal);
5595           emit_jne(0);
5596         }
5597       } // if(!only32)
5598           
5599       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5600       assert(s1l>=0);
5601       if(opcode[i]==4) // BEQ
5602       {
5603         if(s2l>=0) emit_cmp(s1l,s2l);
5604         else emit_test(s1l,s1l);
5605         if(invert){
5606           nottaken=(int)out;
5607           emit_jne(1);
5608         }else{
5609           add_to_linker((int)out,ba[i],internal);
5610           emit_jeq(0);
5611         }
5612       }
5613       if(opcode[i]==5) // BNE
5614       {
5615         if(s2l>=0) emit_cmp(s1l,s2l);
5616         else emit_test(s1l,s1l);
5617         if(invert){
5618           nottaken=(int)out;
5619           emit_jeq(1);
5620         }else{
5621           add_to_linker((int)out,ba[i],internal);
5622           emit_jne(0);
5623         }
5624       }
5625       if(opcode[i]==6) // BLEZ
5626       {
5627         emit_cmpimm(s1l,1);
5628         if(invert){
5629           nottaken=(int)out;
5630           emit_jge(1);
5631         }else{
5632           add_to_linker((int)out,ba[i],internal);
5633           emit_jl(0);
5634         }
5635       }
5636       if(opcode[i]==7) // BGTZ
5637       {
5638         emit_cmpimm(s1l,1);
5639         if(invert){
5640           nottaken=(int)out;
5641           emit_jl(1);
5642         }else{
5643           add_to_linker((int)out,ba[i],internal);
5644           emit_jge(0);
5645         }
5646       }
5647       if(invert) {
5648         if(taken) set_jump_target(taken,(int)out);
5649         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5650         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5651           if(adj) {
5652             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5653             add_to_linker((int)out,ba[i],internal);
5654           }else{
5655             emit_addnop(13);
5656             add_to_linker((int)out,ba[i],internal*2);
5657           }
5658           emit_jmp(0);
5659         }else
5660         #endif
5661         {
5662           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5663           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5664           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5665           if(internal)
5666             assem_debug("branch: internal\n");
5667           else
5668             assem_debug("branch: external\n");
5669           if(internal&&is_ds[(ba[i]-start)>>2]) {
5670             ds_assemble_entry(i);
5671           }
5672           else {
5673             add_to_linker((int)out,ba[i],internal);
5674             emit_jmp(0);
5675           }
5676         }
5677         set_jump_target(nottaken,(int)out);
5678       }
5679
5680       if(nottaken1) set_jump_target(nottaken1,(int)out);
5681       if(adj) {
5682         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5683       }
5684     } // (!unconditional)
5685   } // if(ooo)
5686   else
5687   {
5688     // In-order execution (branch first)
5689     //if(likely[i]) printf("IOL\n");
5690     //else
5691     //printf("IOE\n");
5692     int taken=0,nottaken=0,nottaken1=0;
5693     if(!unconditional&&!nop) {
5694       if(!only32)
5695       {
5696         assert(s1h>=0);
5697         if((opcode[i]&0x2f)==4) // BEQ
5698         {
5699           if(s2h>=0) emit_cmp(s1h,s2h);
5700           else emit_test(s1h,s1h);
5701           nottaken1=(int)out;
5702           emit_jne(2);
5703         }
5704         if((opcode[i]&0x2f)==5) // BNE
5705         {
5706           if(s2h>=0) emit_cmp(s1h,s2h);
5707           else emit_test(s1h,s1h);
5708           taken=(int)out;
5709           emit_jne(1);
5710         }
5711         if((opcode[i]&0x2f)==6) // BLEZ
5712         {
5713           emit_test(s1h,s1h);
5714           taken=(int)out;
5715           emit_js(1);
5716           nottaken1=(int)out;
5717           emit_jne(2);
5718         }
5719         if((opcode[i]&0x2f)==7) // BGTZ
5720         {
5721           emit_test(s1h,s1h);
5722           nottaken1=(int)out;
5723           emit_js(2);
5724           taken=(int)out;
5725           emit_jne(1);
5726         }
5727       } // if(!only32)
5728           
5729       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5730       assert(s1l>=0);
5731       if((opcode[i]&0x2f)==4) // BEQ
5732       {
5733         if(s2l>=0) emit_cmp(s1l,s2l);
5734         else emit_test(s1l,s1l);
5735         nottaken=(int)out;
5736         emit_jne(2);
5737       }
5738       if((opcode[i]&0x2f)==5) // BNE
5739       {
5740         if(s2l>=0) emit_cmp(s1l,s2l);
5741         else emit_test(s1l,s1l);
5742         nottaken=(int)out;
5743         emit_jeq(2);
5744       }
5745       if((opcode[i]&0x2f)==6) // BLEZ
5746       {
5747         emit_cmpimm(s1l,1);
5748         nottaken=(int)out;
5749         emit_jge(2);
5750       }
5751       if((opcode[i]&0x2f)==7) // BGTZ
5752       {
5753         emit_cmpimm(s1l,1);
5754         nottaken=(int)out;
5755         emit_jl(2);
5756       }
5757     } // if(!unconditional)
5758     int adj;
5759     uint64_t ds_unneeded=branch_regs[i].u;
5760     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5761     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5762     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5763     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5764     ds_unneeded|=1;
5765     ds_unneeded_upper|=1;
5766     // branch taken
5767     if(!nop) {
5768       if(taken) set_jump_target(taken,(int)out);
5769       assem_debug("1:\n");
5770       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5771                     ds_unneeded,ds_unneeded_upper);
5772       // load regs
5773       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5774       address_generation(i+1,&branch_regs[i],0);
5775       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5776       ds_assemble(i+1,&branch_regs[i]);
5777       cc=get_reg(branch_regs[i].regmap,CCREG);
5778       if(cc==-1) {
5779         emit_loadreg(CCREG,cc=HOST_CCREG);
5780         // CHECK: Is the following instruction (fall thru) allocated ok?
5781       }
5782       assert(cc==HOST_CCREG);
5783       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5784       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5785       assem_debug("cycle count (adj)\n");
5786       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5787       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5788       if(internal)
5789         assem_debug("branch: internal\n");
5790       else
5791         assem_debug("branch: external\n");
5792       if(internal&&is_ds[(ba[i]-start)>>2]) {
5793         ds_assemble_entry(i);
5794       }
5795       else {
5796         add_to_linker((int)out,ba[i],internal);
5797         emit_jmp(0);
5798       }
5799     }
5800     // branch not taken
5801     cop1_usable=prev_cop1_usable;
5802     if(!unconditional) {
5803       if(nottaken1) set_jump_target(nottaken1,(int)out);
5804       set_jump_target(nottaken,(int)out);
5805       assem_debug("2:\n");
5806       if(!likely[i]) {
5807         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5808                       ds_unneeded,ds_unneeded_upper);
5809         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5810         address_generation(i+1,&branch_regs[i],0);
5811         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5812         ds_assemble(i+1,&branch_regs[i]);
5813       }
5814       cc=get_reg(branch_regs[i].regmap,CCREG);
5815       if(cc==-1&&!likely[i]) {
5816         // Cycle count isn't in a register, temporarily load it then write it out
5817         emit_loadreg(CCREG,HOST_CCREG);
5818         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5819         int jaddr=(int)out;
5820         emit_jns(0);
5821         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5822         emit_storereg(CCREG,HOST_CCREG);
5823       }
5824       else{
5825         cc=get_reg(i_regmap,CCREG);
5826         assert(cc==HOST_CCREG);
5827         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5828         int jaddr=(int)out;
5829         emit_jns(0);
5830         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5831       }
5832     }
5833   }
5834 }
5835
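/* REGIMM branches (BLTZ/BGEZ and their AL/L variants, opcode 1).  rt1[i]==31
   marks a linking variant; as the code below notes, the return address is
   written even when the branch is not taken. */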
5836 void sjump_assemble(int i,struct regstat *i_regs)
5837 {
5838   signed char *i_regmap=i_regs->regmap;
5839   int cc;
5840   int match;
5841   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5842   assem_debug("smatch=%d\n",match);
5843   int s1h,s1l;
5844   int prev_cop1_usable=cop1_usable;
5845   int unconditional=0,nevertaken=0;
5846   int only32=0;
5847   int invert=0;
5848   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5849   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5850   if(!match) invert=1;
5851   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5852   if(i>(ba[i]-start)>>2) invert=1;
5853   #endif
5854
5855   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5856   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5857
5858   if(ooo[i]) {
5859     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5860     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5861   }
5862   else {
5863     s1l=get_reg(i_regmap,rs1[i]);
5864     s1h=get_reg(i_regmap,rs1[i]|64);
5865   }
5866   if(rs1[i]==0)
5867   {
5868     if(opcode2[i]&1) unconditional=1;
5869     else nevertaken=1;
5870     // These are never taken (r0 is never less than zero)
5871     //assert(opcode2[i]!=0);
5872     //assert(opcode2[i]!=2);
5873     //assert(opcode2[i]!=0x10);
5874     //assert(opcode2[i]!=0x12);
5875   }
5876   else {
5877     only32=(regs[i].was32>>rs1[i])&1;
5878   }
5879
5880   if(ooo[i]) {
5881     // Out of order execution (delay slot first)
5882     //printf("OOOE\n");
5883     address_generation(i+1,i_regs,regs[i].regmap_entry);
5884     ds_assemble(i+1,i_regs);
5885     int adj;
5886     uint64_t bc_unneeded=branch_regs[i].u;
5887     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5888     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5889     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5890     bc_unneeded|=1;
5891     bc_unneeded_upper|=1;
5892     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5893                   bc_unneeded,bc_unneeded_upper);
5894     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5895     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5896     if(rt1[i]==31) {
5897       int rt,return_address;
5898       rt=get_reg(branch_regs[i].regmap,31);
5899       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5900       if(rt>=0) {
5901         // Save the PC even if the branch is not taken
5902         return_address=start+i*4+8;
5903         emit_movimm(return_address,rt); // PC into link register
5904         #ifdef IMM_PREFETCH
5905         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5906         #endif
5907       }
5908     }
5909     cc=get_reg(branch_regs[i].regmap,CCREG);
5910     assert(cc==HOST_CCREG);
5911     if(unconditional) 
5912       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5913     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5914     assem_debug("cycle count (adj)\n");
5915     if(unconditional) {
5916       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5917       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) { // skipped for an idle loop (branch to self with a NOP delay slot)
5918         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5919         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5920         if(internal)
5921           assem_debug("branch: internal\n");
5922         else
5923           assem_debug("branch: external\n");
5924         if(internal&&is_ds[(ba[i]-start)>>2]) {
5925           ds_assemble_entry(i);
5926         }
5927         else {
5928           add_to_linker((int)out,ba[i],internal);
5929           emit_jmp(0);
5930         }
5931         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5932         if(((u_int)out)&7) emit_addnop(0);
5933         #endif
5934       }
5935     }
5936     else if(nevertaken) {
5937       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5938       int jaddr=(int)out;
5939       emit_jns(0);
5940       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5941     }
5942     else {
5943       int nottaken=0;
5944       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5945       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5946       if(!only32)
5947       {
5948         assert(s1h>=0);
5949         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5950         {
5951           emit_test(s1h,s1h);
5952           if(invert){
5953             nottaken=(int)out;
5954             emit_jns(1);
5955           }else{
5956             add_to_linker((int)out,ba[i],internal);
5957             emit_js(0);
5958           }
5959         }
5960         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5961         {
5962           emit_test(s1h,s1h);
5963           if(invert){
5964             nottaken=(int)out;
5965             emit_js(1);
5966           }else{
5967             add_to_linker((int)out,ba[i],internal);
5968             emit_jns(0);
5969           }
5970         }
5971       } // if(!only32)
5972       else
5973       {
5974         assert(s1l>=0);
5975         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5976         {
5977           emit_test(s1l,s1l);
5978           if(invert){
5979             nottaken=(int)out;
5980             emit_jns(1);
5981           }else{
5982             add_to_linker((int)out,ba[i],internal);
5983             emit_js(0);
5984           }
5985         }
5986         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5987         {
5988           emit_test(s1l,s1l);
5989           if(invert){
5990             nottaken=(int)out;
5991             emit_js(1);
5992           }else{
5993             add_to_linker((int)out,ba[i],internal);
5994             emit_jns(0);
5995           }
5996         }
5997       } // if(!only32)
5998           
5999       if(invert) {
6000         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6001         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6002           if(adj) {
6003             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6004             add_to_linker((int)out,ba[i],internal);
6005           }else{
6006             emit_addnop(13);
6007             add_to_linker((int)out,ba[i],internal*2);
6008           }
6009           emit_jmp(0);
6010         }else
6011         #endif
6012         {
6013           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6014           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6015           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6016           if(internal)
6017             assem_debug("branch: internal\n");
6018           else
6019             assem_debug("branch: external\n");
6020           if(internal&&is_ds[(ba[i]-start)>>2]) {
6021             ds_assemble_entry(i);
6022           }
6023           else {
6024             add_to_linker((int)out,ba[i],internal);
6025             emit_jmp(0);
6026           }
6027         }
6028         set_jump_target(nottaken,(int)out);
6029       }
6030
6031       if(adj) {
6032         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6033       }
6034     } // (!unconditional)
6035   } // if(ooo)
6036   else
6037   {
6038     // In-order execution (branch first)
6039     //printf("IOE\n");
6040     int nottaken=0;
6041     if(rt1[i]==31) {
6042       int rt,return_address;
6043       rt=get_reg(branch_regs[i].regmap,31);
6044       if(rt>=0) {
6045         // Save the PC even if the branch is not taken
6046         return_address=start+i*4+8;
6047         emit_movimm(return_address,rt); // PC into link register
6048         #ifdef IMM_PREFETCH
6049         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6050         #endif
6051       }
6052     }
6053     if(!unconditional) {
6054       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6055       if(!only32)
6056       {
6057         assert(s1h>=0);
6058         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6059         {
6060           emit_test(s1h,s1h);
6061           nottaken=(int)out;
6062           emit_jns(1);
6063         }
6064         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6065         {
6066           emit_test(s1h,s1h);
6067           nottaken=(int)out;
6068           emit_js(1);
6069         }
6070       } // if(!only32)
6071       else
6072       {
6073         assert(s1l>=0);
6074         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6075         {
6076           emit_test(s1l,s1l);
6077           nottaken=(int)out;
6078           emit_jns(1);
6079         }
6080         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6081         {
6082           emit_test(s1l,s1l);
6083           nottaken=(int)out;
6084           emit_js(1);
6085         }
6086       }
6087     } // if(!unconditional)
6088     int adj;
6089     uint64_t ds_unneeded=branch_regs[i].u;
6090     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6091     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6092     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6093     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6094     ds_unneeded|=1;
6095     ds_unneeded_upper|=1;
6096     // branch taken
6097     if(!nevertaken) {
6098       //assem_debug("1:\n");
6099       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6100                     ds_unneeded,ds_unneeded_upper);
6101       // load regs
6102       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6103       address_generation(i+1,&branch_regs[i],0);
6104       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6105       ds_assemble(i+1,&branch_regs[i]);
6106       cc=get_reg(branch_regs[i].regmap,CCREG);
6107       if(cc==-1) {
6108         emit_loadreg(CCREG,cc=HOST_CCREG);
6109         // CHECK: Is the following instruction (fall thru) allocated ok?
6110       }
6111       assert(cc==HOST_CCREG);
6112       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6113       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6114       assem_debug("cycle count (adj)\n");
6115       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6116       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6117       if(internal)
6118         assem_debug("branch: internal\n");
6119       else
6120         assem_debug("branch: external\n");
6121       if(internal&&is_ds[(ba[i]-start)>>2]) {
6122         ds_assemble_entry(i);
6123       }
6124       else {
6125         add_to_linker((int)out,ba[i],internal);
6126         emit_jmp(0);
6127       }
6128     }
6129     // branch not taken
6130     cop1_usable=prev_cop1_usable;
6131     if(!unconditional) {
6132       set_jump_target(nottaken,(int)out);
6133       assem_debug("1:\n");
6134       if(!likely[i]) {
6135         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6136                       ds_unneeded,ds_unneeded_upper);
6137         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6138         address_generation(i+1,&branch_regs[i],0);
6139         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6140         ds_assemble(i+1,&branch_regs[i]);
6141       }
6142       cc=get_reg(branch_regs[i].regmap,CCREG);
6143       if(cc==-1&&!likely[i]) {
6144         // Cycle count isn't in a register, temporarily load it then write it out
6145         emit_loadreg(CCREG,HOST_CCREG);
6146         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6147         int jaddr=(int)out;
6148         emit_jns(0);
6149         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6150         emit_storereg(CCREG,HOST_CCREG);
6151       }
6152       else{
6153         cc=get_reg(i_regmap,CCREG);
6154         assert(cc==HOST_CCREG);
6155         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6156         int jaddr=(int)out;
6157         emit_jns(0);
6158         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6159       }
6160     }
6161   }
6162 }
6163
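/* COP1 condition branches (BC1F/BC1T and the likely forms).  Bit 23 of the
   FSREG copy of the FP status word is the condition flag tested with
   emit_testimm(fs,0x800000); if COP1 has not been proven usable yet, an
   FP_STUB check on CSREG bit 29 is emitted first. */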
6164 void fjump_assemble(int i,struct regstat *i_regs)
6165 {
6166   signed char *i_regmap=i_regs->regmap;
6167   int cc;
6168   int match;
6169   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6170   assem_debug("fmatch=%d\n",match);
6171   int fs,cs;
6172   int eaddr;
6173   int invert=0;
6174   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6175   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6176   if(!match) invert=1;
6177   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6178   if(i>(ba[i]-start)>>2) invert=1;
6179   #endif
6180
6181   if(ooo[i]) {
6182     fs=get_reg(branch_regs[i].regmap,FSREG);
6183     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6184   }
6185   else {
6186     fs=get_reg(i_regmap,FSREG);
6187   }
6188
6189   // Check cop1 unusable
6190   if(!cop1_usable) {
6191     cs=get_reg(i_regmap,CSREG);
6192     assert(cs>=0);
6193     emit_testimm(cs,0x20000000);
6194     eaddr=(int)out;
6195     emit_jeq(0);
6196     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6197     cop1_usable=1;
6198   }
6199
6200   if(ooo[i]) {
6201     // Out of order execution (delay slot first)
6202     //printf("OOOE\n");
6203     ds_assemble(i+1,i_regs);
6204     int adj;
6205     uint64_t bc_unneeded=branch_regs[i].u;
6206     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6207     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6208     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6209     bc_unneeded|=1;
6210     bc_unneeded_upper|=1;
6211     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6212                   bc_unneeded,bc_unneeded_upper);
6213     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6214     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6215     cc=get_reg(branch_regs[i].regmap,CCREG);
6216     assert(cc==HOST_CCREG);
6217     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6218     assem_debug("cycle count (adj)\n");
6219     if(1) {
6220       int nottaken=0;
6221       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6222       if(1) {
6223         assert(fs>=0);
6224         emit_testimm(fs,0x800000);
6225         if(source[i]&0x10000) // BC1T
6226         {
6227           if(invert){
6228             nottaken=(int)out;
6229             emit_jeq(1);
6230           }else{
6231             add_to_linker((int)out,ba[i],internal);
6232             emit_jne(0);
6233           }
6234         }
6235         else // BC1F
6236         {
6237           if(invert){
6238             nottaken=(int)out;
6239             emit_jne(1);
6240           }else{
6241             add_to_linker((int)out,ba[i],internal);
6242             emit_jeq(0);
6243           }
6244         }
6245       } // if(!only32)
6246           
6247       if(invert) {
6248         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6249         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6250         else if(match) emit_addnop(13);
6251         #endif
6252         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6253         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6254         if(internal)
6255           assem_debug("branch: internal\n");
6256         else
6257           assem_debug("branch: external\n");
6258         if(internal&&is_ds[(ba[i]-start)>>2]) {
6259           ds_assemble_entry(i);
6260         }
6261         else {
6262           add_to_linker((int)out,ba[i],internal);
6263           emit_jmp(0);
6264         }
6265         set_jump_target(nottaken,(int)out);
6266       }
6267
6268       if(adj) {
6269         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6270       }
6271     } // (!unconditional)
6272   } // if(ooo)
6273   else
6274   {
6275     // In-order execution (branch first)
6276     //printf("IOE\n");
6277     int nottaken=0;
6278     if(1) {
6279       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6280       if(1) {
6281         assert(fs>=0);
6282         emit_testimm(fs,0x800000);
6283         if(source[i]&0x10000) // BC1T
6284         {
6285           nottaken=(int)out;
6286           emit_jeq(1);
6287         }
6288         else // BC1F
6289         {
6290           nottaken=(int)out;
6291           emit_jne(1);
6292         }
6293       }
6294     } // if(!unconditional)
6295     int adj;
6296     uint64_t ds_unneeded=branch_regs[i].u;
6297     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6298     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6299     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6300     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6301     ds_unneeded|=1;
6302     ds_unneeded_upper|=1;
6303     // branch taken
6304     //assem_debug("1:\n");
6305     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6306                   ds_unneeded,ds_unneeded_upper);
6307     // load regs
6308     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6309     address_generation(i+1,&branch_regs[i],0);
6310     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6311     ds_assemble(i+1,&branch_regs[i]);
6312     cc=get_reg(branch_regs[i].regmap,CCREG);
6313     if(cc==-1) {
6314       emit_loadreg(CCREG,cc=HOST_CCREG);
6315       // CHECK: Is the following instruction (fall thru) allocated ok?
6316     }
6317     assert(cc==HOST_CCREG);
6318     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6319     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6320     assem_debug("cycle count (adj)\n");
6321     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6322     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6323     if(internal)
6324       assem_debug("branch: internal\n");
6325     else
6326       assem_debug("branch: external\n");
6327     if(internal&&is_ds[(ba[i]-start)>>2]) {
6328       ds_assemble_entry(i);
6329     }
6330     else {
6331       add_to_linker((int)out,ba[i],internal);
6332       emit_jmp(0);
6333     }
6334
6335     // branch not taken
6336     if(1) { // <- FIXME (don't need this)
6337       set_jump_target(nottaken,(int)out);
6338       assem_debug("1:\n");
6339       if(!likely[i]) {
6340         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6341                       ds_unneeded,ds_unneeded_upper);
6342         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6343         address_generation(i+1,&branch_regs[i],0);
6344         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6345         ds_assemble(i+1,&branch_regs[i]);
6346       }
6347       cc=get_reg(branch_regs[i].regmap,CCREG);
6348       if(cc==-1&&!likely[i]) {
6349         // Cycle count isn't in a register, temporarily load it then write it out
6350         emit_loadreg(CCREG,HOST_CCREG);
6351         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6352         int jaddr=(int)out;
6353         emit_jns(0);
6354         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6355         emit_storereg(CCREG,HOST_CCREG);
6356       }
6357       else{
6358         cc=get_reg(i_regmap,CCREG);
6359         assert(cc==HOST_CCREG);
6360         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6361         int jaddr=(int)out;
6362         emit_jns(0);
6363         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6364       }
6365     }
6366   }
6367 }
6368
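/* Branch that is the last instruction before a virtual page boundary, so its
   delay slot falls outside the current block.  The computed next PC is left in
   HOST_BTREG/branch_target and control jumps to a separately compiled entry
   for the delay slot; its lookup address gets the low bit set (start+i*4+5
   here, start+1 in pagespan_ds below) to keep it distinct from a normal entry
   at the same vaddr.  This is the apparent intent judging from pagespan_ds(). */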
6369 static void pagespan_assemble(int i,struct regstat *i_regs)
6370 {
6371   int s1l=get_reg(i_regs->regmap,rs1[i]);
6372   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6373   int s2l=get_reg(i_regs->regmap,rs2[i]);
6374   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6375   void *nt_branch=NULL;
6376   int taken=0;
6377   int nottaken=0;
6378   int unconditional=0;
6379   if(rs1[i]==0)
6380   {
6381     s1l=s2l;s1h=s2h;
6382     s2l=s2h=-1;
6383   }
6384   else if(rs2[i]==0)
6385   {
6386     s2l=s2h=-1;
6387   }
6388   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6389     s1h=s2h=-1;
6390   }
6391   int hr=0;
6392   int addr,alt,ntaddr;
6393   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6394   else {
6395     while(hr<HOST_REGS)
6396     {
6397       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6398          (i_regs->regmap[hr]&63)!=rs1[i] &&
6399          (i_regs->regmap[hr]&63)!=rs2[i] )
6400       {
6401         addr=hr++;break;
6402       }
6403       hr++;
6404     }
6405   }
6406   while(hr<HOST_REGS)
6407   {
6408     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6409        (i_regs->regmap[hr]&63)!=rs1[i] &&
6410        (i_regs->regmap[hr]&63)!=rs2[i] )
6411     {
6412       alt=hr++;break;
6413     }
6414     hr++;
6415   }
6416   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6417   {
6418     while(hr<HOST_REGS)
6419     {
6420       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6421          (i_regs->regmap[hr]&63)!=rs1[i] &&
6422          (i_regs->regmap[hr]&63)!=rs2[i] )
6423       {
6424         ntaddr=hr;break;
6425       }
6426       hr++;
6427     }
6428   }
6429   assert(hr<HOST_REGS);
6430   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6431     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6432   }
6433   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6434   if(opcode[i]==2) // J
6435   {
6436     unconditional=1;
6437   }
6438   if(opcode[i]==3) // JAL
6439   {
6440     // TODO: mini_ht
6441     int rt=get_reg(i_regs->regmap,31);
6442     emit_movimm(start+i*4+8,rt);
6443     unconditional=1;
6444   }
6445   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6446   {
6447     emit_mov(s1l,addr);
6448     if(opcode2[i]==9) // JALR
6449     {
6450       int rt=get_reg(i_regs->regmap,rt1[i]);
6451       emit_movimm(start+i*4+8,rt);
6452     }
6453   }
6454   if((opcode[i]&0x3f)==4) // BEQ
6455   {
6456     if(rs1[i]==rs2[i])
6457     {
6458       unconditional=1;
6459     }
6460     else
6461     #ifdef HAVE_CMOV_IMM
6462     if(s1h<0) {
6463       if(s2l>=0) emit_cmp(s1l,s2l);
6464       else emit_test(s1l,s1l);
6465       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6466     }
6467     else
6468     #endif
6469     {
6470       assert(s1l>=0);
6471       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6472       if(s1h>=0) {
6473         if(s2h>=0) emit_cmp(s1h,s2h);
6474         else emit_test(s1h,s1h);
6475         emit_cmovne_reg(alt,addr);
6476       }
6477       if(s2l>=0) emit_cmp(s1l,s2l);
6478       else emit_test(s1l,s1l);
6479       emit_cmovne_reg(alt,addr);
6480     }
6481   }
6482   if((opcode[i]&0x3f)==5) // BNE
6483   {
6484     #ifdef HAVE_CMOV_IMM
6485     if(s1h<0) {
6486       if(s2l>=0) emit_cmp(s1l,s2l);
6487       else emit_test(s1l,s1l);
6488       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6489     }
6490     else
6491     #endif
6492     {
6493       assert(s1l>=0);
6494       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6495       if(s1h>=0) {
6496         if(s2h>=0) emit_cmp(s1h,s2h);
6497         else emit_test(s1h,s1h);
6498         emit_cmovne_reg(alt,addr);
6499       }
6500       if(s2l>=0) emit_cmp(s1l,s2l);
6501       else emit_test(s1l,s1l);
6502       emit_cmovne_reg(alt,addr);
6503     }
6504   }
6505   if((opcode[i]&0x3f)==0x14) // BEQL
6506   {
6507     if(s1h>=0) {
6508       if(s2h>=0) emit_cmp(s1h,s2h);
6509       else emit_test(s1h,s1h);
6510       nottaken=(int)out;
6511       emit_jne(0);
6512     }
6513     if(s2l>=0) emit_cmp(s1l,s2l);
6514     else emit_test(s1l,s1l);
6515     if(nottaken) set_jump_target(nottaken,(int)out);
6516     nottaken=(int)out;
6517     emit_jne(0);
6518   }
6519   if((opcode[i]&0x3f)==0x15) // BNEL
6520   {
6521     if(s1h>=0) {
6522       if(s2h>=0) emit_cmp(s1h,s2h);
6523       else emit_test(s1h,s1h);
6524       taken=(int)out;
6525       emit_jne(0);
6526     }
6527     if(s2l>=0) emit_cmp(s1l,s2l);
6528     else emit_test(s1l,s1l);
6529     nottaken=(int)out;
6530     emit_jeq(0);
6531     if(taken) set_jump_target(taken,(int)out);
6532   }
6533   if((opcode[i]&0x3f)==6) // BLEZ
6534   {
6535     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6536     emit_cmpimm(s1l,1);
6537     if(s1h>=0) emit_mov(addr,ntaddr);
6538     emit_cmovl_reg(alt,addr);
6539     if(s1h>=0) {
6540       emit_test(s1h,s1h);
6541       emit_cmovne_reg(ntaddr,addr);
6542       emit_cmovs_reg(alt,addr);
6543     }
6544   }
6545   if((opcode[i]&0x3f)==7) // BGTZ
6546   {
6547     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6548     emit_cmpimm(s1l,1);
6549     if(s1h>=0) emit_mov(addr,alt);
6550     emit_cmovl_reg(ntaddr,addr);
6551     if(s1h>=0) {
6552       emit_test(s1h,s1h);
6553       emit_cmovne_reg(alt,addr);
6554       emit_cmovs_reg(ntaddr,addr);
6555     }
6556   }
6557   if((opcode[i]&0x3f)==0x16) // BLEZL
6558   {
6559     assert((opcode[i]&0x3f)!=0x16);
6560   }
6561   if((opcode[i]&0x3f)==0x17) // BGTZL
6562   {
6563     assert((opcode[i]&0x3f)!=0x17);
6564   }
6565   assert(opcode[i]!=1); // BLTZ/BGEZ
6566
6567   //FIXME: Check CSREG
6568   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6569     if((source[i]&0x30000)==0) // BC1F
6570     {
6571       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6572       emit_testimm(s1l,0x800000);
6573       emit_cmovne_reg(alt,addr);
6574     }
6575     if((source[i]&0x30000)==0x10000) // BC1T
6576     {
6577       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6578       emit_testimm(s1l,0x800000);
6579       emit_cmovne_reg(alt,addr);
6580     }
6581     if((source[i]&0x30000)==0x20000) // BC1FL
6582     {
6583       emit_testimm(s1l,0x800000);
6584       nottaken=(int)out;
6585       emit_jne(0);
6586     }
6587     if((source[i]&0x30000)==0x30000) // BC1TL
6588     {
6589       emit_testimm(s1l,0x800000);
6590       nottaken=(int)out;
6591       emit_jeq(0);
6592     }
6593   }
6594
6595   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6596   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6597   if(likely[i]||unconditional)
6598   {
6599     emit_movimm(ba[i],HOST_BTREG);
6600   }
6601   else if(addr!=HOST_BTREG)
6602   {
6603     emit_mov(addr,HOST_BTREG);
6604   }
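  /* Taken path: emit the jump to the page-spanning target.  The target is
   * the branch's delay slot in the next page, with the low bit set so it
   * resolves to the special delay-slot entry that pagespan_ds() registers
   * under vaddr (block start)+1.  If that entry is already compiled, the
   * jump is patched to it directly and the link is recorded so it can be
   * undone on invalidation; otherwise the jump goes to the external-jump
   * stub emitted below, which resolves the target at run time. */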
6605   void *branch_addr=out;
6606   emit_jmp(0);
6607   int target_addr=start+i*4+5;
6608   void *stub=out;
6609   void *compiled_target_addr=check_addr(target_addr);
6610   emit_extjump_ds((int)branch_addr,target_addr);
6611   if(compiled_target_addr) {
6612     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6613     add_link(target_addr,stub);
6614   }
6615   else set_jump_target((int)branch_addr,(int)stub);
6616   if(likely[i]) {
6617     // Not-taken path
6618     set_jump_target((int)nottaken,(int)out);
6619     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6620     void *branch_addr=out;
6621     emit_jmp(0);
6622     int target_addr=start+i*4+8;
6623     void *stub=out;
6624     void *compiled_target_addr=check_addr(target_addr);
6625     emit_extjump_ds((int)branch_addr,target_addr);
6626     if(compiled_target_addr) {
6627       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6628       add_link(target_addr,stub);
6629     }
6630     else set_jump_target((int)branch_addr,(int)stub);
6631   }
6632 }
6633
6634 // Assemble the delay slot for the above
6635 static void pagespan_ds()
6636 {
6637   assem_debug("initial delay slot:\n");
6638   u_int vaddr=start+1;
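  // The odd address (start+1) keys this delay-slot-only entry so it cannot
  // collide with the normal entry at 'start'.  A page-spanning branch in the
  // previous page jumps here, executes the single delay-slot instruction
  // assembled below, then dispatches on branch_target (BTREG) at the end.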
6639   u_int page=get_page(vaddr);
6640   u_int vpage=get_vpage(vaddr);
6641   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6642   do_dirty_stub_ds();
6643   ll_add(jump_in+page,vaddr,(void *)out);
6644   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6645   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6646     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6647   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6648     emit_writeword(HOST_BTREG,(int)&branch_target);
6649   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6650   address_generation(0,&regs[0],regs[0].regmap_entry);
6651   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6652     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6653   cop1_usable=0;
6654   is_delayslot=0;
6655   switch(itype[0]) {
6656     case ALU:
6657       alu_assemble(0,&regs[0]);break;
6658     case IMM16:
6659       imm16_assemble(0,&regs[0]);break;
6660     case SHIFT:
6661       shift_assemble(0,&regs[0]);break;
6662     case SHIFTIMM:
6663       shiftimm_assemble(0,&regs[0]);break;
6664     case LOAD:
6665       load_assemble(0,&regs[0]);break;
6666     case LOADLR:
6667       loadlr_assemble(0,&regs[0]);break;
6668     case STORE:
6669       store_assemble(0,&regs[0]);break;
6670     case STORELR:
6671       storelr_assemble(0,&regs[0]);break;
6672     case COP0:
6673       cop0_assemble(0,&regs[0]);break;
6674     case COP1:
6675       cop1_assemble(0,&regs[0]);break;
6676     case C1LS:
6677       c1ls_assemble(0,&regs[0]);break;
6678     case COP2:
6679       cop2_assemble(0,&regs[0]);break;
6680     case C2LS:
6681       c2ls_assemble(0,&regs[0]);break;
6682     case C2OP:
6683       c2op_assemble(0,&regs[0]);break;
6684     case FCONV:
6685       fconv_assemble(0,&regs[0]);break;
6686     case FLOAT:
6687       float_assemble(0,&regs[0]);break;
6688     case FCOMP:
6689       fcomp_assemble(0,&regs[0]);break;
6690     case MULTDIV:
6691       multdiv_assemble(0,&regs[0]);break;
6692     case MOV:
6693       mov_assemble(0,&regs[0]);break;
6694     case SYSCALL:
6695     case HLECALL:
6696     case INTCALL:
6697     case SPAN:
6698     case UJUMP:
6699     case RJUMP:
6700     case CJUMP:
6701     case SJUMP:
6702     case FJUMP:
6703       printf("Jump in the delay slot.  This is probably a bug.\n");
6704   }
6705   int btaddr=get_reg(regs[0].regmap,BTREG);
6706   if(btaddr<0) {
6707     btaddr=get_reg(regs[0].regmap,-1);
6708     emit_readword((int)&branch_target,btaddr);
6709   }
6710   assert(btaddr!=HOST_CCREG);
6711   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6712 #ifdef HOST_IMM8
6713   emit_movimm(start+4,HOST_TEMPREG);
6714   emit_cmp(btaddr,HOST_TEMPREG);
6715 #else
6716   emit_cmpimm(btaddr,start+4);
6717 #endif
6718   int branch=(int)out;
6719   emit_jeq(0);
6720   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6721   emit_jmp(jump_vaddr_reg[btaddr]);
6722   set_jump_target(branch,(int)out);
6723   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6724   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6725 }
6726
6727 // Basic liveness analysis for MIPS registers
6728 void unneeded_registers(int istart,int iend,int r)
6729 {
6730   int i;
6731   uint64_t u,uu,gte_u,b,bu,gte_bu;
6732   uint64_t temp_u,temp_uu,temp_gte_u;
6733   uint64_t tdep;
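  /* Bit conventions for the backward scan below:
   *   u     - bit r set means the (lower 32-bit) value of MIPS reg r is
   *           dead ("unneeded") at this point
   *   uu    - same, for the upper half of reg r
   *   gte_u - same, for GTE/COP2 registers
   * A write makes a register unneeded above that point (u|=1LL<<rt), a read
   * makes it needed again (u&=~(1LL<<rs)), and bit 0 ($zero) is forced to
   * unneeded after every instruction. */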
6734   if(iend==slen-1) {
6735     u=1;uu=1;
6736   }else{
6737     u=unneeded_reg[iend+1];
6738     uu=unneeded_reg_upper[iend+1];
6739     u=1;uu=1;
6740   }
6741   gte_u=temp_gte_u=0;
6742
6743   for (i=iend;i>=istart;i--)
6744   {
6745     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6746     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6747     {
6748       // If subroutine call, flag return address as a possible branch target
6749       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6750       
6751       if(ba[i]<start || ba[i]>=(start+slen*4))
6752       {
6753         // Branch out of this block, flush all regs
6754         u=1;
6755         uu=1;
6756         gte_u=0;
6757         /* Hexagon hack 
6758         if(itype[i]==UJUMP&&rt1[i]==31)
6759         {
6760           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6761         }
6762         if(itype[i]==RJUMP&&rs1[i]==31)
6763         {
6764           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6765         }
6766         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6767           if(itype[i]==UJUMP&&rt1[i]==31)
6768           {
6769             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6770             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6771           }
6772           if(itype[i]==RJUMP&&rs1[i]==31)
6773           {
6774             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6775             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6776           }
6777         }*/
6778         branch_unneeded_reg[i]=u;
6779         branch_unneeded_reg_upper[i]=uu;
6780         // Merge in delay slot
6781         tdep=(~uu>>rt1[i+1])&1;
6782         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6783         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6784         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6785         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6786         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6787         u|=1;uu|=1;
6788         gte_u|=gte_rt[i+1];
6789         gte_u&=~gte_rs[i+1];
6790         // If branch is "likely" (and conditional)
6791         // then we skip the delay slot on the fall-thru path
6792         if(likely[i]) {
6793           if(i<slen-1) {
6794             u&=unneeded_reg[i+2];
6795             uu&=unneeded_reg_upper[i+2];
6796             gte_u&=gte_unneeded[i+2];
6797           }
6798           else
6799           {
6800             u=1;
6801             uu=1;
6802             gte_u=0;
6803           }
6804         }
6805       }
6806       else
6807       {
6808         // Internal branch, flag target
6809         bt[(ba[i]-start)>>2]=1;
6810         if(ba[i]<=start+i*4) {
6811           // Backward branch
6812           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6813           {
6814             // Unconditional branch
6815             temp_u=1;temp_uu=1;
6816             temp_gte_u=0;
6817           } else {
6818             // Conditional branch (not taken case)
6819             temp_u=unneeded_reg[i+2];
6820             temp_uu=unneeded_reg_upper[i+2];
6821             temp_gte_u&=gte_unneeded[i+2];
6822           }
6823           // Merge in delay slot
6824           tdep=(~temp_uu>>rt1[i+1])&1;
6825           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6826           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6827           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6828           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6829           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6830           temp_u|=1;temp_uu|=1;
6831           temp_gte_u|=gte_rt[i+1];
6832           temp_gte_u&=~gte_rs[i+1];
6833           // If branch is "likely" (and conditional)
6834           // then we skip the delay slot on the fall-thru path
6835           if(likely[i]) {
6836             if(i<slen-1) {
6837               temp_u&=unneeded_reg[i+2];
6838               temp_uu&=unneeded_reg_upper[i+2];
6839               temp_gte_u&=gte_unneeded[i+2];
6840             }
6841             else
6842             {
6843               temp_u=1;
6844               temp_uu=1;
6845               temp_gte_u=0;
6846             }
6847           }
6848           tdep=(~temp_uu>>rt1[i])&1;
6849           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6850           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6851           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6852           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6853           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6854           temp_u|=1;temp_uu|=1;
6855           temp_gte_u|=gte_rt[i];
6856           temp_gte_u&=~gte_rs[i];
6857           unneeded_reg[i]=temp_u;
6858           unneeded_reg_upper[i]=temp_uu;
6859           gte_unneeded[i]=temp_gte_u;
6860           // Only go three levels deep.  This recursion can take an
6861           // excessive amount of time if there are a lot of nested loops.
6862           if(r<2) {
6863             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6864           }else{
6865             unneeded_reg[(ba[i]-start)>>2]=1;
6866             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6867             gte_unneeded[(ba[i]-start)>>2]=0;
6868           }
6869         } /*else*/ if(1) {
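          // Note: the 'else' is disabled, so backward branches also run the
          // merge below, folding in the target's unneeded_reg values (either
          // freshly computed by the recursion above, or the conservative
          // defaults used when the depth limit was reached).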
6870           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6871           {
6872             // Unconditional branch
6873             u=unneeded_reg[(ba[i]-start)>>2];
6874             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6875             gte_u=gte_unneeded[(ba[i]-start)>>2];
6876             branch_unneeded_reg[i]=u;
6877             branch_unneeded_reg_upper[i]=uu;
6878         //u=1;
6879         //uu=1;
6880         //branch_unneeded_reg[i]=u;
6881         //branch_unneeded_reg_upper[i]=uu;
6882             // Merge in delay slot
6883             tdep=(~uu>>rt1[i+1])&1;
6884             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6885             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6886             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6887             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6888             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6889             u|=1;uu|=1;
6890             gte_u|=gte_rt[i+1];
6891             gte_u&=~gte_rs[i+1];
6892           } else {
6893             // Conditional branch
6894             b=unneeded_reg[(ba[i]-start)>>2];
6895             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6896             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6897             branch_unneeded_reg[i]=b;
6898             branch_unneeded_reg_upper[i]=bu;
6899         //b=1;
6900         //bu=1;
6901         //branch_unneeded_reg[i]=b;
6902         //branch_unneeded_reg_upper[i]=bu;
6903             // Branch delay slot
6904             tdep=(~uu>>rt1[i+1])&1;
6905             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6906             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6907             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6908             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6909             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6910             b|=1;bu|=1;
6911             gte_bu|=gte_rt[i+1];
6912             gte_bu&=~gte_rs[i+1];
6913             // If branch is "likely" then we skip the
6914             // delay slot on the fall-thru path
6915             if(likely[i]) {
6916               u=b;
6917               uu=bu;
6918               gte_u=gte_bu;
6919               if(i<slen-1) {
6920                 u&=unneeded_reg[i+2];
6921                 uu&=unneeded_reg_upper[i+2];
6922                 gte_u&=gte_unneeded[i+2];
6923         //u=1;
6924         //uu=1;
6925               }
6926             } else {
6927               u&=b;
6928               uu&=bu;
6929               gte_u&=gte_bu;
6930         //u=1;
6931         //uu=1;
6932             }
6933             if(i<slen-1) {
6934               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6935               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6936         //branch_unneeded_reg[i]=1;
6937         //branch_unneeded_reg_upper[i]=1;
6938             } else {
6939               branch_unneeded_reg[i]=1;
6940               branch_unneeded_reg_upper[i]=1;
6941             }
6942           }
6943         }
6944       }
6945     }
6946     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6947     {
6948       // SYSCALL instruction (software interrupt)
6949       u=1;
6950       uu=1;
6951     }
6952     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6953     {
6954       // ERET instruction (return from interrupt)
6955       u=1;
6956       uu=1;
6957     }
6958     //u=uu=1; // DEBUG
6959     tdep=(~uu>>rt1[i])&1;
6960     // Written registers are unneeded
6961     u|=1LL<<rt1[i];
6962     u|=1LL<<rt2[i];
6963     uu|=1LL<<rt1[i];
6964     uu|=1LL<<rt2[i];
6965     gte_u|=gte_rt[i];
6966     // Accessed registers are needed
6967     u&=~(1LL<<rs1[i]);
6968     u&=~(1LL<<rs2[i]);
6969     uu&=~(1LL<<us1[i]);
6970     uu&=~(1LL<<us2[i]);
6971     gte_u&=~gte_rs[i];
6972     // Source-target dependencies
6973     uu&=~(tdep<<dep1[i]);
6974     uu&=~(tdep<<dep2[i]);
6975     // R0 is always unneeded
6976     u|=1;uu|=1;
6977     // Save it
6978     unneeded_reg[i]=u;
6979     unneeded_reg_upper[i]=uu;
6980     gte_unneeded[i]=gte_u;
6981     /*
6982     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6983     printf("U:");
6984     int r;
6985     for(r=1;r<=CCREG;r++) {
6986       if((unneeded_reg[i]>>r)&1) {
6987         if(r==HIREG) printf(" HI");
6988         else if(r==LOREG) printf(" LO");
6989         else printf(" r%d",r);
6990       }
6991     }
6992     printf(" UU:");
6993     for(r=1;r<=CCREG;r++) {
6994       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6995         if(r==HIREG) printf(" HI");
6996         else if(r==LOREG) printf(" LO");
6997         else printf(" r%d",r);
6998       }
6999     }
7000     printf("\n");*/
7001   }
7002 #ifdef FORCE32
7003   for (i=iend;i>=istart;i--)
7004   {
7005     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7006   }
7007 #endif
7008 }
7009
7010 // Identify registers which are likely to contain 32-bit values
7011 // This is used to predict whether any branches will jump to a
7012 // location with 64-bit values in registers.
7013 static void provisional_32bit()
7014 {
7015   int i,j;
7016   uint64_t is32=1;
7017   uint64_t lastbranch=1;
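  // is32: bit r set means MIPS reg r is provisionally assumed to hold a
  // sign-extended 32-bit value at this point; bit 0 ($zero) is always set.
  // The result for each instruction is stored in p32[i].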
7018   
7019   for(i=0;i<slen;i++)
7020   {
7021     if(i>0) {
7022       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7023         if(i>1) is32=lastbranch;
7024         else is32=1;
7025       }
7026     }
7027     if(i>1)
7028     {
7029       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7030         if(likely[i-2]) {
7031           if(i>2) is32=lastbranch;
7032           else is32=1;
7033         }
7034       }
7035       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7036       {
7037         if(rs1[i-2]==0||rs2[i-2]==0)
7038         {
7039           if(rs1[i-2]) {
7040             is32|=1LL<<rs1[i-2];
7041           }
7042           if(rs2[i-2]) {
7043             is32|=1LL<<rs2[i-2];
7044           }
7045         }
7046       }
7047     }
7048     // If something jumps here with 64-bit values
7049     // then promote those registers to 64 bits
7050     if(bt[i])
7051     {
7052       uint64_t temp_is32=is32;
7053       for(j=i-1;j>=0;j--)
7054       {
7055         if(ba[j]==start+i*4) 
7056           //temp_is32&=branch_regs[j].is32;
7057           temp_is32&=p32[j];
7058       }
7059       for(j=i;j<slen;j++)
7060       {
7061         if(ba[j]==start+i*4) 
7062           temp_is32=1;
7063       }
7064       is32=temp_is32;
7065     }
7066     int type=itype[i];
7067     int op=opcode[i];
7068     int op2=opcode2[i];
7069     int rt=rt1[i];
7070     int s1=rs1[i];
7071     int s2=rs2[i];
7072     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7073       // Branches don't write registers, consider the delay slot instead.
7074       type=itype[i+1];
7075       op=opcode[i+1];
7076       op2=opcode2[i+1];
7077       rt=rt1[i+1];
7078       s1=rs1[i+1];
7079       s2=rs2[i+1];
7080       lastbranch=is32;
7081     }
7082     switch(type) {
7083       case LOAD:
7084         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7085            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7086           is32&=~(1LL<<rt);
7087         else
7088           is32|=1LL<<rt;
7089         break;
7090       case STORE:
7091       case STORELR:
7092         break;
7093       case LOADLR:
7094         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7095         if(op==0x22) is32|=1LL<<rt; // LWL
7096         break;
7097       case IMM16:
7098         if (op==0x08||op==0x09|| // ADDI/ADDIU
7099             op==0x0a||op==0x0b|| // SLTI/SLTIU
7100             op==0x0c|| // ANDI
7101             op==0x0f)  // LUI
7102         {
7103           is32|=1LL<<rt;
7104         }
7105         if(op==0x18||op==0x19) { // DADDI/DADDIU
7106           is32&=~(1LL<<rt);
7107           //if(imm[i]==0)
7108           //  is32|=((is32>>s1)&1LL)<<rt;
7109         }
7110         if(op==0x0d||op==0x0e) { // ORI/XORI
7111           uint64_t sr=((is32>>s1)&1LL);
7112           is32&=~(1LL<<rt);
7113           is32|=sr<<rt;
7114         }
7115         break;
7116       case UJUMP:
7117         break;
7118       case RJUMP:
7119         break;
7120       case CJUMP:
7121         break;
7122       case SJUMP:
7123         break;
7124       case FJUMP:
7125         break;
7126       case ALU:
7127         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7128           is32|=1LL<<rt;
7129         }
7130         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7131           is32|=1LL<<rt;
7132         }
7133         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7134           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7135           is32&=~(1LL<<rt);
7136           is32|=sr<<rt;
7137         }
7138         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7139           if(s1==0&&s2==0) {
7140             is32|=1LL<<rt;
7141           }
7142           else if(s2==0) {
7143             uint64_t sr=((is32>>s1)&1LL);
7144             is32&=~(1LL<<rt);
7145             is32|=sr<<rt;
7146           }
7147           else if(s1==0) {
7148             uint64_t sr=((is32>>s2)&1LL);
7149             is32&=~(1LL<<rt);
7150             is32|=sr<<rt;
7151           }
7152           else {
7153             is32&=~(1LL<<rt);
7154           }
7155         }
7156         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7157           if(s1==0&&s2==0) {
7158             is32|=1LL<<rt;
7159           }
7160           else if(s2==0) {
7161             uint64_t sr=((is32>>s1)&1LL);
7162             is32&=~(1LL<<rt);
7163             is32|=sr<<rt;
7164           }
7165           else {
7166             is32&=~(1LL<<rt);
7167           }
7168         }
7169         break;
7170       case MULTDIV:
7171         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7172           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7173         }
7174         else {
7175           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7176         }
7177         break;
7178       case MOV:
7179         {
7180           uint64_t sr=((is32>>s1)&1LL);
7181           is32&=~(1LL<<rt);
7182           is32|=sr<<rt;
7183         }
7184         break;
7185       case SHIFT:
7186         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7187         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7188         break;
7189       case SHIFTIMM:
7190         is32|=1LL<<rt;
7191         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7192         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7193         break;
7194       case COP0:
7195         if(op2==0) is32|=1LL<<rt; // MFC0
7196         break;
7197       case COP1:
7198       case COP2:
7199         if(op2==0) is32|=1LL<<rt; // MFC1
7200         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7201         if(op2==2) is32|=1LL<<rt; // CFC1
7202         break;
7203       case C1LS:
7204       case C2LS:
7205         break;
7206       case FLOAT:
7207       case FCONV:
7208         break;
7209       case FCOMP:
7210         break;
7211       case C2OP:
7212       case SYSCALL:
7213       case HLECALL:
7214         break;
7215       default:
7216         break;
7217     }
7218     is32|=1;
7219     p32[i]=is32;
7220
7221     if(i>0)
7222     {
7223       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7224       {
7225         if(rt1[i-1]==31) // JAL/JALR
7226         {
7227           // Subroutine call will return here, don't alloc any registers
7228           is32=1;
7229         }
7230         else if(i+1<slen)
7231         {
7232           // Internal branch will jump here, match registers to caller
7233           is32=0x3FFFFFFFFLL;
7234         }
7235       }
7236     }
7237   }
7238 }
7239
7240 // Identify registers which may be assumed to contain 32-bit values
7241 // and where optimizations will rely on this.
7242 // This is used to determine whether backward branches can safely
7243 // jump to a location with 64-bit values in registers.
7244 static void provisional_r32()
7245 {
7246   u_int r32=0;
7247   int i;
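  // r32: bit r set means the code from this point on (including branch
  // targets) is expected to rely on MIPS reg r holding a 32-bit value.
  // The scan runs backwards; syscalls, ERET and branches out of the block
  // reset it, and the per-instruction result is stored in pr32[i].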
7248   
7249   for (i=slen-1;i>=0;i--)
7250   {
7251     int hr;
7252     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7253     {
7254       if(ba[i]<start || ba[i]>=(start+slen*4))
7255       {
7256         // Branch out of this block, don't need anything
7257         r32=0;
7258       }
7259       else
7260       {
7261         // Internal branch
7262         // Need whatever matches the target
7263         // (and doesn't get overwritten by the delay slot instruction)
7264         r32=0;
7265         int t=(ba[i]-start)>>2;
7266         if(ba[i]>start+i*4) {
7267           // Forward branch
7268           //if(!(requires_32bit[t]&~regs[i].was32))
7269           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7270           if(!(pr32[t]&~regs[i].was32))
7271             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7272         }else{
7273           // Backward branch
7274           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7275             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7276         }
7277       }
7278       // Conditional branch may need registers for following instructions
7279       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7280       {
7281         if(i<slen-2) {
7282           //r32|=requires_32bit[i+2];
7283           r32|=pr32[i+2];
7284           r32&=regs[i].was32;
7285           // Mark this address as a branch target since it may be called
7286           // upon return from interrupt
7287           //bt[i+2]=1;
7288         }
7289       }
7290       // Merge in delay slot
7291       if(!likely[i]) {
7292         // These are overwritten unless the branch is "likely"
7293         // and the delay slot is nullified if not taken
7294         r32&=~(1LL<<rt1[i+1]);
7295         r32&=~(1LL<<rt2[i+1]);
7296       }
7297       // Assume these are needed (delay slot)
7298       if(us1[i+1]>0)
7299       {
7300         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7301       }
7302       if(us2[i+1]>0)
7303       {
7304         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7305       }
7306       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7307       {
7308         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7309       }
7310       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7311       {
7312         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7313       }
7314     }
7315     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7316     {
7317       // SYSCALL instruction (software interrupt)
7318       r32=0;
7319     }
7320     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7321     {
7322       // ERET instruction (return from interrupt)
7323       r32=0;
7324     }
7325     // Check 32 bits
7326     r32&=~(1LL<<rt1[i]);
7327     r32&=~(1LL<<rt2[i]);
7328     if(us1[i]>0)
7329     {
7330       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7331     }
7332     if(us2[i]>0)
7333     {
7334       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7335     }
7336     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7337     {
7338       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7339     }
7340     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7341     {
7342       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7343     }
7344     //requires_32bit[i]=r32;
7345     pr32[i]=r32;
7346     
7347     // Dirty registers which are 32-bit, require 32-bit input
7348     // as they will be written as 32-bit values
7349     for(hr=0;hr<HOST_REGS;hr++)
7350     {
7351       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7352         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7353           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7354             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7355           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7356         }
7357       }
7358     }
7359   }
7360 }
7361
7362 // Write back dirty registers as soon as we will no longer modify them,
7363 // so that we don't end up with lots of writes at the branches.
7364 void clean_registers(int istart,int iend,int wr)
7365 {
7366   int i;
7367   int r;
7368   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7369   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
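  /* These masks are indexed by host register (bit hr).  They are built by
   * scanning the block backwards: instructions that write a MIPS register
   * set the bit of the host reg holding it, branches merge in the masks of
   * their targets and delay slots, and the results (will_dirty[] and
   * wont_dirty[]) are folded into regs[].dirty and regs[].wasdirty below. */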
7370   if(iend==slen-1) {
7371     will_dirty_i=will_dirty_next=0;
7372     wont_dirty_i=wont_dirty_next=0;
7373   }else{
7374     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7375     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7376   }
7377   for (i=iend;i>=istart;i--)
7378   {
7379     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7380     {
7381       if(ba[i]<start || ba[i]>=(start+slen*4))
7382       {
7383         // Branch out of this block, flush all regs
7384         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7385         {
7386           // Unconditional branch
7387           will_dirty_i=0;
7388           wont_dirty_i=0;
7389           // Merge in delay slot (will dirty)
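          // regmap entries >=64 denote the upper word of a 64-bit register,
          // so (x&63) extracts the MIPS register number; values above 33
          // (anything other than the GPRs and HI/LO) are skipped here, apart
          // from CCREG which is checked explicitly.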
7390           for(r=0;r<HOST_REGS;r++) {
7391             if(r!=EXCLUDE_REG) {
7392               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7393               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7394               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7395               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7396               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7397               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7398               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7399               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7400               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7401               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7402               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7403               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7404               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7405               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7406             }
7407           }
7408         }
7409         else
7410         {
7411           // Conditional branch
7412           will_dirty_i=0;
7413           wont_dirty_i=wont_dirty_next;
7414           // Merge in delay slot (will dirty)
7415           for(r=0;r<HOST_REGS;r++) {
7416             if(r!=EXCLUDE_REG) {
7417               if(!likely[i]) {
7418                 // Might not dirty if likely branch is not taken
7419                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7420                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7421                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7422                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7423                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7424                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7425                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7426                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7427                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7428                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7429                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7430                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7431                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7432                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7433               }
7434             }
7435           }
7436         }
7437         // Merge in delay slot (won't dirty)
7438         for(r=0;r<HOST_REGS;r++) {
7439           if(r!=EXCLUDE_REG) {
7440             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7441             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7442             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7443             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7444             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7445             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7446             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7447             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7448             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7449             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7450           }
7451         }
7452         if(wr) {
7453           #ifndef DESTRUCTIVE_WRITEBACK
7454           branch_regs[i].dirty&=wont_dirty_i;
7455           #endif
7456           branch_regs[i].dirty|=will_dirty_i;
7457         }
7458       }
7459       else
7460       {
7461         // Internal branch
7462         if(ba[i]<=start+i*4) {
7463           // Backward branch
7464           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7465           {
7466             // Unconditional branch
7467             temp_will_dirty=0;
7468             temp_wont_dirty=0;
7469             // Merge in delay slot (will dirty)
7470             for(r=0;r<HOST_REGS;r++) {
7471               if(r!=EXCLUDE_REG) {
7472                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7473                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7474                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7475                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7476                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7477                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7478                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7479                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7480                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7481                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7482                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7483                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7484                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7485                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7486               }
7487             }
7488           } else {
7489             // Conditional branch (not taken case)
7490             temp_will_dirty=will_dirty_next;
7491             temp_wont_dirty=wont_dirty_next;
7492             // Merge in delay slot (will dirty)
7493             for(r=0;r<HOST_REGS;r++) {
7494               if(r!=EXCLUDE_REG) {
7495                 if(!likely[i]) {
7496                   // Will not dirty if likely branch is not taken
7497                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7498                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7499                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7500                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7501                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7502                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7503                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7504                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7505                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7506                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7507                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7508                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7509                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7510                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7511                 }
7512               }
7513             }
7514           }
7515           // Merge in delay slot (won't dirty)
7516           for(r=0;r<HOST_REGS;r++) {
7517             if(r!=EXCLUDE_REG) {
7518               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7519               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7520               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7521               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7522               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7523               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7524               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7525               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7526               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7527               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7528             }
7529           }
7530           // Deal with changed mappings
7531           if(i<iend) {
7532             for(r=0;r<HOST_REGS;r++) {
7533               if(r!=EXCLUDE_REG) {
7534                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7535                   temp_will_dirty&=~(1<<r);
7536                   temp_wont_dirty&=~(1<<r);
7537                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7538                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7539                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7540                   } else {
7541                     temp_will_dirty|=1<<r;
7542                     temp_wont_dirty|=1<<r;
7543                   }
7544                 }
7545               }
7546             }
7547           }
7548           if(wr) {
7549             will_dirty[i]=temp_will_dirty;
7550             wont_dirty[i]=temp_wont_dirty;
7551             clean_registers((ba[i]-start)>>2,i-1,0);
7552           }else{
7553             // Limit recursion.  It can take an excessive amount
7554             // of time if there are a lot of nested loops.
7555             will_dirty[(ba[i]-start)>>2]=0;
7556             wont_dirty[(ba[i]-start)>>2]=-1;
7557           }
7558         }
7559         /*else*/ if(1)
7560         {
7561           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7562           {
7563             // Unconditional branch
7564             will_dirty_i=0;
7565             wont_dirty_i=0;
7566           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7567             for(r=0;r<HOST_REGS;r++) {
7568               if(r!=EXCLUDE_REG) {
7569                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7570                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7571                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7572                 }
7573                 if(branch_regs[i].regmap[r]>=0) {
7574                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7575                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7576                 }
7577               }
7578             }
7579           //}
7580             // Merge in delay slot
7581             for(r=0;r<HOST_REGS;r++) {
7582               if(r!=EXCLUDE_REG) {
7583                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7584                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7585                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7586                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7587                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7588                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7589                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7590                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7591                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7592                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7593                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7594                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7595                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7596                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7597               }
7598             }
7599           } else {
7600             // Conditional branch
7601             will_dirty_i=will_dirty_next;
7602             wont_dirty_i=wont_dirty_next;
7603           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7604             for(r=0;r<HOST_REGS;r++) {
7605               if(r!=EXCLUDE_REG) {
7606                 signed char target_reg=branch_regs[i].regmap[r];
7607                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7608                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7609                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7610                 }
7611                 else if(target_reg>=0) {
7612                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7613                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7614                 }
7615                 // Treat delay slot as part of branch too
7616                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7617                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7618                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7619                 }
7620                 else
7621                 {
7622                   will_dirty[i+1]&=~(1<<r);
7623                 }*/
7624               }
7625             }
7626           //}
7627             // Merge in delay slot
7628             for(r=0;r<HOST_REGS;r++) {
7629               if(r!=EXCLUDE_REG) {
7630                 if(!likely[i]) {
7631                   // Might not dirty if likely branch is not taken
7632                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7633                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7634                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7635                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7636                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7637                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7638                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7639                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7640                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7641                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7642                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7643                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7644                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7645                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7646                 }
7647               }
7648             }
7649           }
7650           // Merge in delay slot (won't dirty)
7651           for(r=0;r<HOST_REGS;r++) {
7652             if(r!=EXCLUDE_REG) {
7653               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7654               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7655               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7656               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7657               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7658               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7659               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7660               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7661               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7662               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7663             }
7664           }
7665           if(wr) {
7666             #ifndef DESTRUCTIVE_WRITEBACK
7667             branch_regs[i].dirty&=wont_dirty_i;
7668             #endif
7669             branch_regs[i].dirty|=will_dirty_i;
7670           }
7671         }
7672       }
7673     }
7674     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7675     {
7676       // SYSCALL instruction (software interrupt)
7677       will_dirty_i=0;
7678       wont_dirty_i=0;
7679     }
7680     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7681     {
7682       // ERET instruction (return from interrupt)
7683       will_dirty_i=0;
7684       wont_dirty_i=0;
7685     }
7686     will_dirty_next=will_dirty_i;
7687     wont_dirty_next=wont_dirty_i;
7688     for(r=0;r<HOST_REGS;r++) {
7689       if(r!=EXCLUDE_REG) {
7690         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7691         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7692         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7693         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7694         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7695         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7696         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7697         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7698         if(i>istart) {
7699           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7700           {
7701             // Don't store a register immediately after writing it,
7702             // may prevent dual-issue.
7703             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7704             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7705           }
7706         }
7707       }
7708     }
7709     // Save it
7710     will_dirty[i]=will_dirty_i;
7711     wont_dirty[i]=wont_dirty_i;
7712     // Mark registers that won't be dirtied as not dirty
7713     if(wr) {
7714       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7715       for(r=0;r<HOST_REGS;r++) {
7716         if((will_dirty_i>>r)&1) {
7717           printf(" r%d",r);
7718         }
7719       }
7720       printf("\n");*/
7721
7722       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7723         regs[i].dirty|=will_dirty_i;
7724         #ifndef DESTRUCTIVE_WRITEBACK
7725         regs[i].dirty&=wont_dirty_i;
7726         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7727         {
7728           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7729             for(r=0;r<HOST_REGS;r++) {
7730               if(r!=EXCLUDE_REG) {
7731                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7732                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7733                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7734               }
7735             }
7736           }
7737         }
7738         else
7739         {
7740           if(i<iend) {
7741             for(r=0;r<HOST_REGS;r++) {
7742               if(r!=EXCLUDE_REG) {
7743                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7744                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7745                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*//*assert(!((wont_dirty_i>>r)&1));*/}
7746               }
7747             }
7748           }
7749         }
7750         #endif
7751       //}
7752     }
7753     // Deal with changed mappings
7754     temp_will_dirty=will_dirty_i;
7755     temp_wont_dirty=wont_dirty_i;
7756     for(r=0;r<HOST_REGS;r++) {
7757       if(r!=EXCLUDE_REG) {
7758         int nr;
7759         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7760           if(wr) {
7761             #ifndef DESTRUCTIVE_WRITEBACK
7762             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7763             #endif
7764             regs[i].wasdirty|=will_dirty_i&(1<<r);
7765           }
7766         }
7767         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7768           // Register moved to a different register
7769           will_dirty_i&=~(1<<r);
7770           wont_dirty_i&=~(1<<r);
7771           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7772           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7773           if(wr) {
7774             #ifndef DESTRUCTIVE_WRITEBACK
7775             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7776             #endif
7777             regs[i].wasdirty|=will_dirty_i&(1<<r);
7778           }
7779         }
7780         else {
7781           will_dirty_i&=~(1<<r);
7782           wont_dirty_i&=~(1<<r);
7783           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7784             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7785             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7786           } else {
7787             wont_dirty_i|=1<<r;
7788             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*//*assert(!((will_dirty>>r)&1));*/
7789           }
7790         }
7791       }
7792     }
7793   }
7794 }
7795
7796 #ifdef DISASM
7797   /* disassembly */
7798 void disassemble_inst(int i)
7799 {
7800     if (bt[i]) printf("*"); else printf(" ");
7801     switch(itype[i]) {
7802       case UJUMP:
7803         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7804       case CJUMP:
7805         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7806       case SJUMP:
7807         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7808       case FJUMP:
7809         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7810       case RJUMP:
7811         if (opcode[i]==0x9&&rt1[i]!=31)
7812           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7813         else
7814           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7815         break;
7816       case SPAN:
7817         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7818       case IMM16:
7819         if(opcode[i]==0xf) //LUI
7820           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7821         else
7822           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7823         break;
7824       case LOAD:
7825       case LOADLR:
7826         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7827         break;
7828       case STORE:
7829       case STORELR:
7830         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7831         break;
7832       case ALU:
7833       case SHIFT:
7834         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7835         break;
7836       case MULTDIV:
7837         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7838         break;
7839       case SHIFTIMM:
7840         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7841         break;
7842       case MOV:
7843         if((opcode2[i]&0x1d)==0x10)
7844           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7845         else if((opcode2[i]&0x1d)==0x11)
7846           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7847         else
7848           printf (" %x: %s\n",start+i*4,insn[i]);
7849         break;
7850       case COP0:
7851         if(opcode2[i]==0)
7852           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7853         else if(opcode2[i]==4)
7854           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7855         else printf (" %x: %s\n",start+i*4,insn[i]);
7856         break;
7857       case COP1:
7858         if(opcode2[i]<3)
7859           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7860         else if(opcode2[i]>3)
7861           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7862         else printf (" %x: %s\n",start+i*4,insn[i]);
7863         break;
7864       case COP2:
7865         if(opcode2[i]<3)
7866           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7867         else if(opcode2[i]>3)
7868           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7869         else printf (" %x: %s\n",start+i*4,insn[i]);
7870         break;
7871       case C1LS:
7872         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7873         break;
7874       case C2LS:
7875         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7876         break;
7877       case INTCALL:
7878         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7879         break;
7880       default:
7881         //printf (" %s %8x\n",insn[i],source[i]);
7882         printf (" %x: %s\n",start+i*4,insn[i]);
7883     }
7884 }
7885 #else
7886 static void disassemble_inst(int i) {}
7887 #endif // DISASM
7888
7889 // clear the state completely, instead of just marking
7890 // things invalid like invalidate_all_pages() does
7891 void new_dynarec_clear_full()
7892 {
7893   int n;
7894   out=(u_char *)BASE_ADDR;
7895   memset(invalid_code,1,sizeof(invalid_code));
7896   memset(hash_table,0xff,sizeof(hash_table));
7897   memset(mini_ht,-1,sizeof(mini_ht));
7898   memset(restore_candidate,0,sizeof(restore_candidate));
7899   memset(shadow,0,sizeof(shadow));
7900   copy=shadow;
7901   expirep=16384; // Expiry pointer, +2 blocks
7902   pending_exception=0;
7903   literalcount=0;
7904   stop_after_jal=0;
7905   inv_code_start=inv_code_end=~0;
7906   gte_reads_flags=0;
7907   // TLB
7908 #ifndef DISABLE_TLB
7909   using_tlb=0;
7910   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7911     memory_map[n]=-1;
7912   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7913     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7914   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7915     memory_map[n]=-1;
7916 #endif
7917   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7918   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7919   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7920 }
7921
7922 void new_dynarec_init()
7923 {
7924   printf("Init new dynarec\n");
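  // The translation cache: a single fixed RWX mapping of 1<<TARGET_SIZE_2
  // bytes at BASE_ADDR; 'out' is the bump pointer advanced by the emitters.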
7925   out=(u_char *)BASE_ADDR;
7926   if (mmap (out, 1<<TARGET_SIZE_2,
7927             PROT_READ | PROT_WRITE | PROT_EXEC,
7928             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7929             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7930 #ifdef MUPEN64
7931   rdword=&readmem_dword;
7932   fake_pc.f.r.rs=&readmem_dword;
7933   fake_pc.f.r.rt=&readmem_dword;
7934   fake_pc.f.r.rd=&readmem_dword;
7935 #endif
7936   int n;
7937   new_dynarec_clear_full();
7938 #ifdef HOST_IMM8
7939   // Copy this into local area so we don't have to put it in every literal pool
7940   invc_ptr=invalid_code;
7941 #endif
7942 #ifdef MUPEN64
7943   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7944     writemem[n] = write_nomem_new;
7945     writememb[n] = write_nomemb_new;
7946     writememh[n] = write_nomemh_new;
7947 #ifndef FORCE32
7948     writememd[n] = write_nomemd_new;
7949 #endif
7950     readmem[n] = read_nomem_new;
7951     readmemb[n] = read_nomemb_new;
7952     readmemh[n] = read_nomemh_new;
7953 #ifndef FORCE32
7954     readmemd[n] = read_nomemd_new;
7955 #endif
7956   }
7957   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7958     writemem[n] = write_rdram_new;
7959     writememb[n] = write_rdramb_new;
7960     writememh[n] = write_rdramh_new;
7961 #ifndef FORCE32
7962     writememd[n] = write_rdramd_new;
7963 #endif
7964   }
7965   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7966     writemem[n] = write_nomem_new;
7967     writememb[n] = write_nomemb_new;
7968     writememh[n] = write_nomemh_new;
7969 #ifndef FORCE32
7970     writememd[n] = write_nomemd_new;
7971 #endif
7972     readmem[n] = read_nomem_new;
7973     readmemb[n] = read_nomemb_new;
7974     readmemh[n] = read_nomemh_new;
7975 #ifndef FORCE32
7976     readmemd[n] = read_nomemd_new;
7977 #endif
7978   }
7979 #endif
7980   tlb_hacks();
7981   arch_init();
7982 }
7983
7984 void new_dynarec_cleanup()
7985 {
7986   int n;
7987   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7988   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7989   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7990   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7991   #ifdef ROM_COPY
7992   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7993   #endif
7994 }
7995
7996 int new_recompile_block(int addr)
7997 {
7998 /*
7999   if(addr==0x800cd050) {
8000     int block;
8001     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
8002     int n;
8003     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
8004   }
8005 */
8006   //if(Count==365117028) tracedebug=1;
8007   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8008   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8009   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8010   //if(debug) 
8011   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8012   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8013   /*if(Count>=312978186) {
8014     rlist();
8015   }*/
8016   //rlist();
8017   start = (u_int)addr&~3;
8018   //assert(((u_int)addr&1)==0);
8019   new_dynarec_did_compile=1;
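  // Translate the guest PC into a host pointer: each case below sets
  // 'source' to the host-side memory holding the guest code and
  // 'pagelimit' to the first guest address this block must not reach.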
8020 #ifdef PCSX
8021   if (Config.HLE && start == 0x80001000) // hlecall
8022   {
8023     // XXX: is this enough? Maybe check hleSoftCall?
8024     u_int beginning=(u_int)out;
8025     u_int page=get_page(start);
8026     invalid_code[start>>12]=0;
8027     emit_movimm(start,0);
8028     emit_writeword(0,(int)&pcaddr);
8029     emit_jmp((int)new_dyna_leave);
8030     literal_pool(0);
8031 #ifdef __arm__
8032     __clear_cache((void *)beginning,out);
8033 #endif
8034     ll_add(jump_in+page,start,(void *)beginning);
8035     return 0;
8036   }
8037   else if ((u_int)addr < 0x00200000 ||
8038     (0xa0000000 <= addr && addr < 0xa0200000)) {
8039     // used for BIOS calls mostly?
8040     source = (u_int *)((u_int)rdram+(start&0x1fffff));
8041     pagelimit = (addr&0xa0000000)|0x00200000;
8042   }
8043   else if (!Config.HLE && (
8044 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8045     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8046     // BIOS
8047     source = (u_int *)((u_int)psxR+(start&0x7ffff));
8048     pagelimit = (addr&0xfff00000)|0x80000;
8049   }
8050   else
8051 #endif
8052 #ifdef MUPEN64
8053   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
8054     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
8055     pagelimit = 0xa4001000;
8056   }
8057   else
8058 #endif
8059   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
8060     source = (u_int *)((u_int)rdram+start-0x80000000);
8061     pagelimit = 0x80000000+RAM_SIZE;
8062   }
8063 #ifndef DISABLE_TLB
8064   else if ((signed int)addr >= (signed int)0xC0000000) {
8065     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
8066     //if(tlb_LUT_r[start>>12])
8067       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
8068     if((signed int)memory_map[start>>12]>=0) {
8069       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
8070       pagelimit=(start+4096)&0xFFFFF000;
8071       int map=memory_map[start>>12];
8072       int i;
8073       for(i=0;i<5;i++) {
8074         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
8075         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
8076       }
8077       assem_debug("pagelimit=%x\n",pagelimit);
8078       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
8079     }
8080     else {
8081       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
8082       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
8083       return -1; // Caller will invoke exception handler
8084     }
8085     //printf("source= %x\n",(int)source);
8086   }
8087 #endif
8088   else {
8089     printf("Compile at bogus memory address: %x \n", (int)addr);
8090     exit(1);
8091   }
8092
8093   /* Pass 1: disassemble */
8094   /* Pass 2: register dependencies, branch targets */
8095   /* Pass 3: register allocation */
8096   /* Pass 4: branch dependencies */
8097   /* Pass 5: pre-alloc */
8098   /* Pass 6: optimize clean/dirty state */
8099   /* Pass 7: flag 32-bit registers */
8100   /* Pass 8: assembly */
8101   /* Pass 9: linker */
8102   /* Pass 10: garbage collection / free memory */
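       /* The passes operate on the per-instruction arrays filled in below:  */
       /* pass 1 decodes up to MAXBLOCK instructions from 'source' into      */
       /* insn[], itype[], the register/immediate arrays and ba[], and sets  */
       /* slen to the number of instructions taken into the block.           */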
8103
8104   int i,j;
8105   int done=0;
8106   unsigned int type,op,op2;
8107
8108   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8109   
8110   /* Pass 1 disassembly */
8111
8112   for(i=0;!done;i++) {
8113     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8114     minimum_free_regs[i]=0;
8115     opcode[i]=op=source[i]>>26;
8116     switch(op)
8117     {
8118       case 0x00: strcpy(insn[i],"special"); type=NI;
8119         op2=source[i]&0x3f;
8120         switch(op2)
8121         {
8122           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8123           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8124           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8125           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8126           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8127           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8128           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8129           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8130           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8131           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8132           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8133           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8134           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8135           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8136           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8137           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8138           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8139           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8140           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8141           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8142           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8143           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8144           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8145           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8146           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8147           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8148           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8149           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8150           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8151           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8152           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8153           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8154           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8155           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8156           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8157 #ifndef FORCE32
8158           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8159           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8160           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8161           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8162           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8163           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8164           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8165           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8166           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8167           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8168           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8169           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8170           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8171           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8172           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8173           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8174           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8175 #endif
8176         }
8177         break;
8178       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8179         op2=(source[i]>>16)&0x1f;
8180         switch(op2)
8181         {
8182           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8183           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8184           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8185           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8186           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8187           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8188           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8189           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8190           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8191           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8192           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8193           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8194           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8195           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8196         }
8197         break;
8198       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8199       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8200       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8201       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8202       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8203       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8204       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8205       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8206       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8207       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8208       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8209       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8210       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8211       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8212       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8213         op2=(source[i]>>21)&0x1f;
8214         switch(op2)
8215         {
8216           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8217           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8218           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8219           switch(source[i]&0x3f)
8220           {
8221             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8222             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8223             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8224             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8225 #ifdef PCSX
8226             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8227 #else
8228             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8229 #endif
8230           }
8231         }
8232         break;
8233       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8234         op2=(source[i]>>21)&0x1f;
8235         switch(op2)
8236         {
8237           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8238           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8239           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8240           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8241           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8242           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8243           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8244           switch((source[i]>>16)&0x3)
8245           {
8246             case 0x00: strcpy(insn[i],"BC1F"); break;
8247             case 0x01: strcpy(insn[i],"BC1T"); break;
8248             case 0x02: strcpy(insn[i],"BC1FL"); break;
8249             case 0x03: strcpy(insn[i],"BC1TL"); break;
8250           }
8251           break;
8252           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8253           switch(source[i]&0x3f)
8254           {
8255             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8256             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8257             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8258             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8259             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8260             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8261             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8262             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8263             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8264             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8265             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8266             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8267             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8268             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8269             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8270             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8271             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8272             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8273             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8274             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8275             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8276             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8277             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8278             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8279             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8280             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8281             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8282             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8283             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8284             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8285             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8286             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8287             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8288             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8289             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8290           }
8291           break;
8292           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8293           switch(source[i]&0x3f)
8294           {
8295             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8296             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8297             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8298             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8299             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8300             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8301             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8302             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8303             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8304             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8305             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8306             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8307             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8308             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8309             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8310             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8311             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8312             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8313             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8314             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8315             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8316             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8317             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8318             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8319             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8320             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8321             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8322             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8323             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8324             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8325             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8326             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8327             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8328             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8329             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8330           }
8331           break;
8332           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8333           switch(source[i]&0x3f)
8334           {
8335             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8336             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8337           }
8338           break;
8339           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8340           switch(source[i]&0x3f)
8341           {
8342             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8343             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8344           }
8345           break;
8346         }
8347         break;
8348 #ifndef FORCE32
8349       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8350       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8351       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8352       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8353       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8354       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8355       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8356       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8357 #endif
8358       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8359       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8360       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8361       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8362       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8363       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8364       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8365 #ifndef FORCE32
8366       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8367 #endif
8368       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8369       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8370       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8371       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8372 #ifndef FORCE32
8373       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8374       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8375 #endif
8376       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8377       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8378       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8379       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8380 #ifndef FORCE32
8381       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8382       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8383       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8384 #endif
8385       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8386       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8387 #ifndef FORCE32
8388       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8389       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8390       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8391 #endif
8392 #ifdef PCSX
8393       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8394         op2=(source[i]>>21)&0x1f;
8395         //if (op2 & 0x10) {
8396         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8397           if (gte_handlers[source[i]&0x3f]!=NULL) {
8398             if (gte_regnames[source[i]&0x3f]!=NULL)
8399               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8400             else
8401               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8402             type=C2OP;
8403           }
8404         }
8405         else switch(op2)
8406         {
8407           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8408           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8409           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8410           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8411         }
8412         break;
8413       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8414       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8415       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8416 #endif
8417       default: strcpy(insn[i],"???"); type=NI;
8418         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8419         break;
8420     }
8421     itype[i]=type;
8422     opcode2[i]=op2;
8423     /* Get registers/immediates */
8424     lt1[i]=0;
8425     us1[i]=0;
8426     us2[i]=0;
8427     dep1[i]=0;
8428     dep2[i]=0;
8429     gte_rs[i]=gte_rt[i]=0;
8430     switch(type) {
8431       case LOAD:
8432         rs1[i]=(source[i]>>21)&0x1f;
8433         rs2[i]=0;
8434         rt1[i]=(source[i]>>16)&0x1f;
8435         rt2[i]=0;
8436         imm[i]=(short)source[i];
8437         break;
8438       case STORE:
8439       case STORELR:
8440         rs1[i]=(source[i]>>21)&0x1f;
8441         rs2[i]=(source[i]>>16)&0x1f;
8442         rt1[i]=0;
8443         rt2[i]=0;
8444         imm[i]=(short)source[i];
8445         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8446         break;
8447       case LOADLR:
8448         // LWL/LWR only load part of the register,
8449         // therefore the target register must be treated as a source too
8450         rs1[i]=(source[i]>>21)&0x1f;
8451         rs2[i]=(source[i]>>16)&0x1f;
8452         rt1[i]=(source[i]>>16)&0x1f;
8453         rt2[i]=0;
8454         imm[i]=(short)source[i];
8455         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8456         if(op==0x26) dep1[i]=rt1[i]; // LWR
8457         break;
8458       case IMM16:
8459         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8460         else rs1[i]=(source[i]>>21)&0x1f;
8461         rs2[i]=0;
8462         rt1[i]=(source[i]>>16)&0x1f;
8463         rt2[i]=0;
8464         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8465           imm[i]=(unsigned short)source[i];
8466         }else{
8467           imm[i]=(short)source[i];
8468         }
8469         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8470         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8471         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8472         break;
8473       case UJUMP:
8474         rs1[i]=0;
8475         rs2[i]=0;
8476         rt1[i]=0;
8477         rt2[i]=0;
8478         // The JAL instruction writes to r31.
8479         if (op&1) {
8480           rt1[i]=31;
8481         }
8482         rs2[i]=CCREG;
8483         break;
8484       case RJUMP:
8485         rs1[i]=(source[i]>>21)&0x1f;
8486         rs2[i]=0;
8487         rt1[i]=0;
8488         rt2[i]=0;
8489         // The JALR instruction writes to rd.
8490         if (op2&1) {
8491           rt1[i]=(source[i]>>11)&0x1f;
8492         }
8493         rs2[i]=CCREG;
8494         break;
8495       case CJUMP:
8496         rs1[i]=(source[i]>>21)&0x1f;
8497         rs2[i]=(source[i]>>16)&0x1f;
8498         rt1[i]=0;
8499         rt2[i]=0;
8500         if(op&2) { // BGTZ/BLEZ
8501           rs2[i]=0;
8502         }
8503         us1[i]=rs1[i];
8504         us2[i]=rs2[i];
8505         likely[i]=op>>4;
8506         break;
8507       case SJUMP:
8508         rs1[i]=(source[i]>>21)&0x1f;
8509         rs2[i]=CCREG;
8510         rt1[i]=0;
8511         rt2[i]=0;
8512         us1[i]=rs1[i];
8513         if(op2&0x10) { // BxxAL
8514           rt1[i]=31;
8515           // NOTE: If the branch is not taken, r31 is still overwritten
8516         }
8517         likely[i]=(op2&2)>>1;
8518         break;
8519       case FJUMP:
8520         rs1[i]=FSREG;
8521         rs2[i]=CSREG;
8522         rt1[i]=0;
8523         rt2[i]=0;
8524         likely[i]=((source[i])>>17)&1;
8525         break;
8526       case ALU:
8527         rs1[i]=(source[i]>>21)&0x1f; // first source (rs)
8528         rs2[i]=(source[i]>>16)&0x1f; // second source (rt)
8529         rt1[i]=(source[i]>>11)&0x1f; // destination
8530         rt2[i]=0;
8531         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8532           us1[i]=rs1[i];us2[i]=rs2[i];
8533         }
8534         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8535           dep1[i]=rs1[i];dep2[i]=rs2[i];
8536         }
8537         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8538           dep1[i]=rs1[i];dep2[i]=rs2[i];
8539         }
8540         break;
8541       case MULTDIV:
8542         rs1[i]=(source[i]>>21)&0x1f; // source
8543         rs2[i]=(source[i]>>16)&0x1f; // divisor
8544         rt1[i]=HIREG;
8545         rt2[i]=LOREG;
8546         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8547           us1[i]=rs1[i];us2[i]=rs2[i];
8548         }
8549         break;
8550       case MOV:
8551         rs1[i]=0;
8552         rs2[i]=0;
8553         rt1[i]=0;
8554         rt2[i]=0;
8555         if(op2==0x10) rs1[i]=HIREG; // MFHI
8556         if(op2==0x11) rt1[i]=HIREG; // MTHI
8557         if(op2==0x12) rs1[i]=LOREG; // MFLO
8558         if(op2==0x13) rt1[i]=LOREG; // MTLO
8559         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8560         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8561         dep1[i]=rs1[i];
8562         break;
8563       case SHIFT:
8564         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8565         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8566         rt1[i]=(source[i]>>11)&0x1f; // destination
8567         rt2[i]=0;
8568         // DSLLV/DSRLV/DSRAV are 64-bit
8569         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8570         break;
8571       case SHIFTIMM:
8572         rs1[i]=(source[i]>>16)&0x1f;
8573         rs2[i]=0;
8574         rt1[i]=(source[i]>>11)&0x1f;
8575         rt2[i]=0;
8576         imm[i]=(source[i]>>6)&0x1f;
8577         // DSxx32 instructions
8578         if(op2>=0x3c) imm[i]|=0x20;
8579         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8580         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8581         break;
8582       case COP0:
8583         rs1[i]=0;
8584         rs2[i]=0;
8585         rt1[i]=0;
8586         rt2[i]=0;
8587         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8588         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8589         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8590         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8591         break;
8592       case COP1:
8593         rs1[i]=0;
8594         rs2[i]=0;
8595         rt1[i]=0;
8596         rt2[i]=0;
8597         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8598         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8599         if(op2==5) us1[i]=rs1[i]; // DMTC1
8600         rs2[i]=CSREG;
8601         break;
8602       case COP2:
8603         rs1[i]=0;
8604         rs2[i]=0;
8605         rt1[i]=0;
8606         rt2[i]=0;
8607         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8608         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8609         rs2[i]=CSREG;
8610         int gr=(source[i]>>11)&0x1F;
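             // gte_rs/gte_rt are usage masks over the 64 GTE registers:
             // bits 0-31 = data regs, bits 32-63 = control regs
             // (bit 63 is control reg 31, the flag register).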
8611         switch(op2)
8612         {
8613           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8614           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8615           case 0x02: gte_rs[i]=1ll<<(gr+32); // CFC2
8616             if(gr==31&&!gte_reads_flags) {
8617               assem_debug("gte flag read encountered @%08x\n",addr + i*4);
8618               gte_reads_flags=1;
8619             }
8620             break;
8621           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8622         }
8623         break;
8624       case C1LS:
8625         rs1[i]=(source[i]>>21)&0x1F;
8626         rs2[i]=CSREG;
8627         rt1[i]=0;
8628         rt2[i]=0;
8629         imm[i]=(short)source[i];
8630         break;
8631       case C2LS:
8632         rs1[i]=(source[i]>>21)&0x1F;
8633         rs2[i]=0;
8634         rt1[i]=0;
8635         rt2[i]=0;
8636         imm[i]=(short)source[i];
8637         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8638         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8639         break;
8640       case C2OP:
8641         rs1[i]=0;
8642         rs2[i]=0;
8643         rt1[i]=0;
8644         rt2[i]=0;
8645         gte_rt[i]=1ll<<63; // every op changes flags
8646         // TODO: other regs?
8647         break;
8648       case FLOAT:
8649       case FCONV:
8650         rs1[i]=0;
8651         rs2[i]=CSREG;
8652         rt1[i]=0;
8653         rt2[i]=0;
8654         break;
8655       case FCOMP:
8656         rs1[i]=FSREG;
8657         rs2[i]=CSREG;
8658         rt1[i]=FSREG;
8659         rt2[i]=0;
8660         break;
8661       case SYSCALL:
8662       case HLECALL:
8663       case INTCALL:
8664         rs1[i]=CCREG;
8665         rs2[i]=0;
8666         rt1[i]=0;
8667         rt2[i]=0;
8668         break;
8669       default:
8670         rs1[i]=0;
8671         rs2[i]=0;
8672         rt1[i]=0;
8673         rt2[i]=0;
8674     }
8675     /* Calculate branch target addresses */
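         // J/JAL: target = ((address of the delay slot) & 0xF0000000) | (26-bit index << 2);
         // (source[i]<<6)>>4 drops the opcode bits and leaves the index already shifted by 2.
         // Conditional branches: target = PC+4 + sign_extend(imm16)*4; shifting the immediate
         // into the top half and arithmetic-shifting back down by 14 does both the sign
         // extension and the *4.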
8676     if(type==UJUMP)
8677       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8678     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8679       ba[i]=start+i*4+8; // Ignore never taken branch
8680     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8681       ba[i]=start+i*4+8; // Ignore never taken branch
8682     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8683       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8684     else ba[i]=-1;
8685 #ifdef PCSX
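         // The checks below catch MIPS I patterns this recompiler does not handle:
         // a branch sitting in another branch's delay slot, or a branch target that
         // consumes the value a delay-slot load is still loading (load delay). In
         // those cases the preceding branch is rewritten as INTCALL so the
         // interpreter executes it instead.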
8686     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8687       int do_in_intrp=0;
8688       // branch in delay slot?
8689       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8690         // don't compile the first branch; have the interpreter handle it if it's hit
8691         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8692         do_in_intrp=1;
8693       }
8694       // basic load delay detection
8695       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8696         int t=(ba[i-1]-start)/4;
8697         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8698           // jump target wants DS result - potential load delay effect
8699           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8700           do_in_intrp=1;
8701           bt[t+1]=1; // expected return from interpreter
8702         }
8703         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8704               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8705           // v0 overwrite like this is a sign of trouble, bail out
8706           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8707           do_in_intrp=1;
8708         }
8709       }
8710       if(do_in_intrp) {
8711         rs1[i-1]=CCREG;
8712         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8713         ba[i-1]=-1;
8714         itype[i-1]=INTCALL;
8715         done=2;
8716         i--; // don't compile the DS
8717       }
8718     }
8719 #endif
8720     /* Is this the end of the block? */
8721     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8722       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8723         done=2;
8724       }
8725       else {
8726         if(stop_after_jal) done=1;
8727         // Stop on BREAK
8728         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8729       }
8730       // Don't recompile stuff that's already compiled
8731       if(check_addr(start+i*4+4)) done=1;
8732       // Don't get too close to the limit
8733       if(i>MAXBLOCK/2) done=1;
8734     }
8735     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8736     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8737     if(done==2) {
8738       // Does the block continue due to a branch?
8739       for(j=i-1;j>=0;j--)
8740       {
8741         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8742         if(ba[j]==start+i*4+4) done=j=0;
8743         if(ba[j]==start+i*4+8) done=j=0;
8744       }
8745     }
8746     //assert(i<MAXBLOCK-1);
8747     if(start+i*4==pagelimit-4) done=1;
8748     assert(start+i*4<pagelimit);
8749     if (i==MAXBLOCK-1) done=1;
8750     // Stop if we're compiling junk
8751     if(itype[i]==NI&&opcode[i]==0x11) {
8752       done=stop_after_jal=1;
8753       printf("Disabled speculative precompilation\n");
8754     }
8755   }
8756   slen=i;
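       // If the block ends right at the page limit with a branch, retype that
       // branch as SPAN so it takes the page-spanning allocation path
       // (pagespan_alloc) instead of the normal branch handling.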
8757   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8758     if(start+i*4==pagelimit) {
8759       itype[i-1]=SPAN;
8760     }
8761   }
8762   assert(slen>0);
8763
8764   /* Pass 2 - Register dependencies and branch targets */
8765
8766   unneeded_registers(0,slen-1,0);
8767   
8768   /* Pass 3 - Register allocation */
8769
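       // 'current' is carried forward across the block: regmap[] says which guest
       // register each host register holds, dirty marks values that still need to
       // be written back, is32 marks guest registers known to hold 32-bit values,
       // and u/uu are the unneeded-register masks at this point in the block.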
8770   struct regstat current; // Current register allocations/status
8771   current.is32=1;
8772   current.dirty=0;
8773   current.u=unneeded_reg[0];
8774   current.uu=unneeded_reg_upper[0];
8775   clear_all_regs(current.regmap);
8776   alloc_reg(&current,0,CCREG);
8777   dirty_reg(&current,CCREG);
8778   current.isconst=0;
8779   current.wasconst=0;
8780   int ds=0;
8781   int cc=0;
8782   int hr=-1;
8783
8784 #ifndef FORCE32
8785   provisional_32bit();
8786 #endif
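       // The low bit of 'addr' acts as a flag here ('start' was already masked
       // with ~3): when set, the first instruction is a branch delay slot, and
       // the branch target is expected to arrive in BTREG (HOST_BTREG).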
8787   if((u_int)addr&1) {
8788     // First instruction is delay slot
8789     cc=-1;
8790     bt[1]=1;
8791     ds=1;
8792     unneeded_reg[0]=1;
8793     unneeded_reg_upper[0]=1;
8794     current.regmap[HOST_BTREG]=BTREG;
8795   }
8796   
8797   for(i=0;i<slen;i++)
8798   {
8799     if(bt[i])
8800     {
8801       int hr;
8802       for(hr=0;hr<HOST_REGS;hr++)
8803       {
8804         // Is this really necessary?
8805         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8806       }
8807       current.isconst=0;
8808     }
8809     if(i>1)
8810     {
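           // A register that just compared equal to r0 in a not-taken BNE/BNEL
           // must be zero, hence 32-bit: promote it in is32 and drop any stale
           // upper-half mapping.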
8811       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8812       {
8813         if(rs1[i-2]==0||rs2[i-2]==0)
8814         {
8815           if(rs1[i-2]) {
8816             current.is32|=1LL<<rs1[i-2];
8817             int hr=get_reg(current.regmap,rs1[i-2]|64);
8818             if(hr>=0) current.regmap[hr]=-1;
8819           }
8820           if(rs2[i-2]) {
8821             current.is32|=1LL<<rs2[i-2];
8822             int hr=get_reg(current.regmap,rs2[i-2]|64);
8823             if(hr>=0) current.regmap[hr]=-1;
8824           }
8825         }
8826       }
8827     }
8828 #ifndef FORCE32
8829     // If something jumps here with 64-bit values
8830     // then promote those registers to 64 bits
8831     if(bt[i])
8832     {
8833       uint64_t temp_is32=current.is32;
8834       for(j=i-1;j>=0;j--)
8835       {
8836         if(ba[j]==start+i*4) 
8837           temp_is32&=branch_regs[j].is32;
8838       }
8839       for(j=i;j<slen;j++)
8840       {
8841         if(ba[j]==start+i*4) 
8842           //temp_is32=1;
8843           temp_is32&=p32[j];
8844       }
8845       if(temp_is32!=current.is32) {
8846         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8847         #ifndef DESTRUCTIVE_WRITEBACK
8848         if(ds)
8849         #endif
8850         for(hr=0;hr<HOST_REGS;hr++)
8851         {
8852           int r=current.regmap[hr];
8853           if(r>0&&r<64)
8854           {
8855             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8856               temp_is32|=1LL<<r;
8857               //printf("restore %d\n",r);
8858             }
8859           }
8860         }
8861         current.is32=temp_is32;
8862       }
8863     }
8864 #else
8865     current.is32=-1LL;
8866 #endif
8867
8868     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8869     regs[i].wasconst=current.isconst;
8870     regs[i].was32=current.is32;
8871     regs[i].wasdirty=current.dirty;
8872     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8873     // To change a dirty register from 32 to 64 bits, we must write
8874     // it out during the previous cycle (for branches, 2 cycles)
8875     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8876     {
8877       uint64_t temp_is32=current.is32;
8878       for(j=i-1;j>=0;j--)
8879       {
8880         if(ba[j]==start+i*4+4) 
8881           temp_is32&=branch_regs[j].is32;
8882       }
8883       for(j=i;j<slen;j++)
8884       {
8885         if(ba[j]==start+i*4+4) 
8886           //temp_is32=1;
8887           temp_is32&=p32[j];
8888       }
8889       if(temp_is32!=current.is32) {
8890         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8891         for(hr=0;hr<HOST_REGS;hr++)
8892         {
8893           int r=current.regmap[hr];
8894           if(r>0)
8895           {
8896             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8897               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8898               {
8899                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8900                 {
8901                   //printf("dump %d/r%d\n",hr,r);
8902                   current.regmap[hr]=-1;
8903                   if(get_reg(current.regmap,r|64)>=0) 
8904                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8905                 }
8906               }
8907             }
8908           }
8909         }
8910       }
8911     }
8912     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8913     {
8914       uint64_t temp_is32=current.is32;
8915       for(j=i-1;j>=0;j--)
8916       {
8917         if(ba[j]==start+i*4+8) 
8918           temp_is32&=branch_regs[j].is32;
8919       }
8920       for(j=i;j<slen;j++)
8921       {
8922         if(ba[j]==start+i*4+8) 
8923           //temp_is32=1;
8924           temp_is32&=p32[j];
8925       }
8926       if(temp_is32!=current.is32) {
8927         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8928         for(hr=0;hr<HOST_REGS;hr++)
8929         {
8930           int r=current.regmap[hr];
8931           if(r>0)
8932           {
8933             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8934               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8935               {
8936                 //printf("dump %d/r%d\n",hr,r);
8937                 current.regmap[hr]=-1;
8938                 if(get_reg(current.regmap,r|64)>=0) 
8939                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8940               }
8941             }
8942           }
8943         }
8944       }
8945     }
8946     #endif
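         // Refresh the unneeded-register masks for this instruction: u covers
         // whole guest registers, uu their upper 32-bit halves. Registers read
         // by this instruction (and, for branches, by the delay slot) are
         // cleared from the masks; bit 0 stays set since r0 never needs keeping.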
8947     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8948       if(i+1<slen) {
8949         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8950         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8951         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8952         current.u|=1;
8953         current.uu|=1;
8954       } else {
8955         current.u=1;
8956         current.uu=1;
8957       }
8958     } else {
8959       if(i+1<slen) {
8960         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8961         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8962         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8963         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8964         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8965         current.u|=1;
8966         current.uu|=1;
8967       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8968     }
8969     is_ds[i]=ds;
8970     if(ds) {
8971       ds=0; // Skip delay slot, already allocated as part of branch
8972       // ...but we need to alloc it in case something jumps here
8973       if(i+1<slen) {
8974         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8975         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8976       }else{
8977         current.u=branch_unneeded_reg[i-1];
8978         current.uu=branch_unneeded_reg_upper[i-1];
8979       }
8980       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8981       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8982       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8983       current.u|=1;
8984       current.uu|=1;
8985       struct regstat temp;
8986       memcpy(&temp,&current,sizeof(current));
8987       temp.wasdirty=temp.dirty;
8988       temp.was32=temp.is32;
8989       // TODO: Take into account unconditional branches, as below
8990       delayslot_alloc(&temp,i);
8991       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8992       regs[i].wasdirty=temp.wasdirty;
8993       regs[i].was32=temp.was32;
8994       regs[i].dirty=temp.dirty;
8995       regs[i].is32=temp.is32;
8996       regs[i].isconst=0;
8997       regs[i].wasconst=0;
8998       current.isconst=0;
8999       // Create entry (branch target) regmap
9000       for(hr=0;hr<HOST_REGS;hr++)
9001       {
9002         int r=temp.regmap[hr];
9003         if(r>=0) {
9004           if(r!=regmap_pre[i][hr]) {
9005             regs[i].regmap_entry[hr]=-1;
9006           }
9007           else
9008           {
9009             if(r<64){
9010               if((current.u>>r)&1) {
9011                 regs[i].regmap_entry[hr]=-1;
9012                 regs[i].regmap[hr]=-1;
9013                 //Don't clear regs in the delay slot as the branch might need them
9014                 //current.regmap[hr]=-1;
9015               }else
9016                 regs[i].regmap_entry[hr]=r;
9017             }
9018             else {
9019               if((current.uu>>(r&63))&1) {
9020                 regs[i].regmap_entry[hr]=-1;
9021                 regs[i].regmap[hr]=-1;
9022                 //Don't clear regs in the delay slot as the branch might need them
9023                 //current.regmap[hr]=-1;
9024               }else
9025                 regs[i].regmap_entry[hr]=r;
9026             }
9027           }
9028         } else {
9029           // First instruction expects CCREG to be allocated
9030           if(i==0&&hr==HOST_CCREG) 
9031             regs[i].regmap_entry[hr]=CCREG;
9032           else
9033             regs[i].regmap_entry[hr]=-1;
9034         }
9035       }
9036     }
9037     else { // Not delay slot
9038       switch(itype[i]) {
9039         case UJUMP:
9040           //current.isconst=0; // DEBUG
9041           //current.wasconst=0; // DEBUG
9042           //regs[i].wasconst=0; // DEBUG
9043           clear_const(&current,rt1[i]);
9044           alloc_cc(&current,i);
9045           dirty_reg(&current,CCREG);
9046           if (rt1[i]==31) {
9047             alloc_reg(&current,i,31);
9048             dirty_reg(&current,31);
9049             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9050             //assert(rt1[i+1]!=rt1[i]);
9051             #ifdef REG_PREFETCH
9052             alloc_reg(&current,i,PTEMP);
9053             #endif
9054             //current.is32|=1LL<<rt1[i];
9055           }
9056           ooo[i]=1;
9057           delayslot_alloc(&current,i+1);
9058           //current.isconst=0; // DEBUG
9059           ds=1;
9060           //printf("i=%d, isconst=%x\n",i,current.isconst);
9061           break;
9062         case RJUMP:
9063           //current.isconst=0;
9064           //current.wasconst=0;
9065           //regs[i].wasconst=0;
9066           clear_const(&current,rs1[i]);
9067           clear_const(&current,rt1[i]);
9068           alloc_cc(&current,i);
9069           dirty_reg(&current,CCREG);
9070           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9071             alloc_reg(&current,i,rs1[i]);
9072             if (rt1[i]!=0) {
9073               alloc_reg(&current,i,rt1[i]);
9074               dirty_reg(&current,rt1[i]);
9075               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9076               assert(rt1[i+1]!=rt1[i]);
9077               #ifdef REG_PREFETCH
9078               alloc_reg(&current,i,PTEMP);
9079               #endif
9080             }
9081             #ifdef USE_MINI_HT
9082             if(rs1[i]==31) { // JALR
9083               alloc_reg(&current,i,RHASH);
9084               #ifndef HOST_IMM_ADDR32
9085               alloc_reg(&current,i,RHTBL);
9086               #endif
9087             }
9088             #endif
9089             delayslot_alloc(&current,i+1);
9090           } else {
9091             // The delay slot overwrites our source register, so
9092             // allocate a temporary register to hold the old value.
9093             current.isconst=0;
9094             current.wasconst=0;
9095             regs[i].wasconst=0;
9096             delayslot_alloc(&current,i+1);
9097             current.isconst=0;
9098             alloc_reg(&current,i,RTEMP);
9099           }
9100           //current.isconst=0; // DEBUG
9101           ooo[i]=1;
9102           ds=1;
9103           break;
9104         case CJUMP:
9105           //current.isconst=0;
9106           //current.wasconst=0;
9107           //regs[i].wasconst=0;
9108           clear_const(&current,rs1[i]);
9109           clear_const(&current,rs2[i]);
9110           if((opcode[i]&0x3E)==4) // BEQ/BNE
9111           {
9112             alloc_cc(&current,i);
9113             dirty_reg(&current,CCREG);
9114             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9115             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9116             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9117             {
9118               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9119               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9120             }
9121             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9122                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9123               // The delay slot overwrites one of our conditions.
9124               // Allocate the branch condition registers instead.
9125               current.isconst=0;
9126               current.wasconst=0;
9127               regs[i].wasconst=0;
9128               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9129               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9130               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9131               {
9132                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9133                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9134               }
9135             }
9136             else
9137             {
9138               ooo[i]=1;
9139               delayslot_alloc(&current,i+1);
9140             }
9141           }
9142           else
9143           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9144           {
9145             alloc_cc(&current,i);
9146             dirty_reg(&current,CCREG);
9147             alloc_reg(&current,i,rs1[i]);
9148             if(!(current.is32>>rs1[i]&1))
9149             {
9150               alloc_reg64(&current,i,rs1[i]);
9151             }
9152             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9153               // The delay slot overwrites one of our conditions.
9154               // Allocate the branch condition registers instead.
9155               current.isconst=0;
9156               current.wasconst=0;
9157               regs[i].wasconst=0;
9158               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9159               if(!((current.is32>>rs1[i])&1))
9160               {
9161                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9162               }
9163             }
9164             else
9165             {
9166               ooo[i]=1;
9167               delayslot_alloc(&current,i+1);
9168             }
9169           }
9170           else
9171           // Don't alloc the delay slot yet because we might not execute it
9172           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9173           {
9174             current.isconst=0;
9175             current.wasconst=0;
9176             regs[i].wasconst=0;
9177             alloc_cc(&current,i);
9178             dirty_reg(&current,CCREG);
9179             alloc_reg(&current,i,rs1[i]);
9180             alloc_reg(&current,i,rs2[i]);
9181             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9182             {
9183               alloc_reg64(&current,i,rs1[i]);
9184               alloc_reg64(&current,i,rs2[i]);
9185             }
9186           }
9187           else
9188           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9189           {
9190             current.isconst=0;
9191             current.wasconst=0;
9192             regs[i].wasconst=0;
9193             alloc_cc(&current,i);
9194             dirty_reg(&current,CCREG);
9195             alloc_reg(&current,i,rs1[i]);
9196             if(!(current.is32>>rs1[i]&1))
9197             {
9198               alloc_reg64(&current,i,rs1[i]);
9199             }
9200           }
9201           ds=1;
9202           //current.isconst=0;
9203           break;
9204         case SJUMP:
9205           //current.isconst=0;
9206           //current.wasconst=0;
9207           //regs[i].wasconst=0;
9208           clear_const(&current,rs1[i]);
9209           clear_const(&current,rt1[i]);
9210           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9211           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9212           {
9213             alloc_cc(&current,i);
9214             dirty_reg(&current,CCREG);
9215             alloc_reg(&current,i,rs1[i]);
9216             if(!(current.is32>>rs1[i]&1))
9217             {
9218               alloc_reg64(&current,i,rs1[i]);
9219             }
9220             if (rt1[i]==31) { // BLTZAL/BGEZAL
9221               alloc_reg(&current,i,31);
9222               dirty_reg(&current,31);
9223               //#ifdef REG_PREFETCH
9224               //alloc_reg(&current,i,PTEMP);
9225               //#endif
9226               //current.is32|=1LL<<rt1[i];
9227             }
9228             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9229                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9230               // Allocate the branch condition registers instead.
9231               current.isconst=0;
9232               current.wasconst=0;
9233               regs[i].wasconst=0;
9234               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9235               if(!((current.is32>>rs1[i])&1))
9236               {
9237                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9238               }
9239             }
9240             else
9241             {
9242               ooo[i]=1;
9243               delayslot_alloc(&current,i+1);
9244             }
9245           }
9246           else
9247           // Don't alloc the delay slot yet because we might not execute it
9248           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9249           {
9250             current.isconst=0;
9251             current.wasconst=0;
9252             regs[i].wasconst=0;
9253             alloc_cc(&current,i);
9254             dirty_reg(&current,CCREG);
9255             alloc_reg(&current,i,rs1[i]);
9256             if(!(current.is32>>rs1[i]&1))
9257             {
9258               alloc_reg64(&current,i,rs1[i]);
9259             }
9260           }
9261           ds=1;
9262           //current.isconst=0;
9263           break;
9264         case FJUMP:
9265           current.isconst=0;
9266           current.wasconst=0;
9267           regs[i].wasconst=0;
9268           if(likely[i]==0) // BC1F/BC1T
9269           {
9270             // TODO: Theoretically we can run out of registers here on x86.
9271             // The delay slot can allocate up to six, and we need to check
9272             // CSREG before executing the delay slot.  Possibly we can drop
9273             // the cycle count and then reload it after checking that the
9274             // FPU is in a usable state, or simply skip out-of-order execution here.
9275             alloc_cc(&current,i);
9276             dirty_reg(&current,CCREG);
9277             alloc_reg(&current,i,FSREG);
9278             alloc_reg(&current,i,CSREG);
9279             if(itype[i+1]==FCOMP) {
9280               // The delay slot overwrites the branch condition.
9281               // Allocate the branch condition registers instead.
9282               alloc_cc(&current,i);
9283               dirty_reg(&current,CCREG);
9284               alloc_reg(&current,i,CSREG);
9285               alloc_reg(&current,i,FSREG);
9286             }
9287             else {
9288               ooo[i]=1;
9289               delayslot_alloc(&current,i+1);
9290               alloc_reg(&current,i+1,CSREG);
9291             }
9292           }
9293           else
9294           // Don't alloc the delay slot yet because we might not execute it
9295           if(likely[i]) // BC1FL/BC1TL
9296           {
9297             alloc_cc(&current,i);
9298             dirty_reg(&current,CCREG);
9299             alloc_reg(&current,i,CSREG);
9300             alloc_reg(&current,i,FSREG);
9301           }
9302           ds=1;
9303           current.isconst=0;
9304           break;
9305         case IMM16:
9306           imm16_alloc(&current,i);
9307           break;
9308         case LOAD:
9309         case LOADLR:
9310           load_alloc(&current,i);
9311           break;
9312         case STORE:
9313         case STORELR:
9314           store_alloc(&current,i);
9315           break;
9316         case ALU:
9317           alu_alloc(&current,i);
9318           break;
9319         case SHIFT:
9320           shift_alloc(&current,i);
9321           break;
9322         case MULTDIV:
9323           multdiv_alloc(&current,i);
9324           break;
9325         case SHIFTIMM:
9326           shiftimm_alloc(&current,i);
9327           break;
9328         case MOV:
9329           mov_alloc(&current,i);
9330           break;
9331         case COP0:
9332           cop0_alloc(&current,i);
9333           break;
9334         case COP1:
9335         case COP2:
9336           cop1_alloc(&current,i);
9337           break;
9338         case C1LS:
9339           c1ls_alloc(&current,i);
9340           break;
9341         case C2LS:
9342           c2ls_alloc(&current,i);
9343           break;
9344         case C2OP:
9345           c2op_alloc(&current,i);
9346           break;
9347         case FCONV:
9348           fconv_alloc(&current,i);
9349           break;
9350         case FLOAT:
9351           float_alloc(&current,i);
9352           break;
9353         case FCOMP:
9354           fcomp_alloc(&current,i);
9355           break;
9356         case SYSCALL:
9357         case HLECALL:
9358         case INTCALL:
9359           syscall_alloc(&current,i);
9360           break;
9361         case SPAN:
9362           pagespan_alloc(&current,i);
9363           break;
9364       }
9365       
9366       // Drop the upper half of registers that have become 32-bit
9367       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9368       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9369         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9370         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9371         current.uu|=1;
9372       } else {
9373         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9374         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9375         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9376         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9377         current.uu|=1;
9378       }
9379
9380       // Create entry (branch target) regmap
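           // regmap_entry records, per host register, which guest register must
           // already be there when this instruction is entered from elsewhere;
           // -1 means nothing in particular is expected in that host register.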
9381       for(hr=0;hr<HOST_REGS;hr++)
9382       {
9383         int r,or,er;
9384         r=current.regmap[hr];
9385         if(r>=0) {
9386           if(r!=regmap_pre[i][hr]) {
9387             // TODO: delay slot (?)
9388             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9389             if(or<0||(r&63)>=TEMPREG){
9390               regs[i].regmap_entry[hr]=-1;
9391             }
9392             else
9393             {
9394               // Just move it to a different register
9395               regs[i].regmap_entry[hr]=r;
9396               // If it was dirty before, it's still dirty
9397               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9398             }
9399           }
9400           else
9401           {
9402             // Unneeded
9403             if(r==0){
9404               regs[i].regmap_entry[hr]=0;
9405             }
9406             else
9407             if(r<64){
9408               if((current.u>>r)&1) {
9409                 regs[i].regmap_entry[hr]=-1;
9410                 //regs[i].regmap[hr]=-1;
9411                 current.regmap[hr]=-1;
9412               }else
9413                 regs[i].regmap_entry[hr]=r;
9414             }
9415             else {
9416               if((current.uu>>(r&63))&1) {
9417                 regs[i].regmap_entry[hr]=-1;
9418                 //regs[i].regmap[hr]=-1;
9419                 current.regmap[hr]=-1;
9420               }else
9421                 regs[i].regmap_entry[hr]=r;
9422             }
9423           }
9424         } else {
9425           // Branches expect CCREG to be allocated at the target
9426           if(regmap_pre[i][hr]==CCREG) 
9427             regs[i].regmap_entry[hr]=CCREG;
9428           else
9429             regs[i].regmap_entry[hr]=-1;
9430         }
9431       }
9432       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9433     }
9434     /* Branch post-alloc */
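    // If the previous instruction was a branch, build branch_regs[i-1]:
    // roughly, the register state on the branch's taken path, with the delay
    // slot at i taken into account.  For "likely" branches the delay slot
    // executes only when the branch is taken, so it is allocated into
    // branch_regs[] and the not-taken path (current) only keeps CCREG.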
9435     if(i>0)
9436     {
9437       current.was32=current.is32;
9438       current.wasdirty=current.dirty;
9439       switch(itype[i-1]) {
9440         case UJUMP:
9441           memcpy(&branch_regs[i-1],&current,sizeof(current));
9442           branch_regs[i-1].isconst=0;
9443           branch_regs[i-1].wasconst=0;
9444           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9445           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9446           alloc_cc(&branch_regs[i-1],i-1);
9447           dirty_reg(&branch_regs[i-1],CCREG);
9448           if(rt1[i-1]==31) { // JAL
9449             alloc_reg(&branch_regs[i-1],i-1,31);
9450             dirty_reg(&branch_regs[i-1],31);
9451             branch_regs[i-1].is32|=1LL<<31;
9452           }
9453           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9454           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9455           break;
9456         case RJUMP:
9457           memcpy(&branch_regs[i-1],&current,sizeof(current));
9458           branch_regs[i-1].isconst=0;
9459           branch_regs[i-1].wasconst=0;
9460           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9461           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9462           alloc_cc(&branch_regs[i-1],i-1);
9463           dirty_reg(&branch_regs[i-1],CCREG);
9464           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9465           if(rt1[i-1]!=0) { // JALR
9466             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9467             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9468             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9469           }
9470           #ifdef USE_MINI_HT
9471           if(rs1[i-1]==31) { // JALR
9472             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9473             #ifndef HOST_IMM_ADDR32
9474             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9475             #endif
9476           }
9477           #endif
9478           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9479           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9480           break;
9481         case CJUMP:
9482           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9483           {
9484             alloc_cc(&current,i-1);
9485             dirty_reg(&current,CCREG);
9486             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9487                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9488               // The delay slot overwrote one of our conditions
9489               // Delay slot goes after the test (in order)
9490               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9491               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9492               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9493               current.u|=1;
9494               current.uu|=1;
9495               delayslot_alloc(&current,i);
9496               current.isconst=0;
9497             }
9498             else
9499             {
9500               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9501               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9502               // Alloc the branch condition registers
9503               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9504               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9505               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9506               {
9507                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9508                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9509               }
9510             }
9511             memcpy(&branch_regs[i-1],&current,sizeof(current));
9512             branch_regs[i-1].isconst=0;
9513             branch_regs[i-1].wasconst=0;
9514             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9515             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9516           }
9517           else
9518           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9519           {
9520             alloc_cc(&current,i-1);
9521             dirty_reg(&current,CCREG);
9522             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9523               // The delay slot overwrote the branch condition
9524               // Delay slot goes after the test (in order)
9525               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9526               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9527               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9528               current.u|=1;
9529               current.uu|=1;
9530               delayslot_alloc(&current,i);
9531               current.isconst=0;
9532             }
9533             else
9534             {
9535               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9536               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9537               // Alloc the branch condition register
9538               alloc_reg(&current,i-1,rs1[i-1]);
9539               if(!(current.is32>>rs1[i-1]&1))
9540               {
9541                 alloc_reg64(&current,i-1,rs1[i-1]);
9542               }
9543             }
9544             memcpy(&branch_regs[i-1],&current,sizeof(current));
9545             branch_regs[i-1].isconst=0;
9546             branch_regs[i-1].wasconst=0;
9547             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9548             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9549           }
9550           else
9551           // Alloc the delay slot in case the branch is taken
9552           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9553           {
9554             memcpy(&branch_regs[i-1],&current,sizeof(current));
9555             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9556             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9557             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9558             alloc_cc(&branch_regs[i-1],i);
9559             dirty_reg(&branch_regs[i-1],CCREG);
9560             delayslot_alloc(&branch_regs[i-1],i);
9561             branch_regs[i-1].isconst=0;
9562             alloc_reg(&current,i,CCREG); // Not taken path
9563             dirty_reg(&current,CCREG);
9564             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9565           }
9566           else
9567           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9568           {
9569             memcpy(&branch_regs[i-1],&current,sizeof(current));
9570             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9571             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9572             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9573             alloc_cc(&branch_regs[i-1],i);
9574             dirty_reg(&branch_regs[i-1],CCREG);
9575             delayslot_alloc(&branch_regs[i-1],i);
9576             branch_regs[i-1].isconst=0;
9577             alloc_reg(&current,i,CCREG); // Not taken path
9578             dirty_reg(&current,CCREG);
9579             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9580           }
9581           break;
9582         case SJUMP:
9583           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9584           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9585           {
9586             alloc_cc(&current,i-1);
9587             dirty_reg(&current,CCREG);
9588             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9589               // The delay slot overwrote the branch condition
9590               // Delay slot goes after the test (in order)
9591               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9592               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9593               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9594               current.u|=1;
9595               current.uu|=1;
9596               delayslot_alloc(&current,i);
9597               current.isconst=0;
9598             }
9599             else
9600             {
9601               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9602               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9603               // Alloc the branch condition register
9604               alloc_reg(&current,i-1,rs1[i-1]);
9605               if(!(current.is32>>rs1[i-1]&1))
9606               {
9607                 alloc_reg64(&current,i-1,rs1[i-1]);
9608               }
9609             }
9610             memcpy(&branch_regs[i-1],&current,sizeof(current));
9611             branch_regs[i-1].isconst=0;
9612             branch_regs[i-1].wasconst=0;
9613             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9614             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9615           }
9616           else
9617           // Alloc the delay slot in case the branch is taken
9618           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9619           {
9620             memcpy(&branch_regs[i-1],&current,sizeof(current));
9621             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9622             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9623             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9624             alloc_cc(&branch_regs[i-1],i);
9625             dirty_reg(&branch_regs[i-1],CCREG);
9626             delayslot_alloc(&branch_regs[i-1],i);
9627             branch_regs[i-1].isconst=0;
9628             alloc_reg(&current,i,CCREG); // Not taken path
9629             dirty_reg(&current,CCREG);
9630             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9631           }
9632           // FIXME: BLTZAL/BGEZAL
9633           if(opcode2[i-1]&0x10) { // BxxZAL
9634             alloc_reg(&branch_regs[i-1],i-1,31);
9635             dirty_reg(&branch_regs[i-1],31);
9636             branch_regs[i-1].is32|=1LL<<31;
9637           }
9638           break;
9639         case FJUMP:
9640           if(likely[i-1]==0) // BC1F/BC1T
9641           {
9642             alloc_cc(&current,i-1);
9643             dirty_reg(&current,CCREG);
9644             if(itype[i]==FCOMP) {
9645               // The delay slot overwrote the branch condition
9646               // Delay slot goes after the test (in order)
9647               delayslot_alloc(&current,i);
9648               current.isconst=0;
9649             }
9650             else
9651             {
9652               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9653               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9654               // Alloc the branch condition register
9655               alloc_reg(&current,i-1,FSREG);
9656             }
9657             memcpy(&branch_regs[i-1],&current,sizeof(current));
9658             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9659           }
9660           else // BC1FL/BC1TL
9661           {
9662             // Alloc the delay slot in case the branch is taken
9663             memcpy(&branch_regs[i-1],&current,sizeof(current));
9664             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9665             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9666             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9667             alloc_cc(&branch_regs[i-1],i);
9668             dirty_reg(&branch_regs[i-1],CCREG);
9669             delayslot_alloc(&branch_regs[i-1],i);
9670             branch_regs[i-1].isconst=0;
9671             alloc_reg(&current,i,CCREG); // Not taken path
9672             dirty_reg(&current,CCREG);
9673             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9674           }
9675           break;
9676       }
9677
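      // After an unconditional jump, or an always-taken beq $0,$0 (upper
      // halfword 0x1000), the next instruction is only reachable as a call
      // return point or as an internal branch target, so the register map
      // is rebuilt from scratch below.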
9678       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9679       {
9680         if(rt1[i-1]==31) // JAL/JALR
9681         {
9682           // Subroutine call will return here, don't alloc any registers
9683           current.is32=1;
9684           current.dirty=0;
9685           clear_all_regs(current.regmap);
9686           alloc_reg(&current,i,CCREG);
9687           dirty_reg(&current,CCREG);
9688         }
9689         else if(i+1<slen)
9690         {
9691           // Internal branch will jump here, match registers to caller
9692           current.is32=0x3FFFFFFFFLL;
9693           current.dirty=0;
9694           clear_all_regs(current.regmap);
9695           alloc_reg(&current,i,CCREG);
9696           dirty_reg(&current,CCREG);
9697           for(j=i-1;j>=0;j--)
9698           {
9699             if(ba[j]==start+i*4+4) {
9700               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9701               current.is32=branch_regs[j].is32;
9702               current.dirty=branch_regs[j].dirty;
9703               break;
9704             }
9705           }
9706           while(j>=0) {
9707             if(ba[j]==start+i*4+4) {
9708               for(hr=0;hr<HOST_REGS;hr++) {
9709                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9710                   current.regmap[hr]=-1;
9711                 }
9712                 current.is32&=branch_regs[j].is32;
9713                 current.dirty&=branch_regs[j].dirty;
9714               }
9715             }
9716             j--;
9717           }
9718         }
9719       }
9720     }
9721
9722     // Count cycles in between branches
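    // ccadj[i] is the cycle count accumulated since the last branch; it is
    // reset after branches and at syscalls/HLE calls.  Under PCSX, stores
    // and coprocessor loads/stores charge extra cycles (cases below).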
9723     ccadj[i]=cc;
9724     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9725     {
9726       cc=0;
9727     }
9728 #ifdef PCSX
9729     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9730     {
9731       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9732     }
9733     else if(itype[i]==C2LS)
9734     {
9735       cc+=4;
9736     }
9737 #endif
9738     else
9739     {
9740       cc++;
9741     }
9742
9743     flush_dirty_uppers(&current);
9744     if(!is_ds[i]) {
9745       regs[i].is32=current.is32;
9746       regs[i].dirty=current.dirty;
9747       regs[i].isconst=current.isconst;
9748       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9749     }
9750     for(hr=0;hr<HOST_REGS;hr++) {
9751       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9752         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9753           regs[i].wasconst&=~(1<<hr);
9754         }
9755       }
9756     }
9757     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9758   }
9759   
9760   /* Pass 4 - Cull unused host registers */
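  // Rough idea (illustrative sketch, not compiled): nr is a bitmask over
  // host registers, built by walking the block backwards.  A host register
  // stays "needed" while the guest register it holds is still read later,
  // and is culled once it is only ever overwritten.  The helpers below are
  // hypothetical; the real scan also special-cases branches, delay slots,
  // CCREG and INVCP.
  #if 0
  uint64_t needed=0;
  for(int k=slen-1;k>=0;k--) {
    needed&=~written(k);   // results get recreated, so old values die here
    needed|=read(k);       // source operands must survive up to this point
    needed_reg[k]=needed;
  }
  #endif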
9761   
9762   uint64_t nr=0;
9763   
9764   for (i=slen-1;i>=0;i--)
9765   {
9766     int hr;
9767     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9768     {
9769       if(ba[i]<start || ba[i]>=(start+slen*4))
9770       {
9771         // Branch out of this block, don't need anything
9772         nr=0;
9773       }
9774       else
9775       {
9776         // Internal branch
9777         // Need whatever matches the target
9778         nr=0;
9779         int t=(ba[i]-start)>>2;
9780         for(hr=0;hr<HOST_REGS;hr++)
9781         {
9782           if(regs[i].regmap_entry[hr]>=0) {
9783             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9784           }
9785         }
9786       }
9787       // Conditional branch may need registers for following instructions
9788       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9789       {
9790         if(i<slen-2) {
9791           nr|=needed_reg[i+2];
9792           for(hr=0;hr<HOST_REGS;hr++)
9793           {
9794             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9795             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9796           }
9797         }
9798       }
9799       // Don't need stuff which is overwritten
9800       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9801       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9802       // Merge in delay slot
9803       for(hr=0;hr<HOST_REGS;hr++)
9804       {
9805         if(!likely[i]) {
9806           // These are overwritten unless the branch is "likely",
9807           // in which case the delay slot is nullified when not taken
9808           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9809           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9810         }
9811         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9812         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9813         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9814         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9815         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9816         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9817         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9818         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9819         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9820           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9821           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9822         }
9823         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9824           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9825           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9826         }
9827         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9828           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9829           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9830         }
9831       }
9832     }
9833     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9834     {
9835       // SYSCALL instruction (software interrupt)
9836       nr=0;
9837     }
9838     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9839     {
9840       // ERET instruction (return from interrupt)
9841       nr=0;
9842     }
9843     else // Non-branch
9844     {
9845       if(i<slen-1) {
9846         for(hr=0;hr<HOST_REGS;hr++) {
9847           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9848           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9849           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9850           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9851         }
9852       }
9853     }
9854     for(hr=0;hr<HOST_REGS;hr++)
9855     {
9856       // Overwritten registers are not needed
9857       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9858       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9859       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9860       // Source registers are needed
9861       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9862       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9863       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9864       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9865       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9866       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9867       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9868       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9869       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9870         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9871         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9872       }
9873       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9874         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9875         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9876       }
9877       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9878         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9879         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9880       }
9881       // Don't store a register immediately after writing it;
9882       // doing so may prevent dual-issue.
9883       // But do so if this is a branch target, otherwise we
9884       // might have to load the register before the branch.
9885       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9886         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9887            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9888           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9889           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9890         }
9891         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9892            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9893           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9894           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9895         }
9896       }
9897     }
9898     // Cycle count is needed at branches.  Assume it is needed at the target too.
9899     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9900       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9901       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9902     }
9903     // Save it
9904     needed_reg[i]=nr;
9905     
9906     // Deallocate unneeded registers
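    // A mapping is only dropped if it is not a source or destination of this
    // instruction (or of the delay slot after a branch) and is not one of
    // the special temporaries the emitter may still rely on (CCREG, PTEMP,
    // FTEMP, RHASH/RHTBL/RTEMP and the TLREG/INVCP map register).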
9907     for(hr=0;hr<HOST_REGS;hr++)
9908     {
9909       if(!((nr>>hr)&1)) {
9910         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9911         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9912            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9913            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9914         {
9915           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9916           {
9917             if(likely[i]) {
9918               regs[i].regmap[hr]=-1;
9919               regs[i].isconst&=~(1<<hr);
9920               if(i<slen-2) {
9921                 regmap_pre[i+2][hr]=-1;
9922                 regs[i+2].wasconst&=~(1<<hr);
9923               }
9924             }
9925           }
9926         }
9927         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9928         {
9929           int d1=0,d2=0,map=0,temp=0;
9930           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9931           {
9932             d1=dep1[i+1];
9933             d2=dep2[i+1];
9934           }
9935           if(using_tlb) {
9936             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9937                itype[i+1]==STORE || itype[i+1]==STORELR ||
9938                itype[i+1]==C1LS || itype[i+1]==C2LS)
9939             map=TLREG;
9940           } else
9941           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9942              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9943             map=INVCP;
9944           }
9945           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9946              itype[i+1]==C1LS || itype[i+1]==C2LS)
9947             temp=FTEMP;
9948           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9949              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9950              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9951              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9952              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9953              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9954              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9955              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9956              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9957              regs[i].regmap[hr]!=map )
9958           {
9959             regs[i].regmap[hr]=-1;
9960             regs[i].isconst&=~(1<<hr);
9961             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9962                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9963                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9964                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9965                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9966                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9967                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9968                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9969                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9970                branch_regs[i].regmap[hr]!=map)
9971             {
9972               branch_regs[i].regmap[hr]=-1;
9973               branch_regs[i].regmap_entry[hr]=-1;
9974               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9975               {
9976                 if(!likely[i]&&i<slen-2) {
9977                   regmap_pre[i+2][hr]=-1;
9978                   regs[i+2].wasconst&=~(1<<hr);
9979                 }
9980               }
9981             }
9982           }
9983         }
9984         else
9985         {
9986           // Non-branch
9987           if(i>0)
9988           {
9989             int d1=0,d2=0,map=-1,temp=-1;
9990             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9991             {
9992               d1=dep1[i];
9993               d2=dep2[i];
9994             }
9995             if(using_tlb) {
9996               if(itype[i]==LOAD || itype[i]==LOADLR ||
9997                  itype[i]==STORE || itype[i]==STORELR ||
9998                  itype[i]==C1LS || itype[i]==C2LS)
9999               map=TLREG;
10000             } else if(itype[i]==STORE || itype[i]==STORELR ||
10001                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10002               map=INVCP;
10003             }
10004             if(itype[i]==LOADLR || itype[i]==STORELR ||
10005                itype[i]==C1LS || itype[i]==C2LS)
10006               temp=FTEMP;
10007             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10008                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10009                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10010                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10011                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10012                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10013             {
10014               if(i<slen-1&&!is_ds[i]) {
10015                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10016                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10017                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10018                 {
10019                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10020                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10021                 }
10022                 regmap_pre[i+1][hr]=-1;
10023                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10024                 regs[i+1].wasconst&=~(1<<hr);
10025               }
10026               regs[i].regmap[hr]=-1;
10027               regs[i].isconst&=~(1<<hr);
10028             }
10029           }
10030         }
10031       }
10032     }
10033   }
10034   
10035   /* Pass 5 - Pre-allocate registers */
10036   
10037   // If a register is allocated during a loop, try to allocate it for the
10038   // entire loop, if possible.  This avoids loading/storing registers
10039   // inside of the loop.
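  // For example, when a backward branch at i targets t, the scan below tries
  // to keep a guest register in the same host register over the whole range
  // [t..i], so the loop body does not reload it on every iteration.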
10040   
10041   signed char f_regmap[HOST_REGS];
10042   clear_all_regs(f_regmap);
10043   for(i=0;i<slen-1;i++)
10044   {
10045     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10046     {
10047       if(ba[i]>=start && ba[i]<(start+i*4)) 
10048       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10049       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10050       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10051       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10052       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10053       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10054       {
10055         int t=(ba[i]-start)>>2;
10056         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10057         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10058         for(hr=0;hr<HOST_REGS;hr++)
10059         {
10060           if(regs[i].regmap[hr]>64) {
10061             if(!((regs[i].dirty>>hr)&1))
10062               f_regmap[hr]=regs[i].regmap[hr];
10063             else f_regmap[hr]=-1;
10064           }
10065           else if(regs[i].regmap[hr]>=0) {
10066             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10067               // dealloc old register
10068               int n;
10069               for(n=0;n<HOST_REGS;n++)
10070               {
10071                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10072               }
10073               // and alloc new one
10074               f_regmap[hr]=regs[i].regmap[hr];
10075             }
10076           }
10077           if(branch_regs[i].regmap[hr]>64) {
10078             if(!((branch_regs[i].dirty>>hr)&1))
10079               f_regmap[hr]=branch_regs[i].regmap[hr];
10080             else f_regmap[hr]=-1;
10081           }
10082           else if(branch_regs[i].regmap[hr]>=0) {
10083             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10084               // dealloc old register
10085               int n;
10086               for(n=0;n<HOST_REGS;n++)
10087               {
10088                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10089               }
10090               // and alloc new one
10091               f_regmap[hr]=branch_regs[i].regmap[hr];
10092             }
10093           }
10094           if(ooo[i]) {
10095             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10096               f_regmap[hr]=branch_regs[i].regmap[hr];
10097           }else{
10098             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10099               f_regmap[hr]=branch_regs[i].regmap[hr];
10100           }
10101           // Avoid dirty->clean transition
10102           #ifdef DESTRUCTIVE_WRITEBACK
10103           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10104           #endif
10105           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10106           // case above; however, it's always a good idea.  We can't hoist the
10107           // load if the register was already allocated, so there's no point
10108           // wasting time analyzing most of these cases.  It only "succeeds"
10109           // when the mapping was different and the load can be replaced with
10110           // a mov, which is of negligible benefit.  So such cases are
10111           // skipped below.
10112           if(f_regmap[hr]>0) {
10113             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10114               int r=f_regmap[hr];
10115               for(j=t;j<=i;j++)
10116               {
10117                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10118                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10119                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10120                 if(r>63) {
10121                   // NB This can exclude the case where the upper-half
10122                   // register is lower numbered than the lower-half
10123                   // register.  Not sure if it's worth fixing...
10124                   if(get_reg(regs[j].regmap,r&63)<0) break;
10125                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10126                   if(regs[j].is32&(1LL<<(r&63))) break;
10127                 }
10128                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10129                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10130                   int k;
10131                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10132                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10133                     if(r>63) {
10134                       if(get_reg(regs[i].regmap,r&63)<0) break;
10135                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10136                     }
10137                     k=i;
10138                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10139                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10140                         //printf("no free regs for store %x\n",start+(k-1)*4);
10141                         break;
10142                       }
10143                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10144                         //printf("no-match due to different register\n");
10145                         break;
10146                       }
10147                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10148                         //printf("no-match due to branch\n");
10149                         break;
10150                       }
10151                       // call/ret fast path assumes no registers allocated
10152                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10153                         break;
10154                       }
10155                       if(r>63) {
10156                         // NB This can exclude the case where the upper-half
10157                         // register is lower numbered than the lower-half
10158                         // register.  Not sure if it's worth fixing...
10159                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10160                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10161                       }
10162                       k--;
10163                     }
10164                     if(i<slen-1) {
10165                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10166                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10167                         //printf("bad match after branch\n");
10168                         break;
10169                       }
10170                     }
10171                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10172                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10173                       while(k<i) {
10174                         regs[k].regmap_entry[hr]=f_regmap[hr];
10175                         regs[k].regmap[hr]=f_regmap[hr];
10176                         regmap_pre[k+1][hr]=f_regmap[hr];
10177                         regs[k].wasdirty&=~(1<<hr);
10178                         regs[k].dirty&=~(1<<hr);
10179                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10180                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10181                         regs[k].wasconst&=~(1<<hr);
10182                         regs[k].isconst&=~(1<<hr);
10183                         k++;
10184                       }
10185                     }
10186                     else {
10187                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10188                       break;
10189                     }
10190                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10191                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10192                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10193                       regs[i].regmap_entry[hr]=f_regmap[hr];
10194                       regs[i].regmap[hr]=f_regmap[hr];
10195                       regs[i].wasdirty&=~(1<<hr);
10196                       regs[i].dirty&=~(1<<hr);
10197                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10198                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10199                       regs[i].wasconst&=~(1<<hr);
10200                       regs[i].isconst&=~(1<<hr);
10201                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10202                       branch_regs[i].wasdirty&=~(1<<hr);
10203                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10204                       branch_regs[i].regmap[hr]=f_regmap[hr];
10205                       branch_regs[i].dirty&=~(1<<hr);
10206                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10207                       branch_regs[i].wasconst&=~(1<<hr);
10208                       branch_regs[i].isconst&=~(1<<hr);
10209                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10210                         regmap_pre[i+2][hr]=f_regmap[hr];
10211                         regs[i+2].wasdirty&=~(1<<hr);
10212                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10213                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10214                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10215                       }
10216                     }
10217                   }
10218                   for(k=t;k<j;k++) {
10219                     // Alloc register clean at beginning of loop,
10220                     // but may dirty it in pass 6
10221                     regs[k].regmap_entry[hr]=f_regmap[hr];
10222                     regs[k].regmap[hr]=f_regmap[hr];
10223                     regs[k].dirty&=~(1<<hr);
10224                     regs[k].wasconst&=~(1<<hr);
10225                     regs[k].isconst&=~(1<<hr);
10226                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10227                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10228                       branch_regs[k].regmap[hr]=f_regmap[hr];
10229                       branch_regs[k].dirty&=~(1<<hr);
10230                       branch_regs[k].wasconst&=~(1<<hr);
10231                       branch_regs[k].isconst&=~(1<<hr);
10232                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10233                         regmap_pre[k+2][hr]=f_regmap[hr];
10234                         regs[k+2].wasdirty&=~(1<<hr);
10235                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10236                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10237                       }
10238                     }
10239                     else
10240                     {
10241                       regmap_pre[k+1][hr]=f_regmap[hr];
10242                       regs[k+1].wasdirty&=~(1<<hr);
10243                     }
10244                   }
10245                   if(regs[j].regmap[hr]==f_regmap[hr])
10246                     regs[j].regmap_entry[hr]=f_regmap[hr];
10247                   break;
10248                 }
10249                 if(j==i) break;
10250                 if(regs[j].regmap[hr]>=0)
10251                   break;
10252                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10253                   //printf("no-match due to different register\n");
10254                   break;
10255                 }
10256                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10257                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10258                   break;
10259                 }
10260                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10261                 {
10262                   // Stop on unconditional branch
10263                   break;
10264                 }
10265                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10266                 {
10267                   if(ooo[j]) {
10268                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10269                       break;
10270                   }else{
10271                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10272                       break;
10273                   }
10274                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10275                     //printf("no-match due to different register (branch)\n");
10276                     break;
10277                   }
10278                 }
10279                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10280                   //printf("No free regs for store %x\n",start+j*4);
10281                   break;
10282                 }
10283                 if(f_regmap[hr]>=64) {
10284                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10285                     break;
10286                   }
10287                   else
10288                   {
10289                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10290                       break;
10291                     }
10292                   }
10293                 }
10294               }
10295             }
10296           }
10297         }
10298       }
10299     }else{
10300       // Non-branch or undetermined branch target
10301       for(hr=0;hr<HOST_REGS;hr++)
10302       {
10303         if(hr!=EXCLUDE_REG) {
10304           if(regs[i].regmap[hr]>64) {
10305             if(!((regs[i].dirty>>hr)&1))
10306               f_regmap[hr]=regs[i].regmap[hr];
10307           }
10308           else if(regs[i].regmap[hr]>=0) {
10309             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10310               // dealloc old register
10311               int n;
10312               for(n=0;n<HOST_REGS;n++)
10313               {
10314                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10315               }
10316               // and alloc new one
10317               f_regmap[hr]=regs[i].regmap[hr];
10318             }
10319           }
10320         }
10321       }
10322       // Try to restore cycle count at branch targets
10323       if(bt[i]) {
10324         for(j=i;j<slen-1;j++) {
10325           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10326           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10327             //printf("no free regs for store %x\n",start+j*4);
10328             break;
10329           }
10330         }
10331         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10332           int k=i;
10333           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10334           while(k<j) {
10335             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10336             regs[k].regmap[HOST_CCREG]=CCREG;
10337             regmap_pre[k+1][HOST_CCREG]=CCREG;
10338             regs[k+1].wasdirty|=1<<HOST_CCREG;
10339             regs[k].dirty|=1<<HOST_CCREG;
10340             regs[k].wasconst&=~(1<<HOST_CCREG);
10341             regs[k].isconst&=~(1<<HOST_CCREG);
10342             k++;
10343           }
10344           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10345         }
10346         // Work backwards from the branch target
10347         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10348         {
10349           //printf("Extend backwards\n");
10350           int k;
10351           k=i;
10352           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10353             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10354               //printf("no free regs for store %x\n",start+(k-1)*4);
10355               break;
10356             }
10357             k--;
10358           }
10359           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10360             //printf("Extend CC, %x ->\n",start+k*4);
10361             while(k<=i) {
10362               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10363               regs[k].regmap[HOST_CCREG]=CCREG;
10364               regmap_pre[k+1][HOST_CCREG]=CCREG;
10365               regs[k+1].wasdirty|=1<<HOST_CCREG;
10366               regs[k].dirty|=1<<HOST_CCREG;
10367               regs[k].wasconst&=~(1<<HOST_CCREG);
10368               regs[k].isconst&=~(1<<HOST_CCREG);
10369               k++;
10370             }
10371           }
10372           else {
10373             //printf("Fail Extend CC, %x ->\n",start+k*4);
10374           }
10375         }
10376       }
10377       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10378          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10379          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10380          itype[i]!=FCONV&&itype[i]!=FCOMP)
10381       {
10382         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10383       }
10384     }
10385   }
10386   
10387   // Cache memory offset or tlb map pointer if a register is available
10388   #ifndef HOST_IMM_ADDR32
10389   #ifndef RAM_OFFSET
10390   if(using_tlb)
10391   #endif
10392   {
10393     int earliest_available[HOST_REGS];
10394     int loop_start[HOST_REGS];
10395     int score[HOST_REGS];
10396     int end[HOST_REGS];
10397     int reg=using_tlb?MMREG:ROREG;
10398
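    // Heuristic: for each host register hr, score[hr] counts the memory
    // accesses that could use a cached map/offset pointer while hr stays
    // free; earliest_available[hr] marks where hr last became free and
    // loop_start[hr] allows hoisting the preload to a loop head.  If the
    // best score is greater than 1, 'reg' (MMREG or ROREG) is allocated
    // over that range.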
10399     // Init
10400     for(hr=0;hr<HOST_REGS;hr++) {
10401       score[hr]=0;earliest_available[hr]=0;
10402       loop_start[hr]=MAXBLOCK;
10403     }
10404     for(i=0;i<slen-1;i++)
10405     {
10406       // Can't do anything if no registers are available
10407       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10408         for(hr=0;hr<HOST_REGS;hr++) {
10409           score[hr]=0;earliest_available[hr]=i+1;
10410           loop_start[hr]=MAXBLOCK;
10411         }
10412       }
10413       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10414         if(!ooo[i]) {
10415           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10416             for(hr=0;hr<HOST_REGS;hr++) {
10417               score[hr]=0;earliest_available[hr]=i+1;
10418               loop_start[hr]=MAXBLOCK;
10419             }
10420           }
10421         }else{
10422           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10423             for(hr=0;hr<HOST_REGS;hr++) {
10424               score[hr]=0;earliest_available[hr]=i+1;
10425               loop_start[hr]=MAXBLOCK;
10426             }
10427           }
10428         }
10429       }
10430       // Mark unavailable registers
10431       for(hr=0;hr<HOST_REGS;hr++) {
10432         if(regs[i].regmap[hr]>=0) {
10433           score[hr]=0;earliest_available[hr]=i+1;
10434           loop_start[hr]=MAXBLOCK;
10435         }
10436         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10437           if(branch_regs[i].regmap[hr]>=0) {
10438             score[hr]=0;earliest_available[hr]=i+2;
10439             loop_start[hr]=MAXBLOCK;
10440           }
10441         }
10442       }
10443       // No register allocations after unconditional jumps
10444       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10445       {
10446         for(hr=0;hr<HOST_REGS;hr++) {
10447           score[hr]=0;earliest_available[hr]=i+2;
10448           loop_start[hr]=MAXBLOCK;
10449         }
10450         i++; // Skip delay slot too
10451         //printf("skip delay slot: %x\n",start+i*4);
10452       }
10453       else
10454       // Possible match
10455       if(itype[i]==LOAD||itype[i]==LOADLR||
10456          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10457         for(hr=0;hr<HOST_REGS;hr++) {
10458           if(hr!=EXCLUDE_REG) {
10459             end[hr]=i-1;
10460             for(j=i;j<slen-1;j++) {
10461               if(regs[j].regmap[hr]>=0) break;
10462               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10463                 if(branch_regs[j].regmap[hr]>=0) break;
10464                 if(ooo[j]) {
10465                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10466                 }else{
10467                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10468                 }
10469               }
10470               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10471               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10472                 int t=(ba[j]-start)>>2;
10473                 if(t<j&&t>=earliest_available[hr]) {
10474                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10475                     // Score a point for hoisting loop invariant
10476                     if(t<loop_start[hr]) loop_start[hr]=t;
10477                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10478                     score[hr]++;
10479                     end[hr]=j;
10480                   }
10481                 }
10482                 else if(t<j) {
10483                   if(regs[t].regmap[hr]==reg) {
10484                     // Score a point if the branch target matches this register
10485                     score[hr]++;
10486                     end[hr]=j;
10487                   }
10488                 }
10489                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10490                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10491                   score[hr]++;
10492                   end[hr]=j;
10493                 }
10494               }
10495               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10496               {
10497                 // Stop on unconditional branch
10498                 break;
10499               }
10500               else
10501               if(itype[j]==LOAD||itype[j]==LOADLR||
10502                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10503                 score[hr]++;
10504                 end[hr]=j;
10505               }
10506             }
10507           }
10508         }
10509         // Find highest score and allocate that register
10510         int maxscore=0;
10511         for(hr=0;hr<HOST_REGS;hr++) {
10512           if(hr!=EXCLUDE_REG) {
10513             if(score[hr]>score[maxscore]) {
10514               maxscore=hr;
10515               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10516             }
10517           }
10518         }
10519         if(score[maxscore]>1)
10520         {
10521           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10522           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10523             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10524             assert(regs[j].regmap[maxscore]<0);
10525             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10526             regs[j].regmap[maxscore]=reg;
10527             regs[j].dirty&=~(1<<maxscore);
10528             regs[j].wasconst&=~(1<<maxscore);
10529             regs[j].isconst&=~(1<<maxscore);
10530             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10531               branch_regs[j].regmap[maxscore]=reg;
10532               branch_regs[j].wasdirty&=~(1<<maxscore);
10533               branch_regs[j].dirty&=~(1<<maxscore);
10534               branch_regs[j].wasconst&=~(1<<maxscore);
10535               branch_regs[j].isconst&=~(1<<maxscore);
10536               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10537                 regmap_pre[j+2][maxscore]=reg;
10538                 regs[j+2].wasdirty&=~(1<<maxscore);
10539               }
10540               // loop optimization (loop_preload)
10541               int t=(ba[j]-start)>>2;
10542               if(t==loop_start[maxscore]) {
10543                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10544                   regs[t].regmap_entry[maxscore]=reg;
10545               }
10546             }
10547             else
10548             {
10549               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10550                 regmap_pre[j+1][maxscore]=reg;
10551                 regs[j+1].wasdirty&=~(1<<maxscore);
10552               }
10553             }
10554           }
10555           i=j-1;
10556           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10557           for(hr=0;hr<HOST_REGS;hr++) {
10558             score[hr]=0;earliest_available[hr]=i+1; /* assumption: the original "i+i" read as a typo for i+1 */
10559             loop_start[hr]=MAXBLOCK;
10560           }
10561         }
10562       }
10563     }
10564   }
10565   #endif
10566   
10567   // This allocates registers (if possible) one instruction prior
10568   // to use, which can avoid a load-use penalty on certain CPUs.
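  // For example, if the instruction at i+1 reads rs1 from host register hr
  // and hr is still free at i, the code below maps hr at i as well, so the
  // guest register can be loaded one native instruction earlier.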
10569   for(i=0;i<slen-1;i++)
10570   {
10571     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10572     {
10573       if(!bt[i+1])
10574       {
10575         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10576            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10577         {
10578           if(rs1[i+1]) {
10579             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10580             {
10581               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10582               {
10583                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10584                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10585                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10586                 regs[i].isconst&=~(1<<hr);
10587                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10588                 constmap[i][hr]=constmap[i+1][hr];
10589                 regs[i+1].wasdirty&=~(1<<hr);
10590                 regs[i].dirty&=~(1<<hr);
10591               }
10592             }
10593           }
10594           if(rs2[i+1]) {
10595             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10596             {
10597               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10598               {
10599                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10600                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10601                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10602                 regs[i].isconst&=~(1<<hr);
10603                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10604                 constmap[i][hr]=constmap[i+1][hr];
10605                 regs[i+1].wasdirty&=~(1<<hr);
10606                 regs[i].dirty&=~(1<<hr);
10607               }
10608             }
10609           }
10610           // Preload target address for load instruction (non-constant)
10611           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10612             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10613             {
10614               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10615               {
10616                 regs[i].regmap[hr]=rs1[i+1];
10617                 regmap_pre[i+1][hr]=rs1[i+1];
10618                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10619                 regs[i].isconst&=~(1<<hr);
10620                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10621                 constmap[i][hr]=constmap[i+1][hr];
10622                 regs[i+1].wasdirty&=~(1<<hr);
10623                 regs[i].dirty&=~(1<<hr);
10624               }
10625             }
10626           }
10627           // Load source into target register 
10628           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10629             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10630             {
10631               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10632               {
10633                 regs[i].regmap[hr]=rs1[i+1];
10634                 regmap_pre[i+1][hr]=rs1[i+1];
10635                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10636                 regs[i].isconst&=~(1<<hr);
10637                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10638                 constmap[i][hr]=constmap[i+1][hr];
10639                 regs[i+1].wasdirty&=~(1<<hr);
10640                 regs[i].dirty&=~(1<<hr);
10641               }
10642             }
10643           }
10644           // Preload map address
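                // TLREG holds the pointer into the memory map used for address
                // translation; when the source address is already known to be
                // constant, a scratch slot (MGEN1/MGEN2, presumably "map
                // generator" temporaries) can be claimed one instruction early
                // so address_generation() finds the map entry precomputed.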
10645           #ifndef HOST_IMM_ADDR32
10646           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10647             hr=get_reg(regs[i+1].regmap,TLREG);
10648             if(hr>=0) {
10649               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10650               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10651                 int nr;
10652                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10653                 {
10654                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10655                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10656                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10657                   regs[i].isconst&=~(1<<hr);
10658                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10659                   constmap[i][hr]=constmap[i+1][hr];
10660                   regs[i+1].wasdirty&=~(1<<hr);
10661                   regs[i].dirty&=~(1<<hr);
10662                 }
10663                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10664                 {
10665                   // move it to another register
10666                   regs[i+1].regmap[hr]=-1;
10667                   regmap_pre[i+2][hr]=-1;
10668                   regs[i+1].regmap[nr]=TLREG;
10669                   regmap_pre[i+2][nr]=TLREG;
10670                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10671                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10672                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10673                   regs[i].isconst&=~(1<<nr);
10674                   regs[i+1].isconst&=~(1<<nr);
10675                   regs[i].dirty&=~(1<<nr);
10676                   regs[i+1].wasdirty&=~(1<<nr);
10677                   regs[i+1].dirty&=~(1<<nr);
10678                   regs[i+2].wasdirty&=~(1<<nr);
10679                 }
10680               }
10681             }
10682           }
10683           #endif
10684           // Address for store instruction (non-constant)
10685           if(itype[i+1]==STORE||itype[i+1]==STORELR
10686              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10687             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10688               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10689               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10690               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10691               assert(hr>=0);
10692               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10693               {
10694                 regs[i].regmap[hr]=rs1[i+1];
10695                 regmap_pre[i+1][hr]=rs1[i+1];
10696                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10697                 regs[i].isconst&=~(1<<hr);
10698                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10699                 constmap[i][hr]=constmap[i+1][hr];
10700                 regs[i+1].wasdirty&=~(1<<hr);
10701                 regs[i].dirty&=~(1<<hr);
10702               }
10703             }
10704           }
10705           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10706             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10707               int nr;
10708               hr=get_reg(regs[i+1].regmap,FTEMP);
10709               assert(hr>=0);
10710               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10711               {
10712                 regs[i].regmap[hr]=rs1[i+1];
10713                 regmap_pre[i+1][hr]=rs1[i+1];
10714                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10715                 regs[i].isconst&=~(1<<hr);
10716                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10717                 constmap[i][hr]=constmap[i+1][hr];
10718                 regs[i+1].wasdirty&=~(1<<hr);
10719                 regs[i].dirty&=~(1<<hr);
10720               }
10721               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10722               {
10723                 // move it to another register
10724                 regs[i+1].regmap[hr]=-1;
10725                 regmap_pre[i+2][hr]=-1;
10726                 regs[i+1].regmap[nr]=FTEMP;
10727                 regmap_pre[i+2][nr]=FTEMP;
10728                 regs[i].regmap[nr]=rs1[i+1];
10729                 regmap_pre[i+1][nr]=rs1[i+1];
10730                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10731                 regs[i].isconst&=~(1<<nr);
10732                 regs[i+1].isconst&=~(1<<nr);
10733                 regs[i].dirty&=~(1<<nr);
10734                 regs[i+1].wasdirty&=~(1<<nr);
10735                 regs[i+1].dirty&=~(1<<nr);
10736                 regs[i+2].wasdirty&=~(1<<nr);
10737               }
10738             }
10739           }
10740           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10741             if(itype[i+1]==LOAD) 
10742               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10743             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10744               hr=get_reg(regs[i+1].regmap,FTEMP);
10745             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10746               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10747               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10748             }
10749             if(hr>=0&&regs[i].regmap[hr]<0) {
10750               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10751               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10752                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10753                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10754                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10755                 regs[i].isconst&=~(1<<hr);
10756                 regs[i+1].wasdirty&=~(1<<hr);
10757                 regs[i].dirty&=~(1<<hr);
10758               }
10759             }
10760           }
10761         }
10762       }
10763     }
10764   }
10765   
10766   /* Pass 6 - Optimize clean/dirty state */
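        // clean_registers() walks the block and works out, for each point, which
        // cached (dirty) registers actually have to be written back, so the
        // assembler below can skip redundant stores to the register file.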
10767   clean_registers(0,slen-1,1);
10768   
10769   /* Pass 7 - Identify 32-bit registers */
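        // On the 64-bit core this works backwards through the block computing
        // requires_32bit[]: the set of registers that must be known sign-extended
        // 32-bit values at each entry point.  Entry points with a non-empty set
        // get "restricted" jump_in entries later on.  FORCE32 builds (PSX) skip
        // all of this and only mark branch targets.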
10770 #ifndef FORCE32
10771   provisional_r32();
10772
10773   u_int r32=0;
10774   
10775   for (i=slen-1;i>=0;i--)
10776   {
10777     int hr;
10778     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10779     {
10780       if(ba[i]<start || ba[i]>=(start+slen*4))
10781       {
10782         // Branch out of this block, don't need anything
10783         r32=0;
10784       }
10785       else
10786       {
10787         // Internal branch
10788         // Need whatever matches the target
10789         // (and doesn't get overwritten by the delay slot instruction)
10790         r32=0;
10791         int t=(ba[i]-start)>>2;
10792         if(ba[i]>start+i*4) {
10793           // Forward branch
10794           if(!(requires_32bit[t]&~regs[i].was32))
10795             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10796         }else{
10797           // Backward branch
10798           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10799           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10800           if(!(pr32[t]&~regs[i].was32))
10801             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10802         }
10803       }
10804       // Conditional branch may need registers for following instructions
10805       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10806       {
10807         if(i<slen-2) {
10808           r32|=requires_32bit[i+2];
10809           r32&=regs[i].was32;
10810           // Mark this address as a branch target since it may be called
10811           // upon return from interrupt
10812           bt[i+2]=1;
10813         }
10814       }
10815       // Merge in delay slot
10816       if(!likely[i]) {
10817         // These are overwritten unless the branch is "likely"
10818         // and the delay slot is nullified if not taken
10819         r32&=~(1LL<<rt1[i+1]);
10820         r32&=~(1LL<<rt2[i+1]);
10821       }
10822       // Assume these are needed (delay slot)
10823       if(us1[i+1]>0)
10824       {
10825         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10826       }
10827       if(us2[i+1]>0)
10828       {
10829         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10830       }
10831       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10832       {
10833         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10834       }
10835       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10836       {
10837         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10838       }
10839     }
10840     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10841     {
10842       // SYSCALL instruction (software interrupt)
10843       r32=0;
10844     }
10845     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10846     {
10847       // ERET instruction (return from interrupt)
10848       r32=0;
10849     }
10850     // Check 32 bits
10851     r32&=~(1LL<<rt1[i]);
10852     r32&=~(1LL<<rt2[i]);
10853     if(us1[i]>0)
10854     {
10855       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10856     }
10857     if(us2[i]>0)
10858     {
10859       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10860     }
10861     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10862     {
10863       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10864     }
10865     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10866     {
10867       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10868     }
10869     requires_32bit[i]=r32;
10870     
10871     // Dirty registers which are 32-bit require 32-bit input,
10872     // as they will be written back as 32-bit values
10873     for(hr=0;hr<HOST_REGS;hr++)
10874     {
10875       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10876         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10877           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10878           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10879         }
10880       }
10881     }
10882     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10883   }
10884 #else
10885   for (i=slen-1;i>=0;i--)
10886   {
10887     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10888     {
10889       // Conditional branch
10890       if((source[i]>>16)!=0x1000&&i<slen-2) {
10891         // Mark this address as a branch target since it may be called
10892         // upon return from interrupt
10893         bt[i+2]=1;
10894       }
10895     }
10896   }
10897 #endif
10898
10899   if(itype[slen-1]==SPAN) {
10900     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
10901   }
10902
10903 #ifdef DISASM
10904   /* Debug/disassembly */
10905   for(i=0;i<slen;i++)
10906   {
10907     printf("U:");
10908     int r;
10909     for(r=1;r<=CCREG;r++) {
10910       if((unneeded_reg[i]>>r)&1) {
10911         if(r==HIREG) printf(" HI");
10912         else if(r==LOREG) printf(" LO");
10913         else printf(" r%d",r);
10914       }
10915     }
10916 #ifndef FORCE32
10917     printf(" UU:");
10918     for(r=1;r<=CCREG;r++) {
10919       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10920         if(r==HIREG) printf(" HI");
10921         else if(r==LOREG) printf(" LO");
10922         else printf(" r%d",r);
10923       }
10924     }
10925     printf(" 32:");
10926     for(r=0;r<=CCREG;r++) {
10927       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10928       if((regs[i].was32>>r)&1) {
10929         if(r==CCREG) printf(" CC");
10930         else if(r==HIREG) printf(" HI");
10931         else if(r==LOREG) printf(" LO");
10932         else printf(" r%d",r);
10933       }
10934     }
10935 #endif
10936     printf("\n");
10937     #if defined(__i386__) || defined(__x86_64__)
10938     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10939     #endif
10940     #ifdef __arm__
10941     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10942     #endif
10943     printf("needs: ");
10944     if(needed_reg[i]&1) printf("eax ");
10945     if((needed_reg[i]>>1)&1) printf("ecx ");
10946     if((needed_reg[i]>>2)&1) printf("edx ");
10947     if((needed_reg[i]>>3)&1) printf("ebx ");
10948     if((needed_reg[i]>>5)&1) printf("ebp ");
10949     if((needed_reg[i]>>6)&1) printf("esi ");
10950     if((needed_reg[i]>>7)&1) printf("edi ");
10951     printf("r:");
10952     for(r=0;r<=CCREG;r++) {
10953       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10954       if((requires_32bit[i]>>r)&1) {
10955         if(r==CCREG) printf(" CC");
10956         else if(r==HIREG) printf(" HI");
10957         else if(r==LOREG) printf(" LO");
10958         else printf(" r%d",r);
10959       }
10960     }
10961     printf("\n");
10962     /*printf("pr:");
10963     for(r=0;r<=CCREG;r++) {
10964       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10965       if((pr32[i]>>r)&1) {
10966         if(r==CCREG) printf(" CC");
10967         else if(r==HIREG) printf(" HI");
10968         else if(r==LOREG) printf(" LO");
10969         else printf(" r%d",r);
10970       }
10971     }
10972     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10973     printf("\n");*/
10974     #if defined(__i386__) || defined(__x86_64__)
10975     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10976     printf("dirty: ");
10977     if(regs[i].wasdirty&1) printf("eax ");
10978     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10979     if((regs[i].wasdirty>>2)&1) printf("edx ");
10980     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10981     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10982     if((regs[i].wasdirty>>6)&1) printf("esi ");
10983     if((regs[i].wasdirty>>7)&1) printf("edi ");
10984     #endif
10985     #ifdef __arm__
10986     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10987     printf("dirty: ");
10988     if(regs[i].wasdirty&1) printf("r0 ");
10989     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10990     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10991     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10992     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10993     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10994     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10995     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10996     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10997     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10998     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10999     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11000     #endif
11001     printf("\n");
11002     disassemble_inst(i);
11003     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11004     #if defined(__i386__) || defined(__x86_64__)
11005     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11006     if(regs[i].dirty&1) printf("eax ");
11007     if((regs[i].dirty>>1)&1) printf("ecx ");
11008     if((regs[i].dirty>>2)&1) printf("edx ");
11009     if((regs[i].dirty>>3)&1) printf("ebx ");
11010     if((regs[i].dirty>>5)&1) printf("ebp ");
11011     if((regs[i].dirty>>6)&1) printf("esi ");
11012     if((regs[i].dirty>>7)&1) printf("edi ");
11013     #endif
11014     #ifdef __arm__
11015     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11016     if(regs[i].dirty&1) printf("r0 ");
11017     if((regs[i].dirty>>1)&1) printf("r1 ");
11018     if((regs[i].dirty>>2)&1) printf("r2 ");
11019     if((regs[i].dirty>>3)&1) printf("r3 ");
11020     if((regs[i].dirty>>4)&1) printf("r4 ");
11021     if((regs[i].dirty>>5)&1) printf("r5 ");
11022     if((regs[i].dirty>>6)&1) printf("r6 ");
11023     if((regs[i].dirty>>7)&1) printf("r7 ");
11024     if((regs[i].dirty>>8)&1) printf("r8 ");
11025     if((regs[i].dirty>>9)&1) printf("r9 ");
11026     if((regs[i].dirty>>10)&1) printf("r10 ");
11027     if((regs[i].dirty>>12)&1) printf("r12 ");
11028     #endif
11029     printf("\n");
11030     if(regs[i].isconst) {
11031       printf("constants: ");
11032       #if defined(__i386__) || defined(__x86_64__)
11033       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11034       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11035       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11036       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11037       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11038       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11039       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11040       #endif
11041       #ifdef __arm__
11042       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11043       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11044       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11045       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11046       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11047       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11048       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11049       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11050       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11051       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11052       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11053       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11054       #endif
11055       printf("\n");
11056     }
11057 #ifndef FORCE32
11058     printf(" 32:");
11059     for(r=0;r<=CCREG;r++) {
11060       if((regs[i].is32>>r)&1) {
11061         if(r==CCREG) printf(" CC");
11062         else if(r==HIREG) printf(" HI");
11063         else if(r==LOREG) printf(" LO");
11064         else printf(" r%d",r);
11065       }
11066     }
11067     printf("\n");
11068 #endif
11069     /*printf(" p32:");
11070     for(r=0;r<=CCREG;r++) {
11071       if((p32[i]>>r)&1) {
11072         if(r==CCREG) printf(" CC");
11073         else if(r==HIREG) printf(" HI");
11074         else if(r==LOREG) printf(" LO");
11075         else printf(" r%d",r);
11076       }
11077     }
11078     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11079     else printf("\n");*/
11080     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11081       #if defined(__i386__) || defined(__x86_64__)
11082       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11083       if(branch_regs[i].dirty&1) printf("eax ");
11084       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11085       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11086       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11087       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11088       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11089       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11090       #endif
11091       #ifdef __arm__
11092       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11093       if(branch_regs[i].dirty&1) printf("r0 ");
11094       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11095       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11096       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11097       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11098       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11099       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11100       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11101       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11102       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11103       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11104       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11105       #endif
11106 #ifndef FORCE32
11107       printf(" 32:");
11108       for(r=0;r<=CCREG;r++) {
11109         if((branch_regs[i].is32>>r)&1) {
11110           if(r==CCREG) printf(" CC");
11111           else if(r==HIREG) printf(" HI");
11112           else if(r==LOREG) printf(" LO");
11113           else printf(" r%d",r);
11114         }
11115       }
11116       printf("\n");
11117 #endif
11118     }
11119   }
11120 #endif // DISASM
11121
11122   /* Pass 8 - Assembly */
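        // For each instruction: write back registers whose mapping changes
        // (wb_invalidate), record instr_addr[i] as a branch-target entry point,
        // load the input registers and constants, then dispatch on itype to the
        // per-instruction assembler.  Branch types set ds=1 so the following
        // delay slot is emitted by the branch assembler and skipped here.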
11123   linkcount=0;stubcount=0;
11124   ds=0;is_delayslot=0;
11125   cop1_usable=0;
11126   uint64_t is32_pre=0;
11127   u_int dirty_pre=0;
11128   u_int beginning=(u_int)out;
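        // A requested address with bit 0 set seems to mean "start in a branch
        // delay slot" (a branch whose slot falls on the next page); pagespan_ds()
        // emits the special entry for that case.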
11129   if((u_int)addr&1) {
11130     ds=1;
11131     pagespan_ds();
11132   }
11133   u_int instr_addr0_override=0;
11134
11135 #ifdef PCSX
11136   if (start == 0x80030000) {
11137     // nasty hack for fastbios thing
11138     // override block entry to this code
11139     instr_addr0_override=(u_int)out;
11140     emit_movimm(start,0);
11141     // abuse io address var as a flag that we
11142     // have already returned here once
11143     emit_readword((int)&address,1);
11144     emit_writeword(0,(int)&pcaddr);
11145     emit_writeword(0,(int)&address);
11146     emit_cmp(0,1);
11147     emit_jne((int)new_dyna_leave);
11148   }
11149 #endif
11150   for(i=0;i<slen;i++)
11151   {
11152     //if(ds) printf("ds: ");
11153     disassemble_inst(i);
11154     if(ds) {
11155       ds=0; // Skip delay slot
11156       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11157       instr_addr[i]=0;
11158     } else {
11159       speculate_register_values(i);
11160       #ifndef DESTRUCTIVE_WRITEBACK
11161       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11162       {
11163         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11164               unneeded_reg[i],unneeded_reg_upper[i]);
11165         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11166               unneeded_reg[i],unneeded_reg_upper[i]);
11167       }
11168       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11169         is32_pre=branch_regs[i].is32;
11170         dirty_pre=branch_regs[i].dirty;
11171       }else{
11172         is32_pre=regs[i].is32;
11173         dirty_pre=regs[i].dirty;
11174       }
11175       #endif
11176       // write back
11177       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11178       {
11179         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11180                       unneeded_reg[i],unneeded_reg_upper[i]);
11181         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11182       }
11183       // branch target entry point
11184       instr_addr[i]=(u_int)out;
11185       assem_debug("<->\n");
11186       // load regs
11187       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11188         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11189       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11190       address_generation(i,&regs[i],regs[i].regmap_entry);
11191       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11192       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11193       {
11194         // Load the delay slot registers if necessary
11195         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11196           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11197         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11198           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11199         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11200           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11201       }
11202       else if(i+1<slen)
11203       {
11204         // Preload registers for following instruction
11205         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11206           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11207             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11208         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11209           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11210             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11211       }
11212       // TODO: if(is_ooo(i)) address_generation(i+1);
11213       if(itype[i]==CJUMP||itype[i]==FJUMP)
11214         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11215       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11216         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11217       if(bt[i]) cop1_usable=0;
11218       // assemble
11219       switch(itype[i]) {
11220         case ALU:
11221           alu_assemble(i,&regs[i]);break;
11222         case IMM16:
11223           imm16_assemble(i,&regs[i]);break;
11224         case SHIFT:
11225           shift_assemble(i,&regs[i]);break;
11226         case SHIFTIMM:
11227           shiftimm_assemble(i,&regs[i]);break;
11228         case LOAD:
11229           load_assemble(i,&regs[i]);break;
11230         case LOADLR:
11231           loadlr_assemble(i,&regs[i]);break;
11232         case STORE:
11233           store_assemble(i,&regs[i]);break;
11234         case STORELR:
11235           storelr_assemble(i,&regs[i]);break;
11236         case COP0:
11237           cop0_assemble(i,&regs[i]);break;
11238         case COP1:
11239           cop1_assemble(i,&regs[i]);break;
11240         case C1LS:
11241           c1ls_assemble(i,&regs[i]);break;
11242         case COP2:
11243           cop2_assemble(i,&regs[i]);break;
11244         case C2LS:
11245           c2ls_assemble(i,&regs[i]);break;
11246         case C2OP:
11247           c2op_assemble(i,&regs[i]);break;
11248         case FCONV:
11249           fconv_assemble(i,&regs[i]);break;
11250         case FLOAT:
11251           float_assemble(i,&regs[i]);break;
11252         case FCOMP:
11253           fcomp_assemble(i,&regs[i]);break;
11254         case MULTDIV:
11255           multdiv_assemble(i,&regs[i]);break;
11256         case MOV:
11257           mov_assemble(i,&regs[i]);break;
11258         case SYSCALL:
11259           syscall_assemble(i,&regs[i]);break;
11260         case HLECALL:
11261           hlecall_assemble(i,&regs[i]);break;
11262         case INTCALL:
11263           intcall_assemble(i,&regs[i]);break;
11264         case UJUMP:
11265           ujump_assemble(i,&regs[i]);ds=1;break;
11266         case RJUMP:
11267           rjump_assemble(i,&regs[i]);ds=1;break;
11268         case CJUMP:
11269           cjump_assemble(i,&regs[i]);ds=1;break;
11270         case SJUMP:
11271           sjump_assemble(i,&regs[i]);ds=1;break;
11272         case FJUMP:
11273           fjump_assemble(i,&regs[i]);ds=1;break;
11274         case SPAN:
11275           pagespan_assemble(i,&regs[i]);break;
11276       }
11277       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11278         literal_pool(1024);
11279       else
11280         literal_pool_jumpover(256);
11281     }
11282   }
11283   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11284   // If the block did not end with an unconditional branch,
11285   // add a jump to the next instruction.
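        // The cycle counter (CCREG) is topped up with the cycles spent in the
        // block (CLOCK_DIVIDER*(ccadj+1)) and the jump goes through
        // add_to_linker() so Pass 9 can patch it like any other branch.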
11286   if(i>1) {
11287     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11288       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11289       assert(i==slen);
11290       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11291         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11292         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11293           emit_loadreg(CCREG,HOST_CCREG);
11294         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11295       }
11296       else if(!likely[i-2])
11297       {
11298         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11299         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11300       }
11301       else
11302       {
11303         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11304         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11305       }
11306       add_to_linker((int)out,start+i*4,0);
11307       emit_jmp(0);
11308     }
11309   }
11310   else
11311   {
11312     assert(i>0);
11313     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11314     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11315     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11316       emit_loadreg(CCREG,HOST_CCREG);
11317     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
11318     add_to_linker((int)out,start+i*4,0);
11319     emit_jmp(0);
11320   }
11321
11322   // TODO: delay slot stubs?
11323   // Stubs
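        // Stubs are the out-of-line slow paths emitted after the block body:
        // memory access handlers (do_readstub/do_writestub), cycle count /
        // interrupt checks (do_ccstub), self-modifying-code checks (do_invstub),
        // FPU-unusable exceptions (do_cop1stub) and unaligned stores.  Keeping
        // them out of line keeps the fast path free of rarely-taken branches.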
11324   for(i=0;i<stubcount;i++)
11325   {
11326     switch(stubs[i][0])
11327     {
11328       case LOADB_STUB:
11329       case LOADH_STUB:
11330       case LOADW_STUB:
11331       case LOADD_STUB:
11332       case LOADBU_STUB:
11333       case LOADHU_STUB:
11334         do_readstub(i);break;
11335       case STOREB_STUB:
11336       case STOREH_STUB:
11337       case STOREW_STUB:
11338       case STORED_STUB:
11339         do_writestub(i);break;
11340       case CC_STUB:
11341         do_ccstub(i);break;
11342       case INVCODE_STUB:
11343         do_invstub(i);break;
11344       case FP_STUB:
11345         do_cop1stub(i);break;
11346       case STORELR_STUB:
11347         do_unalignedwritestub(i);break;
11348     }
11349   }
11350
11351   if (instr_addr0_override)
11352     instr_addr[0] = instr_addr0_override;
11353
11354   /* Pass 9 - Linker */
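        // Branches recorded by add_to_linker() are resolved here.  Targets inside
        // this block are patched directly to instr_addr[].  External targets get
        // an extjump stub; if check_addr() finds already-compiled code the branch
        // is pointed straight at it and the stub is registered via add_link()
        // (jump_out) so the link can be undone when that block is invalidated.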
11355   for(i=0;i<linkcount;i++)
11356   {
11357     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11358     literal_pool(64);
11359     if(!link_addr[i][2])
11360     {
11361       void *stub=out;
11362       void *addr=check_addr(link_addr[i][1]);
11363       emit_extjump(link_addr[i][0],link_addr[i][1]);
11364       if(addr) {
11365         set_jump_target(link_addr[i][0],(int)addr);
11366         add_link(link_addr[i][1],stub);
11367       }
11368       else set_jump_target(link_addr[i][0],(int)stub);
11369     }
11370     else
11371     {
11372       // Internal branch
11373       int target=(link_addr[i][1]-start)>>2;
11374       assert(target>=0&&target<slen);
11375       assert(instr_addr[target]);
11376       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11377       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11378       //#else
11379       set_jump_target(link_addr[i][0],instr_addr[target]);
11380       //#endif
11381     }
11382   }
11383   // External Branch Targets (jump_in)
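        // Every branch target (bt[i]) plus the block start becomes an externally
        // reachable entry: it goes through a dirty-check stub (do_dirty_stub),
        // recorded in jump_dirty and jump_in (or their 32-bit-tagged ll_add_32
        // variants when requires_32bit[i] is set), which verifies the source
        // hasn't changed before jumping into the block.  Each hash_table bin
        // caches two entries:
        //   ht_bin[0]=vaddr A  ht_bin[1]=code ptr A  ht_bin[2]=vaddr B  ht_bin[3]=code ptr B
        // and the lookup elsewhere in this file is roughly (sketch, not verbatim):
        //   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        //   if(ht_bin[0]==vaddr) jump to ht_bin[1]; else if(ht_bin[2]==vaddr) jump to ht_bin[3];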
11384   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11385   for(i=0;i<slen;i++)
11386   {
11387     if(bt[i]||i==0)
11388     {
11389       if(instr_addr[i]) // TODO - delay slots (=null)
11390       {
11391         u_int vaddr=start+i*4;
11392         u_int page=get_page(vaddr);
11393         u_int vpage=get_vpage(vaddr);
11394         literal_pool(256);
11395         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11396 #ifndef FORCE32
11397         if(!requires_32bit[i])
11398 #else
11399         if(1)
11400 #endif
11401         {
11402           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11403           assem_debug("jump_in: %x\n",start+i*4);
11404           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11405           int entry_point=do_dirty_stub(i);
11406           ll_add(jump_in+page,vaddr,(void *)entry_point);
11407           // If there was an existing entry in the hash table,
11408           // replace it with the new address.
11409           // Don't add new entries.  We'll insert the
11410           // ones that actually get used in check_addr().
11411           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11412           if(ht_bin[0]==vaddr) {
11413             ht_bin[1]=entry_point;
11414           }
11415           if(ht_bin[2]==vaddr) {
11416             ht_bin[3]=entry_point;
11417           }
11418         }
11419         else
11420         {
11421           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11422           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11423           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11424           //int entry_point=(int)out;
11425           ////assem_debug("entry_point: %x\n",entry_point);
11426           //load_regs_entry(i);
11427           //if(entry_point==(int)out)
11428           //  entry_point=instr_addr[i];
11429           //else
11430           //  emit_jmp(instr_addr[i]);
11431           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11432           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11433           int entry_point=do_dirty_stub(i);
11434           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11435         }
11436       }
11437     }
11438   }
11439   // Write out the literal pool if necessary
11440   literal_pool(0);
11441   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11442   // Align code
11443   if(((u_int)out)&7) emit_addnop(13);
11444   #endif
11445   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11446   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11447   memcpy(copy,source,slen*4);
11448   copy+=slen*4;
11449   
11450   #ifdef __arm__
11451   __clear_cache((void *)beginning,out);
11452   #endif
11453   
11454   // If we're within 256K of the end of the buffer,
11455   // start over from the beginning. (Is 256K enough?)
11456   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11457   
11458   // Trap writes to any of the pages we compiled
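        // invalid_code[page]==0 marks the page as covered by compiled code; the
        // 0x40000000 bit in memory_map presumably diverts stores to these pages
        // onto a slow path that can invalidate the stale translation (the PCSX
        // build instead relies on invalid_code and the RAM mirror marking below).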
11459   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11460     invalid_code[i]=0;
11461 #ifndef DISABLE_TLB
11462     memory_map[i]|=0x40000000;
11463     if((signed int)start>=(signed int)0xC0000000) {
11464       assert(using_tlb);
11465       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11466       invalid_code[j]=0;
11467       memory_map[j]|=0x40000000;
11468       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11469     }
11470 #endif
11471   }
11472   inv_code_start=inv_code_end=~0;
11473 #ifdef PCSX
11474   // for PCSX we need to mark all mirrors too
11475   if(get_page(start)<(RAM_SIZE>>12))
11476     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11477       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11478       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11479       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11480 #endif
11481   
11482   /* Pass 10 - Free memory by expiring oldest blocks */
11483   
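        // The translation cache is treated as a ring.  expirep is a 16-bit cursor:
        // bits 13-15 pick one of 8 equal cache regions, bits 11-12 pick which
        // lookup structure to clear, bits 0-10 the page index.  'end' keeps the
        // cursor roughly a quarter of the cache ahead of the current output
        // pointer, so lookup entries are dropped before their code is overwritten.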
11484   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11485   while(expirep!=end)
11486   {
11487     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11488     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11489     inv_debug("EXP: Phase %d\n",expirep);
11490     switch((expirep>>11)&3)
11491     {
11492       case 0:
11493         // Clear jump_in and jump_dirty
11494         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11495         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11496         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11497         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11498         break;
11499       case 1:
11500         // Clear pointers
11501         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11502         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11503         break;
11504       case 2:
11505         // Clear hash table
11506         for(i=0;i<32;i++) {
11507           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11508           if((ht_bin[3]>>shift)==(base>>shift) ||
11509              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11510             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11511             ht_bin[2]=ht_bin[3]=-1;
11512           }
11513           if((ht_bin[1]>>shift)==(base>>shift) ||
11514              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11515             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11516             ht_bin[0]=ht_bin[2];
11517             ht_bin[1]=ht_bin[3];
11518             ht_bin[2]=ht_bin[3]=-1;
11519           }
11520         }
11521         break;
11522       case 3:
11523         // Clear jump_out
11524         #ifdef __arm__
11525         if((expirep&2047)==0) 
11526           do_clear_cache();
11527         #endif
11528         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11529         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11530         break;
11531     }
11532     expirep=(expirep+1)&65535;
11533   }
11534   return 0;
11535 }
11536
11537 // vim:shiftwidth=2:expandtab