1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <sys/mman.h>
#include <stdio.h>  // printf is used in debug/error paths below
#include <string.h> // memcpy/memset/strncmp are used below (may also be pulled in via emu_if.h)
25
26 #include "emu_if.h" //emulator interface
27
28 //#define DISASM
29 //#define assem_debug printf
30 //#define inv_debug printf
31 #define assem_debug(...)
32 #define inv_debug(...)
33
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43
44 #define MAXBLOCK 4096
45 #define MAX_OUTPUT_BLOCK_SIZE 262144
46
47 struct regstat
48 {
49   signed char regmap_entry[HOST_REGS];
50   signed char regmap[HOST_REGS];
51   uint64_t was32;
52   uint64_t is32;
53   uint64_t wasdirty;
54   uint64_t dirty;
55   uint64_t u;
56   uint64_t uu;
57   u_int wasconst;
58   u_int isconst;
59   u_int loadedconst;             // host regs that have constants loaded
60   u_int waswritten;              // MIPS regs that were used as store base before
61 };
62
63 struct ll_entry
64 {
65   u_int vaddr;
66   u_int reg32;
67   void *addr;
68   struct ll_entry *next;
69 };
70
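  /* Per-block analysis state.  These arrays are indexed by an instruction's
     position within the block currently being compiled (0..slen-1), which is
     why they are all sized MAXBLOCK. */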
71   u_int start;
72   u_int *source;
73   u_int pagelimit;
74   char insn[MAXBLOCK][10];
75   u_char itype[MAXBLOCK];
76   u_char opcode[MAXBLOCK];
77   u_char opcode2[MAXBLOCK];
78   u_char bt[MAXBLOCK];
79   u_char rs1[MAXBLOCK];
80   u_char rs2[MAXBLOCK];
81   u_char rt1[MAXBLOCK];
82   u_char rt2[MAXBLOCK];
83   u_char us1[MAXBLOCK];
84   u_char us2[MAXBLOCK];
85   u_char dep1[MAXBLOCK];
86   u_char dep2[MAXBLOCK];
87   u_char lt1[MAXBLOCK];
88   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
89   static uint64_t gte_rt[MAXBLOCK];
90   static uint64_t gte_unneeded[MAXBLOCK];
91   static u_int smrv[32]; // speculated MIPS register values
92   static u_int smrv_strong; // mask of regs that are likely to have correct values
93   static u_int smrv_weak; // same, but somewhat less likely
94   static u_int smrv_strong_next; // same, but after current insn executes
95   static u_int smrv_weak_next;
96   int imm[MAXBLOCK];
97   u_int ba[MAXBLOCK];
98   char likely[MAXBLOCK];
99   char is_ds[MAXBLOCK];
100   char ooo[MAXBLOCK];
101   uint64_t unneeded_reg[MAXBLOCK];
102   uint64_t unneeded_reg_upper[MAXBLOCK];
103   uint64_t branch_unneeded_reg[MAXBLOCK];
104   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
105   uint64_t p32[MAXBLOCK];
106   uint64_t pr32[MAXBLOCK];
107   signed char regmap_pre[MAXBLOCK][HOST_REGS];
108   static uint64_t current_constmap[HOST_REGS];
109   static uint64_t constmap[MAXBLOCK][HOST_REGS];
110   static struct regstat regs[MAXBLOCK];
111   static struct regstat branch_regs[MAXBLOCK];
112   signed char minimum_free_regs[MAXBLOCK];
113   u_int needed_reg[MAXBLOCK];
114   uint64_t requires_32bit[MAXBLOCK];
115   u_int wont_dirty[MAXBLOCK];
116   u_int will_dirty[MAXBLOCK];
117   int ccadj[MAXBLOCK];
118   int slen;
119   u_int instr_addr[MAXBLOCK];
120   u_int link_addr[MAXBLOCK][3];
121   int linkcount;
122   u_int stubs[MAXBLOCK*3][8];
123   int stubcount;
124   u_int literals[1024][2];
125   int literalcount;
126   int is_delayslot;
127   int cop1_usable;
128   u_char *out;
129   struct ll_entry *jump_in[4096];
130   struct ll_entry *jump_out[4096];
131   struct ll_entry *jump_dirty[4096];
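  /* Each hash_table bin caches two {vaddr, compiled-address} pairs:
     ht_bin[0]/ht_bin[1] hold the most recently used entry and
     ht_bin[2]/ht_bin[3] the older one; get_addr_ht() probes both before
     falling back to the jump_in page lists. */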
132   u_int hash_table[65536][4]  __attribute__((aligned(16)));
133   char shadow[1048576]  __attribute__((aligned(16)));
134   void *copy;
135   int expirep;
136 #ifndef PCSX
137   u_int using_tlb;
138 #else
139   static const u_int using_tlb=0;
140 #endif
141   int new_dynarec_did_compile;
142   int new_dynarec_hacks;
143   u_int stop_after_jal;
144 #ifndef RAM_FIXED
145   static u_int ram_offset;
146 #else
147   static const u_int ram_offset=0;
148 #endif
149   extern u_char restore_candidate[512];
150   extern int cycle_count;
151
152   /* registers that may be allocated */
153   /* 1-31 gpr */
154 #define HIREG 32 // hi
155 #define LOREG 33 // lo
156 #define FSREG 34 // FPU status (FCSR)
157 #define CSREG 35 // Coprocessor status
158 #define CCREG 36 // Cycle count
159 #define INVCP 37 // Pointer to invalid_code
160 #define MMREG 38 // Pointer to memory_map
161 #define ROREG 39 // ram offset (if rdram!=0x80000000)
162 #define TEMPREG 40
163 #define FTEMP 40 // FPU temporary register
164 #define PTEMP 41 // Prefetch temporary register
165 #define TLREG 42 // TLB mapping offset
166 #define RHASH 43 // Return address hash
167 #define RHTBL 44 // Return address hash table address
168 #define RTEMP 45 // JR/JALR address register
169 #define MAXREG 45
170 #define AGEN1 46 // Address generation temporary register
171 #define AGEN2 47 // Address generation temporary register
172 #define MGEN1 48 // Maptable address generation temporary register
173 #define MGEN2 49 // Maptable address generation temporary register
174 #define BTREG 50 // Branch target temporary register
175
176   /* instruction types */
177 #define NOP 0     // No operation
178 #define LOAD 1    // Load
179 #define STORE 2   // Store
180 #define LOADLR 3  // Unaligned load
181 #define STORELR 4 // Unaligned store
182 #define MOV 5     // Move 
183 #define ALU 6     // Arithmetic/logic
184 #define MULTDIV 7 // Multiply/divide
185 #define SHIFT 8   // Shift by register
186 #define SHIFTIMM 9// Shift by immediate
187 #define IMM16 10  // 16-bit immediate
188 #define RJUMP 11  // Unconditional jump to register
189 #define UJUMP 12  // Unconditional jump
190 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
191 #define SJUMP 14  // Conditional branch (regimm format)
192 #define COP0 15   // Coprocessor 0
193 #define COP1 16   // Coprocessor 1
194 #define C1LS 17   // Coprocessor 1 load/store
195 #define FJUMP 18  // Conditional branch (floating point)
196 #define FLOAT 19  // Floating point unit
197 #define FCONV 20  // Convert integer to float
198 #define FCOMP 21  // Floating point compare (sets FSREG)
199 #define SYSCALL 22// SYSCALL
200 #define OTHER 23  // Other
201 #define SPAN 24   // Branch/delay slot spans 2 pages
202 #define NI 25     // Not implemented
203 #define HLECALL 26// PCSX fake opcodes for HLE
204 #define COP2 27   // Coprocessor 2 move
205 #define C2LS 28   // Coprocessor 2 load/store
206 #define C2OP 29   // Coprocessor 2 operation
207 #define INTCALL 30// Call interpreter to handle rare corner cases
208
209   /* stubs */
210 #define CC_STUB 1
211 #define FP_STUB 2
212 #define LOADB_STUB 3
213 #define LOADH_STUB 4
214 #define LOADW_STUB 5
215 #define LOADD_STUB 6
216 #define LOADBU_STUB 7
217 #define LOADHU_STUB 8
218 #define STOREB_STUB 9
219 #define STOREH_STUB 10
220 #define STOREW_STUB 11
221 #define STORED_STUB 12
222 #define STORELR_STUB 13
223 #define INVCODE_STUB 14
224
225   /* branch codes */
226 #define TAKEN 1
227 #define NOTTAKEN 2
228 #define NULLDS 3
229
230 // asm linkage
231 int new_recompile_block(int addr);
232 void *get_addr_ht(u_int vaddr);
233 void invalidate_block(u_int block);
234 void invalidate_addr(u_int addr);
235 void remove_hash(int vaddr);
236 void jump_vaddr();
237 void dyna_linker();
238 void dyna_linker_ds();
239 void verify_code();
240 void verify_code_vm();
241 void verify_code_ds();
242 void cc_interrupt();
243 void fp_exception();
244 void fp_exception_ds();
245 void jump_syscall();
246 void jump_syscall_hle();
247 void jump_eret();
248 void jump_hlecall();
249 void jump_intcall();
250 void new_dyna_leave();
251
252 // TLB
253 void TLBWI_new();
254 void TLBWR_new();
255 void read_nomem_new();
256 void read_nomemb_new();
257 void read_nomemh_new();
258 void read_nomemd_new();
259 void write_nomem_new();
260 void write_nomemb_new();
261 void write_nomemh_new();
262 void write_nomemd_new();
263 void write_rdram_new();
264 void write_rdramb_new();
265 void write_rdramh_new();
266 void write_rdramd_new();
267 extern u_int memory_map[1048576];
268
269 // Needed by assembler
270 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
271 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
272 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
273 void load_all_regs(signed char i_regmap[]);
274 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
275 void load_regs_entry(int t);
276 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
277
278 int tracedebug=0;
279
280 //#define DEBUG_CYCLE_COUNT 1
281
282 int cycle_multiplier; // 100 for 1.0
283
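// Scale a cycle count by cycle_multiplier/100, rounding to the nearest
// integer (ties away from zero): s is +1 for x>=0 and -1 for x<0, so with
// cycle_multiplier=150, CLOCK_ADJUST(7) = (1050+50)/100 = 11 and
// CLOCK_ADJUST(-7) = (-1050-50)/100 = -11.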
284 static int CLOCK_ADJUST(int x)
285 {
286   int s=(x>>31)|1;
287   return (x * cycle_multiplier + s * 50) / 100;
288 }
289
290 static void tlb_hacks()
291 {
292 #ifndef DISABLE_TLB
293   // Goldeneye hack
294   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
295   {
296     u_int addr;
297     int n;
298     switch (ROM_HEADER->Country_code&0xFF) 
299     {
300       case 0x45: // U
301         addr=0x34b30;
302         break;                   
303       case 0x4A: // J 
304         addr=0x34b70;    
305         break;    
306       case 0x50: // E 
307         addr=0x329f0;
308         break;                        
309       default: 
310         // Unknown country code
311         addr=0;
312         break;
313     }
314     u_int rom_addr=(u_int)rom;
315     #ifdef ROM_COPY
316     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
317     // in the lower 4G of memory to use this hack.  Copy it if necessary.
318     if((void *)rom>(void *)0xffffffff) {
319       munmap(ROM_COPY, 67108864);
320       if(mmap(ROM_COPY, 12582912,
321               PROT_READ | PROT_WRITE,
322               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
323               -1, 0) <= 0) {printf("mmap() failed\n");}
324       memcpy(ROM_COPY,rom,12582912);
325       rom_addr=(u_int)ROM_COPY;
326     }
327     #endif
328     if(addr) {
329       for(n=0x7F000;n<0x80000;n++) {
330         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
331       }
332     }
333   }
334 #endif
335 }
336
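/* Map a guest address to an index into the jump_in/jump_out/jump_dirty page
   lists.  For PCSX the KSEG bits are stripped and the 2MB RAM mirrors are
   collapsed, so RAM ends up in pages 0-2047 while everything else (BIOS,
   I/O, etc.) is folded into pages 2048-4095. */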
337 static u_int get_page(u_int vaddr)
338 {
339 #ifndef PCSX
340   u_int page=(vaddr^0x80000000)>>12;
341 #else
342   u_int page=vaddr&~0xe0000000;
343   if (page < 0x1000000)
344     page &= ~0x0e00000; // RAM mirrors
345   page>>=12;
346 #endif
347 #ifndef DISABLE_TLB
348   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
349 #endif
350   if(page>2048) page=2048+(page&2047);
351   return page;
352 }
353
354 #ifndef PCSX
355 static u_int get_vpage(u_int vaddr)
356 {
357   u_int vpage=(vaddr^0x80000000)>>12;
358 #ifndef DISABLE_TLB
359   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
360 #endif
361   if(vpage>2048) vpage=2048+(vpage&2047);
362   return vpage;
363 }
364 #else
365 // no virtual mem in PCSX
366 static u_int get_vpage(u_int vaddr)
367 {
368   return get_page(vaddr);
369 }
370 #endif
371
372 // Get address from virtual address
373 // This is called from the recompiled JR/JALR instructions
374 void *get_addr(u_int vaddr)
375 {
376   u_int page=get_page(vaddr);
377   u_int vpage=get_vpage(vaddr);
378   struct ll_entry *head;
379   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
380   head=jump_in[page];
381   while(head!=NULL) {
382     if(head->vaddr==vaddr&&head->reg32==0) {
383   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
384       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
385       ht_bin[3]=ht_bin[1];
386       ht_bin[2]=ht_bin[0];
387       ht_bin[1]=(int)head->addr;
388       ht_bin[0]=vaddr;
389       return head->addr;
390     }
391     head=head->next;
392   }
393   head=jump_dirty[vpage];
394   while(head!=NULL) {
395     if(head->vaddr==vaddr&&head->reg32==0) {
396       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
397       // Don't restore blocks which are about to expire from the cache
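      // The shift reduces (head->addr - out) modulo the translation cache
      // size: the block must be well ahead of the output pointer 'out'
      // (more than MAX_OUTPUT_BLOCK_SIZE plus a safety margin), otherwise
      // the expiry logic is about to reclaim it.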
398       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
399       if(verify_dirty(head->addr)) {
400         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
401         invalid_code[vaddr>>12]=0;
402         inv_code_start=inv_code_end=~0;
403 #ifndef DISABLE_TLB
404         memory_map[vaddr>>12]|=0x40000000;
405 #endif
406         if(vpage<2048) {
407 #ifndef DISABLE_TLB
408           if(tlb_LUT_r[vaddr>>12]) {
409             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
410             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
411           }
412 #endif
413           restore_candidate[vpage>>3]|=1<<(vpage&7);
414         }
415         else restore_candidate[page>>3]|=1<<(page&7);
416         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
417         if(ht_bin[0]==vaddr) {
418           ht_bin[1]=(int)head->addr; // Replace existing entry
419         }
420         else
421         {
422           ht_bin[3]=ht_bin[1];
423           ht_bin[2]=ht_bin[0];
424           ht_bin[1]=(int)head->addr;
425           ht_bin[0]=vaddr;
426         }
427         return head->addr;
428       }
429     }
430     head=head->next;
431   }
432   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
433   int r=new_recompile_block(vaddr);
434   if(r==0) return get_addr(vaddr);
435   // Execute in unmapped page, generate page fault exception
436   Status|=2;
437   Cause=(vaddr<<31)|0x8;
438   EPC=(vaddr&1)?vaddr-5:vaddr;
439   BadVAddr=(vaddr&~1);
440   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
441   EntryHi=BadVAddr&0xFFFFE000;
442   return get_addr_ht(0x80000000);
443 }
444 // Look up address in hash table first
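// The bin index folds the upper halfword into the lower one:
// ((vaddr>>16) ^ vaddr) & 0xFFFF.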
445 void *get_addr_ht(u_int vaddr)
446 {
447   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
448   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
449   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
450   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
451   return get_addr(vaddr);
452 }
453
454 void *get_addr_32(u_int vaddr,u_int flags)
455 {
456 #ifdef FORCE32
457   return get_addr(vaddr);
458 #else
459   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
460   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
461   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
462   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
463   u_int page=get_page(vaddr);
464   u_int vpage=get_vpage(vaddr);
465   struct ll_entry *head;
466   head=jump_in[page];
467   while(head!=NULL) {
468     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
469       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
470       if(head->reg32==0) {
471         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
472         if(ht_bin[0]==-1) {
473           ht_bin[1]=(int)head->addr;
474           ht_bin[0]=vaddr;
475         }else if(ht_bin[2]==-1) {
476           ht_bin[3]=(int)head->addr;
477           ht_bin[2]=vaddr;
478         }
479         //ht_bin[3]=ht_bin[1];
480         //ht_bin[2]=ht_bin[0];
481         //ht_bin[1]=(int)head->addr;
482         //ht_bin[0]=vaddr;
483       }
484       return head->addr;
485     }
486     head=head->next;
487   }
488   head=jump_dirty[vpage];
489   while(head!=NULL) {
490     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
491       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
492       // Don't restore blocks which are about to expire from the cache
493       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
494       if(verify_dirty(head->addr)) {
495         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
496         invalid_code[vaddr>>12]=0;
497         inv_code_start=inv_code_end=~0;
498         memory_map[vaddr>>12]|=0x40000000;
499         if(vpage<2048) {
500 #ifndef DISABLE_TLB
501           if(tlb_LUT_r[vaddr>>12]) {
502             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
503             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
504           }
505 #endif
506           restore_candidate[vpage>>3]|=1<<(vpage&7);
507         }
508         else restore_candidate[page>>3]|=1<<(page&7);
509         if(head->reg32==0) {
510           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
511           if(ht_bin[0]==-1) {
512             ht_bin[1]=(int)head->addr;
513             ht_bin[0]=vaddr;
514           }else if(ht_bin[2]==-1) {
515             ht_bin[3]=(int)head->addr;
516             ht_bin[2]=vaddr;
517           }
518           //ht_bin[3]=ht_bin[1];
519           //ht_bin[2]=ht_bin[0];
520           //ht_bin[1]=(int)head->addr;
521           //ht_bin[0]=vaddr;
522         }
523         return head->addr;
524       }
525     }
526     head=head->next;
527   }
528   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
529   int r=new_recompile_block(vaddr);
530   if(r==0) return get_addr(vaddr);
531   // Execute in unmapped page, generate page fault exception
532   Status|=2;
533   Cause=(vaddr<<31)|0x8;
534   EPC=(vaddr&1)?vaddr-5:vaddr;
535   BadVAddr=(vaddr&~1);
536   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
537   EntryHi=BadVAddr&0xFFFFE000;
538   return get_addr_ht(0x80000000);
539 #endif
540 }
541
542 void clear_all_regs(signed char regmap[])
543 {
544   int hr;
545   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
546 }
547
548 signed char get_reg(signed char regmap[],int r)
549 {
550   int hr;
551   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
552   return -1;
553 }
554
555 // Find a register that is available for two consecutive cycles
556 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
557 {
558   int hr;
559   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
560   return -1;
561 }
562
563 int count_free_regs(signed char regmap[])
564 {
565   int count=0;
566   int hr;
567   for(hr=0;hr<HOST_REGS;hr++)
568   {
569     if(hr!=EXCLUDE_REG) {
570       if(regmap[hr]<0) count++;
571     }
572   }
573   return count;
574 }
575
576 void dirty_reg(struct regstat *cur,signed char reg)
577 {
578   int hr;
579   if(!reg) return;
580   for (hr=0;hr<HOST_REGS;hr++) {
581     if((cur->regmap[hr]&63)==reg) {
582       cur->dirty|=1<<hr;
583     }
584   }
585 }
586
587 // If we dirty the lower half of a 64 bit register which is now being
588 // sign-extended, we need to dump the upper half.
589 // Note: Do this only after completion of the instruction, because
590 // some instructions may need to read the full 64-bit value even if
591 // overwriting it (eg SLTI, DSRA32).
592 static void flush_dirty_uppers(struct regstat *cur)
593 {
594   int hr,reg;
595   for (hr=0;hr<HOST_REGS;hr++) {
596     if((cur->dirty>>hr)&1) {
597       reg=cur->regmap[hr];
598       if(reg>=64) 
599         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
600     }
601   }
602 }
603
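// Constant propagation: a set bit in isconst marks a host register whose
// value is known at compile time, with the value kept in current_constmap[].
// A regmap entry of (r|64) holds the upper 32 bits of 64-bit MIPS register r,
// which is why set_const() stores value>>32 for such entries.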
604 void set_const(struct regstat *cur,signed char reg,uint64_t value)
605 {
606   int hr;
607   if(!reg) return;
608   for (hr=0;hr<HOST_REGS;hr++) {
609     if(cur->regmap[hr]==reg) {
610       cur->isconst|=1<<hr;
611       current_constmap[hr]=value;
612     }
613     else if((cur->regmap[hr]^64)==reg) {
614       cur->isconst|=1<<hr;
615       current_constmap[hr]=value>>32;
616     }
617   }
618 }
619
620 void clear_const(struct regstat *cur,signed char reg)
621 {
622   int hr;
623   if(!reg) return;
624   for (hr=0;hr<HOST_REGS;hr++) {
625     if((cur->regmap[hr]&63)==reg) {
626       cur->isconst&=~(1<<hr);
627     }
628   }
629 }
630
631 int is_const(struct regstat *cur,signed char reg)
632 {
633   int hr;
634   if(reg<0) return 0;
635   if(!reg) return 1;
636   for (hr=0;hr<HOST_REGS;hr++) {
637     if((cur->regmap[hr]&63)==reg) {
638       return (cur->isconst>>hr)&1;
639     }
640   }
641   return 0;
642 }
643 uint64_t get_const(struct regstat *cur,signed char reg)
644 {
645   int hr;
646   if(!reg) return 0;
647   for (hr=0;hr<HOST_REGS;hr++) {
648     if(cur->regmap[hr]==reg) {
649       return current_constmap[hr];
650     }
651   }
652   printf("Unknown constant in r%d\n",reg);
653   exit(1);
654 }
655
656 // Least soon needed registers
657 // Look at the next ten instructions and see which registers
658 // will be used.  Try not to reallocate these.
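// On return, hsn[r] is the distance in instructions to the next use of
// register r; smaller values mean the register is needed sooner and should
// not be reallocated.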
659 void lsn(u_char hsn[], int i, int *preferred_reg)
660 {
661   int j;
662   int b=-1;
663   for(j=0;j<9;j++)
664   {
665     if(i+j>=slen) {
666       j=slen-i-1;
667       break;
668     }
669     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
670     {
671       // Don't go past an unconditional jump
672       j++;
673       break;
674     }
675   }
676   for(;j>=0;j--)
677   {
678     if(rs1[i+j]) hsn[rs1[i+j]]=j;
679     if(rs2[i+j]) hsn[rs2[i+j]]=j;
680     if(rt1[i+j]) hsn[rt1[i+j]]=j;
681     if(rt2[i+j]) hsn[rt2[i+j]]=j;
682     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
683       // Stores can allocate zero
684       hsn[rs1[i+j]]=j;
685       hsn[rs2[i+j]]=j;
686     }
687     // On some architectures stores need invc_ptr
688     #if defined(HOST_IMM8)
689     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
690       hsn[INVCP]=j;
691     }
692     #endif
693     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
694     {
695       hsn[CCREG]=j;
696       b=j;
697     }
698   }
699   if(b>=0)
700   {
701     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
702     {
703       // Follow first branch
704       int t=(ba[i+b]-start)>>2;
705       j=7-b;if(t+j>=slen) j=slen-t-1;
706       for(;j>=0;j--)
707       {
708         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
709         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
710         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
711         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
712       }
713     }
714     // TODO: preferred register based on backward branch
715   }
716   // Delay slot should preferably not overwrite branch conditions or cycle count
717   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
718     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
719     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
720     hsn[CCREG]=1;
721     // ...or hash tables
722     hsn[RHASH]=1;
723     hsn[RHTBL]=1;
724   }
725   // Coprocessor load/store needs FTEMP, even if not declared
726   if(itype[i]==C1LS||itype[i]==C2LS) {
727     hsn[FTEMP]=0;
728   }
729   // Load L/R also uses FTEMP as a temporary register
730   if(itype[i]==LOADLR) {
731     hsn[FTEMP]=0;
732   }
733   // Also SWL/SWR/SDL/SDR
734   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
735     hsn[FTEMP]=0;
736   }
737   // Don't remove the TLB registers either
738   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
739     hsn[TLREG]=0;
740   }
741   // Don't remove the miniht registers
742   if(itype[i]==UJUMP||itype[i]==RJUMP)
743   {
744     hsn[RHASH]=0;
745     hsn[RHTBL]=0;
746   }
747 }
748
749 // We only want to allocate registers if we're going to use them again soon
750 int needed_again(int r, int i)
751 {
752   int j;
753   int b=-1;
754   int rn=10;
755   
756   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
757   {
758     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
759       return 0; // Don't need any registers if exiting the block
760   }
761   for(j=0;j<9;j++)
762   {
763     if(i+j>=slen) {
764       j=slen-i-1;
765       break;
766     }
767     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
768     {
769       // Don't go past an unconditional jump
770       j++;
771       break;
772     }
773     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
774     {
775       break;
776     }
777   }
778   for(;j>=1;j--)
779   {
780     if(rs1[i+j]==r) rn=j;
781     if(rs2[i+j]==r) rn=j;
782     if((unneeded_reg[i+j]>>r)&1) rn=10;
783     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
784     {
785       b=j;
786     }
787   }
788   /*
789   if(b>=0)
790   {
791     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
792     {
793       // Follow first branch
794       int o=rn;
795       int t=(ba[i+b]-start)>>2;
796       j=7-b;if(t+j>=slen) j=slen-t-1;
797       for(;j>=0;j--)
798       {
799         if(!((unneeded_reg[t+j]>>r)&1)) {
800           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
801           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
802         }
803         else rn=o;
804       }
805     }
806   }*/
807   if(rn<10) return 1;
808   return 0;
809 }
810
811 // Try to match register allocations at the end of a loop with those
812 // at the beginning
813 int loop_reg(int i, int r, int hr)
814 {
815   int j,k;
816   for(j=0;j<9;j++)
817   {
818     if(i+j>=slen) {
819       j=slen-i-1;
820       break;
821     }
822     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
823     {
824       // Don't go past an unconditional jump
825       j++;
826       break;
827     }
828   }
829   k=0;
830   if(i>0){
831     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
832       k--;
833   }
834   for(;k<j;k++)
835   {
836     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
837     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
838     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
839     {
840       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
841       {
842         int t=(ba[i+k]-start)>>2;
843         int reg=get_reg(regs[t].regmap_entry,r);
844         if(reg>=0) return reg;
845         //reg=get_reg(regs[t+1].regmap_entry,r);
846         //if(reg>=0) return reg;
847       }
848     }
849   }
850   return hr;
851 }
852
853
854 // Allocate every register, preserving source/target regs
855 void alloc_all(struct regstat *cur,int i)
856 {
857   int hr;
858   
859   for(hr=0;hr<HOST_REGS;hr++) {
860     if(hr!=EXCLUDE_REG) {
861       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
862          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
863       {
864         cur->regmap[hr]=-1;
865         cur->dirty&=~(1<<hr);
866       }
867       // Don't need zeros
868       if((cur->regmap[hr]&63)==0)
869       {
870         cur->regmap[hr]=-1;
871         cur->dirty&=~(1<<hr);
872       }
873     }
874   }
875 }
876
877 #ifndef FORCE32
878 void div64(int64_t dividend,int64_t divisor)
879 {
880   lo=dividend/divisor;
881   hi=dividend%divisor;
882   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
883   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
884 }
885 void divu64(uint64_t dividend,uint64_t divisor)
886 {
887   lo=dividend/divisor;
888   hi=dividend%divisor;
889   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
890   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
891 }
892
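/* mult64/multu64 implement the 64x64->128 bit multiplies (DMULT/DMULTU) as
   four 32x32 partial products combined schoolbook-style; the low 64 bits of
   the product end up in lo and the high 64 bits in hi. */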
893 void mult64(uint64_t m1,uint64_t m2)
894 {
895    unsigned long long int op1, op2, op3, op4;
896    unsigned long long int result1, result2, result3, result4;
897    unsigned long long int temp1, temp2, temp3, temp4;
898    int sign = 0;
899    
   // m1/m2 arrive as uint64_t, so compare through int64_t or the sign
   // handling for negative operands would never trigger.
900    if ((int64_t)m1 < 0)
901      {
902     op2 = -m1;
903     sign = 1 - sign;
904      }
905    else op2 = m1;
906    if ((int64_t)m2 < 0)
907      {
908     op4 = -m2;
909     sign = 1 - sign;
910      }
911    else op4 = m2;
912    
913    op1 = op2 & 0xFFFFFFFF;
914    op2 = (op2 >> 32) & 0xFFFFFFFF;
915    op3 = op4 & 0xFFFFFFFF;
916    op4 = (op4 >> 32) & 0xFFFFFFFF;
917    
918    temp1 = op1 * op3;
919    temp2 = (temp1 >> 32) + op1 * op4;
920    temp3 = op2 * op3;
921    temp4 = (temp3 >> 32) + op2 * op4;
922    
923    result1 = temp1 & 0xFFFFFFFF;
924    result2 = temp2 + (temp3 & 0xFFFFFFFF);
925    result3 = (result2 >> 32) + temp4;
926    result4 = (result3 >> 32);
927    
928    lo = result1 | (result2 << 32);
929    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
930    if (sign)
931      {
932     hi = ~hi;
933     if (!lo) hi++;
934     else lo = ~lo + 1;
935      }
936 }
937
938 void multu64(uint64_t m1,uint64_t m2)
939 {
940    unsigned long long int op1, op2, op3, op4;
941    unsigned long long int result1, result2, result3, result4;
942    unsigned long long int temp1, temp2, temp3, temp4;
943    
944    op1 = m1 & 0xFFFFFFFF;
945    op2 = (m1 >> 32) & 0xFFFFFFFF;
946    op3 = m2 & 0xFFFFFFFF;
947    op4 = (m2 >> 32) & 0xFFFFFFFF;
948    
949    temp1 = op1 * op3;
950    temp2 = (temp1 >> 32) + op1 * op4;
951    temp3 = op2 * op3;
952    temp4 = (temp3 >> 32) + op2 * op4;
953    
954    result1 = temp1 & 0xFFFFFFFF;
955    result2 = temp2 + (temp3 & 0xFFFFFFFF);
956    result3 = (result2 >> 32) + temp4;
957    result4 = (result3 >> 32);
958    
959    lo = result1 | (result2 << 32);
960    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
961    
962   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
963   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
964 }
965
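/* Merge helpers for the unaligned 64-bit loads (LDL/LDR): combine the bytes
   just loaded with the bytes preserved from the old register value.  For
   example, ldl_merge(orig, loaded, 16) keeps the low 16 bits of orig and
   fills the rest with loaded<<16. */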
966 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
967 {
968   if(bits) {
969     original<<=64-bits;
970     original>>=64-bits;
971     loaded<<=bits;
972     original|=loaded;
973   }
974   else original=loaded;
975   return original;
976 }
977 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
978 {
979   if(bits^56) {
980     original>>=64-(bits^56);
981     original<<=64-(bits^56);
982     loaded>>=bits^56;
983     original|=loaded;
984   }
985   else original=loaded;
986   return original;
987 }
988 #endif
989
990 #ifdef __i386__
991 #include "assem_x86.c"
992 #endif
993 #ifdef __x86_64__
994 #include "assem_x64.c"
995 #endif
996 #ifdef __arm__
997 #include "assem_arm.c"
998 #endif
999
1000 // Add virtual address mapping to linked list
1001 void ll_add(struct ll_entry **head,int vaddr,void *addr)
1002 {
1003   struct ll_entry *new_entry;
1004   new_entry=malloc(sizeof(struct ll_entry));
1005   assert(new_entry!=NULL);
1006   new_entry->vaddr=vaddr;
1007   new_entry->reg32=0;
1008   new_entry->addr=addr;
1009   new_entry->next=*head;
1010   *head=new_entry;
1011 }
1012
1013 // Add virtual address mapping for 32-bit compiled block
1014 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
1015 {
1016   ll_add(head,vaddr,addr);
1017 #ifndef FORCE32
1018   (*head)->reg32=reg32;
1019 #endif
1020 }
1021
1022 // Check if an address is already compiled
1023 // but don't return addresses which are about to expire from the cache
1024 void *check_addr(u_int vaddr)
1025 {
1026   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1027   if(ht_bin[0]==vaddr) {
1028     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1029       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1030   }
1031   if(ht_bin[2]==vaddr) {
1032     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1033       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1034   }
1035   u_int page=get_page(vaddr);
1036   struct ll_entry *head;
1037   head=jump_in[page];
1038   while(head!=NULL) {
1039     if(head->vaddr==vaddr&&head->reg32==0) {
1040       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1041         // Update existing entry with current address
1042         if(ht_bin[0]==vaddr) {
1043           ht_bin[1]=(int)head->addr;
1044           return head->addr;
1045         }
1046         if(ht_bin[2]==vaddr) {
1047           ht_bin[3]=(int)head->addr;
1048           return head->addr;
1049         }
1050         // Insert into hash table with low priority.
1051         // Don't evict existing entries, as they are probably
1052         // addresses that are being accessed frequently.
1053         if(ht_bin[0]==-1) {
1054           ht_bin[1]=(int)head->addr;
1055           ht_bin[0]=vaddr;
1056         }else if(ht_bin[2]==-1) {
1057           ht_bin[3]=(int)head->addr;
1058           ht_bin[2]=vaddr;
1059         }
1060         return head->addr;
1061       }
1062     }
1063     head=head->next;
1064   }
1065   return 0;
1066 }
1067
1068 void remove_hash(int vaddr)
1069 {
1070   //printf("remove hash: %x\n",vaddr);
1071   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1072   if(ht_bin[2]==vaddr) {
1073     ht_bin[2]=ht_bin[3]=-1;
1074   }
1075   if(ht_bin[0]==vaddr) {
1076     ht_bin[0]=ht_bin[2];
1077     ht_bin[1]=ht_bin[3];
1078     ht_bin[2]=ht_bin[3]=-1;
1079   }
1080 }
1081
1082 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1083 {
1084   struct ll_entry *next;
1085   while(*head) {
1086     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1087        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1088     {
1089       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1090       remove_hash((*head)->vaddr);
1091       next=(*head)->next;
1092       free(*head);
1093       *head=next;
1094     }
1095     else
1096     {
1097       head=&((*head)->next);
1098     }
1099   }
1100 }
1101
1102 // Remove all entries from linked list
1103 void ll_clear(struct ll_entry **head)
1104 {
1105   struct ll_entry *cur;
1106   struct ll_entry *next;
1107   if((cur=*head)!=NULL) {
1108     *head=0;
1109     while(cur) {
1110       next=cur->next;
1111       free(cur);
1112       cur=next;
1113     }
1114   }
1115 }
1116
1117 // Dereference the pointers and remove if it matches
1118 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1119 {
1120   while(head) {
1121     int ptr=get_pointer(head->addr);
1122     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1123     if(((ptr>>shift)==(addr>>shift)) ||
1124        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1125     {
1126       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1127       u_int host_addr=(u_int)kill_pointer(head->addr);
1128       #ifdef __arm__
1129         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1130       #endif
1131     }
1132     head=head->next;
1133   }
1134 }
1135
1136 // This is called when we write to a compiled block (see do_invstub)
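// Both lists for the page are freed: jump_in (blocks whose entry point is in
// this page) and jump_out (direct jumps from other blocks that were linked
// into this page); linked jumps are unpatched via kill_pointer so they go
// back through the dynamic linker.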
1137 void invalidate_page(u_int page)
1138 {
1139   struct ll_entry *head;
1140   struct ll_entry *next;
1141   head=jump_in[page];
1142   jump_in[page]=0;
1143   while(head!=NULL) {
1144     inv_debug("INVALIDATE: %x\n",head->vaddr);
1145     remove_hash(head->vaddr);
1146     next=head->next;
1147     free(head);
1148     head=next;
1149   }
1150   head=jump_out[page];
1151   jump_out[page]=0;
1152   while(head!=NULL) {
1153     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1154     u_int host_addr=(u_int)kill_pointer(head->addr);
1155     #ifdef __arm__
1156       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1157     #endif
1158     next=head->next;
1159     free(head);
1160     head=next;
1161   }
1162 }
1163
1164 static void invalidate_block_range(u_int block, u_int first, u_int last)
1165 {
1166   u_int page=get_page(block<<12);
1167   //printf("first=%d last=%d\n",first,last);
1168   invalidate_page(page);
1169   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1170   assert(last<page+5);
1171   // Invalidate the adjacent pages if a block crosses a 4K boundary
1172   while(first<page) {
1173     invalidate_page(first);
1174     first++;
1175   }
1176   for(first=page+1;first<last;first++) {
1177     invalidate_page(first);
1178   }
1179   #ifdef __arm__
1180     do_clear_cache();
1181   #endif
1182   
1183   // Don't trap writes
1184   invalid_code[block]=1;
1185 #ifndef DISABLE_TLB
1186   // If there is a valid TLB entry for this page, remove write protect
1187   if(tlb_LUT_w[block]) {
1188     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1189     // CHECK: Is this right?
1190     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1191     u_int real_block=tlb_LUT_w[block]>>12;
1192     invalid_code[real_block]=1;
1193     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1194   }
1195   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1196 #endif
1197
1198   #ifdef USE_MINI_HT
1199   memset(mini_ht,-1,sizeof(mini_ht));
1200   #endif
1201 }
1202
1203 void invalidate_block(u_int block)
1204 {
1205   u_int page=get_page(block<<12);
1206   u_int vpage=get_vpage(block<<12);
1207   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1208   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1209   u_int first,last;
1210   first=last=page;
1211   struct ll_entry *head;
1212   head=jump_dirty[vpage];
1213   //printf("page=%d vpage=%d\n",page,vpage);
1214   while(head!=NULL) {
1215     u_int start,end;
1216     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1217       get_bounds((int)head->addr,&start,&end);
1218       //printf("start: %x end: %x\n",start,end);
1219       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1220         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1221           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1222           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1223         }
1224       }
1225 #ifndef DISABLE_TLB
1226       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1227         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1228           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1229           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1230         }
1231       }
1232 #endif
1233     }
1234     head=head->next;
1235   }
1236   invalidate_block_range(block,first,last);
1237 }
1238
1239 void invalidate_addr(u_int addr)
1240 {
1241 #ifdef PCSX
1242   //static int rhits;
1243   // this check is done by the caller
1244   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1245   u_int page=get_vpage(addr);
1246   if(page<2048) { // RAM
1247     struct ll_entry *head;
1248     u_int addr_min=~0, addr_max=0;
1249     int mask=RAM_SIZE-1;
1250     int pg1;
1251     inv_code_start=addr&~0xfff;
1252     inv_code_end=addr|0xfff;
1253     pg1=page;
1254     if (pg1>0) {
1255       // must check previous page too because of spans..
1256       pg1--;
1257       inv_code_start-=0x1000;
1258     }
1259     for(;pg1<=page;pg1++) {
1260       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1261         u_int start,end;
1262         get_bounds((int)head->addr,&start,&end);
1263         if((start&mask)<=(addr&mask)&&(addr&mask)<(end&mask)) {
1264           if(start<addr_min) addr_min=start;
1265           if(end>addr_max) addr_max=end;
1266         }
1267         else if(addr<start) {
1268           if(start<inv_code_end)
1269             inv_code_end=start-1;
1270         }
1271         else {
1272           if(end>inv_code_start)
1273             inv_code_start=end;
1274         }
1275       }
1276     }
1277     if (addr_min!=~0) {
1278       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1279       inv_code_start=inv_code_end=~0;
1280       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1281       return;
1282     }
1283     else {
1284       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1285       return;
1286     }
1287   }
1288 #endif
1289   invalidate_block(addr>>12);
1290 }
1291
1292 // This is called when loading a save state.
1293 // Anything could have changed, so invalidate everything.
1294 void invalidate_all_pages()
1295 {
1296   u_int page,n;
1297   for(page=0;page<4096;page++)
1298     invalidate_page(page);
1299   for(page=0;page<1048576;page++)
1300     if(!invalid_code[page]) {
1301       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1302       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1303     }
1304   #ifdef __arm__
1305   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1306   #endif
1307   #ifdef USE_MINI_HT
1308   memset(mini_ht,-1,sizeof(mini_ht));
1309   #endif
1310   #ifndef DISABLE_TLB
1311   // TLB
1312   for(page=0;page<0x100000;page++) {
1313     if(tlb_LUT_r[page]) {
1314       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1315       if(!tlb_LUT_w[page]||!invalid_code[page])
1316         memory_map[page]|=0x40000000; // Write protect
1317     }
1318     else memory_map[page]=-1;
1319     if(page==0x80000) page=0xC0000;
1320   }
1321   tlb_hacks();
1322   #endif
1323 }
1324
1325 // Add an entry to jump_out after making a link
1326 void add_link(u_int vaddr,void *src)
1327 {
1328   u_int page=get_page(vaddr);
1329   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1330   int *ptr=(int *)(src+4);
1331   assert((*ptr&0x0fff0000)==0x059f0000);
1332   ll_add(jump_out+page,vaddr,src);
1333   //int ptr=get_pointer(src);
1334   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1335 }
1336
1337 // If a code block was found to be unmodified (bit was set in
1338 // restore_candidate) and it remains unmodified (bit is clear
1339 // in invalid_code) then move the entries for that 4K page from
1340 // the dirty list to the clean list.
1341 void clean_blocks(u_int page)
1342 {
1343   struct ll_entry *head;
1344   inv_debug("INV: clean_blocks page=%d\n",page);
1345   head=jump_dirty[page];
1346   while(head!=NULL) {
1347     if(!invalid_code[head->vaddr>>12]) {
1348       // Don't restore blocks which are about to expire from the cache
1349       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1350         u_int start,end;
1351         if(verify_dirty((int)head->addr)) {
1352           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1353           u_int i;
1354           u_int inv=0;
1355           get_bounds((int)head->addr,&start,&end);
1356           if(start-(u_int)rdram<RAM_SIZE) {
1357             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1358               inv|=invalid_code[i];
1359             }
1360           }
1361 #ifndef DISABLE_TLB
1362           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1363             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1364             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1365             if(addr<start||addr>=end) inv=1;
1366           }
1367 #endif
1368           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1369             inv=1;
1370           }
1371           if(!inv) {
1372             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1373             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1374               u_int ppage=page;
1375 #ifndef DISABLE_TLB
1376               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1377 #endif
1378               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1379               //printf("page=%x, addr=%x\n",page,head->vaddr);
1380               //assert(head->vaddr>>12==(page|0x80000));
1381               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1382               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1383               if(!head->reg32) {
1384                 if(ht_bin[0]==head->vaddr) {
1385                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1386                 }
1387                 if(ht_bin[2]==head->vaddr) {
1388                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1389                 }
1390               }
1391             }
1392           }
1393         }
1394       }
1395     }
1396     head=head->next;
1397   }
1398 }
1399
1400
1401 void mov_alloc(struct regstat *current,int i)
1402 {
1403   // Note: Don't need to actually alloc the source registers
1404   if((~current->is32>>rs1[i])&1) {
1405     //alloc_reg64(current,i,rs1[i]);
1406     alloc_reg64(current,i,rt1[i]);
1407     current->is32&=~(1LL<<rt1[i]);
1408   } else {
1409     //alloc_reg(current,i,rs1[i]);
1410     alloc_reg(current,i,rt1[i]);
1411     current->is32|=(1LL<<rt1[i]);
1412   }
1413   clear_const(current,rs1[i]);
1414   clear_const(current,rt1[i]);
1415   dirty_reg(current,rt1[i]);
1416 }
1417
1418 void shiftimm_alloc(struct regstat *current,int i)
1419 {
1420   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1421   {
1422     if(rt1[i]) {
1423       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1424       else lt1[i]=rs1[i];
1425       alloc_reg(current,i,rt1[i]);
1426       current->is32|=1LL<<rt1[i];
1427       dirty_reg(current,rt1[i]);
1428       if(is_const(current,rs1[i])) {
1429         int v=get_const(current,rs1[i]);
1430         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1431         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1432         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1433       }
1434       else clear_const(current,rt1[i]);
1435     }
1436   }
1437   else
1438   {
1439     clear_const(current,rs1[i]);
1440     clear_const(current,rt1[i]);
1441   }
1442
1443   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1444   {
1445     if(rt1[i]) {
1446       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1447       alloc_reg64(current,i,rt1[i]);
1448       current->is32&=~(1LL<<rt1[i]);
1449       dirty_reg(current,rt1[i]);
1450     }
1451   }
1452   if(opcode2[i]==0x3c) // DSLL32
1453   {
1454     if(rt1[i]) {
1455       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1456       alloc_reg64(current,i,rt1[i]);
1457       current->is32&=~(1LL<<rt1[i]);
1458       dirty_reg(current,rt1[i]);
1459     }
1460   }
1461   if(opcode2[i]==0x3e) // DSRL32
1462   {
1463     if(rt1[i]) {
1464       alloc_reg64(current,i,rs1[i]);
1465       if(imm[i]==32) {
1466         alloc_reg64(current,i,rt1[i]);
1467         current->is32&=~(1LL<<rt1[i]);
1468       } else {
1469         alloc_reg(current,i,rt1[i]);
1470         current->is32|=1LL<<rt1[i];
1471       }
1472       dirty_reg(current,rt1[i]);
1473     }
1474   }
1475   if(opcode2[i]==0x3f) // DSRA32
1476   {
1477     if(rt1[i]) {
1478       alloc_reg64(current,i,rs1[i]);
1479       alloc_reg(current,i,rt1[i]);
1480       current->is32|=1LL<<rt1[i];
1481       dirty_reg(current,rt1[i]);
1482     }
1483   }
1484 }
1485
1486 void shift_alloc(struct regstat *current,int i)
1487 {
1488   if(rt1[i]) {
1489     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1490     {
1491       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1492       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1493       alloc_reg(current,i,rt1[i]);
1494       if(rt1[i]==rs2[i]) {
1495         alloc_reg_temp(current,i,-1);
1496         minimum_free_regs[i]=1;
1497       }
1498       current->is32|=1LL<<rt1[i];
1499     } else { // DSLLV/DSRLV/DSRAV
1500       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1501       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1502       alloc_reg64(current,i,rt1[i]);
1503       current->is32&=~(1LL<<rt1[i]);
1504       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1505       {
1506         alloc_reg_temp(current,i,-1);
1507         minimum_free_regs[i]=1;
1508       }
1509     }
1510     clear_const(current,rs1[i]);
1511     clear_const(current,rs2[i]);
1512     clear_const(current,rt1[i]);
1513     dirty_reg(current,rt1[i]);
1514   }
1515 }
1516
1517 void alu_alloc(struct regstat *current,int i)
1518 {
1519   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1520     if(rt1[i]) {
1521       if(rs1[i]&&rs2[i]) {
1522         alloc_reg(current,i,rs1[i]);
1523         alloc_reg(current,i,rs2[i]);
1524       }
1525       else {
1526         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1527         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1528       }
1529       alloc_reg(current,i,rt1[i]);
1530     }
1531     current->is32|=1LL<<rt1[i];
1532   }
1533   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1534     if(rt1[i]) {
1535       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1536       {
1537         alloc_reg64(current,i,rs1[i]);
1538         alloc_reg64(current,i,rs2[i]);
1539         alloc_reg(current,i,rt1[i]);
1540       } else {
1541         alloc_reg(current,i,rs1[i]);
1542         alloc_reg(current,i,rs2[i]);
1543         alloc_reg(current,i,rt1[i]);
1544       }
1545     }
1546     current->is32|=1LL<<rt1[i];
1547   }
1548   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1549     if(rt1[i]) {
1550       if(rs1[i]&&rs2[i]) {
1551         alloc_reg(current,i,rs1[i]);
1552         alloc_reg(current,i,rs2[i]);
1553       }
1554       else
1555       {
1556         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1557         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1558       }
1559       alloc_reg(current,i,rt1[i]);
1560       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1561       {
1562         if(!((current->uu>>rt1[i])&1)) {
1563           alloc_reg64(current,i,rt1[i]);
1564         }
1565         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1566           if(rs1[i]&&rs2[i]) {
1567             alloc_reg64(current,i,rs1[i]);
1568             alloc_reg64(current,i,rs2[i]);
1569           }
1570           else
1571           {
1572             // Is it really worth it to keep 64-bit values in registers?
1573             #ifdef NATIVE_64BIT
1574             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1575             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1576             #endif
1577           }
1578         }
1579         current->is32&=~(1LL<<rt1[i]);
1580       } else {
1581         current->is32|=1LL<<rt1[i];
1582       }
1583     }
1584   }
1585   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1586     if(rt1[i]) {
1587       if(rs1[i]&&rs2[i]) {
1588         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1589           alloc_reg64(current,i,rs1[i]);
1590           alloc_reg64(current,i,rs2[i]);
1591           alloc_reg64(current,i,rt1[i]);
1592         } else {
1593           alloc_reg(current,i,rs1[i]);
1594           alloc_reg(current,i,rs2[i]);
1595           alloc_reg(current,i,rt1[i]);
1596         }
1597       }
1598       else {
1599         alloc_reg(current,i,rt1[i]);
1600         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1601           // DADD used as move, or zeroing
1602           // If we have a 64-bit source, then make the target 64 bits too
1603           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1604             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1605             alloc_reg64(current,i,rt1[i]);
1606           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1607             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1608             alloc_reg64(current,i,rt1[i]);
1609           }
1610           if(opcode2[i]>=0x2e&&rs2[i]) {
1611             // DSUB used as negation - 64-bit result
1612             // If we have a 32-bit register, extend it to 64 bits
1613             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1614             alloc_reg64(current,i,rt1[i]);
1615           }
1616         }
1617       }
1618       if(rs1[i]&&rs2[i]) {
1619         current->is32&=~(1LL<<rt1[i]);
1620       } else if(rs1[i]) {
1621         current->is32&=~(1LL<<rt1[i]);
1622         if((current->is32>>rs1[i])&1)
1623           current->is32|=1LL<<rt1[i];
1624       } else if(rs2[i]) {
1625         current->is32&=~(1LL<<rt1[i]);
1626         if((current->is32>>rs2[i])&1)
1627           current->is32|=1LL<<rt1[i];
1628       } else {
1629         current->is32|=1LL<<rt1[i];
1630       }
1631     }
1632   }
1633   clear_const(current,rs1[i]);
1634   clear_const(current,rs2[i]);
1635   clear_const(current,rt1[i]);
1636   dirty_reg(current,rt1[i]);
1637 }
1638
1639 void imm16_alloc(struct regstat *current,int i)
1640 {
1641   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1642   else lt1[i]=rs1[i];
1643   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1644   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1645     current->is32&=~(1LL<<rt1[i]);
1646     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1647       // TODO: Could preserve the 32-bit flag if the immediate is zero
1648       alloc_reg64(current,i,rt1[i]);
1649       alloc_reg64(current,i,rs1[i]);
1650     }
1651     clear_const(current,rs1[i]);
1652     clear_const(current,rt1[i]);
1653   }
1654   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1655     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1656     current->is32|=1LL<<rt1[i];
1657     clear_const(current,rs1[i]);
1658     clear_const(current,rt1[i]);
1659   }
1660   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1661     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1662       if(rs1[i]!=rt1[i]) {
1663         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1664         alloc_reg64(current,i,rt1[i]);
1665         current->is32&=~(1LL<<rt1[i]);
1666       }
1667     }
1668     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1669     if(is_const(current,rs1[i])) {
1670       int v=get_const(current,rs1[i]);
1671       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1672       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1673       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1674     }
1675     else clear_const(current,rt1[i]);
1676   }
1677   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1678     if(is_const(current,rs1[i])) {
1679       int v=get_const(current,rs1[i]);
1680       set_const(current,rt1[i],v+imm[i]);
1681     }
1682     else clear_const(current,rt1[i]);
1683     current->is32|=1LL<<rt1[i];
1684   }
1685   else {
1686     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1687     current->is32|=1LL<<rt1[i];
1688   }
1689   dirty_reg(current,rt1[i]);
1690 }
1691
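// Register allocation for loads; 64-bit loads and the unaligned
// LWL/LWR/LDL/LDR forms need extra temporaries.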
1692 void load_alloc(struct regstat *current,int i)
1693 {
1694   clear_const(current,rt1[i]);
1695   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1696   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1697   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1698   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1699     alloc_reg(current,i,rt1[i]);
1700     assert(get_reg(current->regmap,rt1[i])>=0);
1701     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1702     {
1703       current->is32&=~(1LL<<rt1[i]);
1704       alloc_reg64(current,i,rt1[i]);
1705     }
1706     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1707     {
1708       current->is32&=~(1LL<<rt1[i]);
1709       alloc_reg64(current,i,rt1[i]);
1710       alloc_all(current,i);
1711       alloc_reg64(current,i,FTEMP);
1712       minimum_free_regs[i]=HOST_REGS;
1713     }
1714     else current->is32|=1LL<<rt1[i];
1715     dirty_reg(current,rt1[i]);
1716     // If using TLB, need a register for pointer to the mapping table
1717     if(using_tlb) alloc_reg(current,i,TLREG);
1718     // LWL/LWR need a temporary register for the old value
1719     if(opcode[i]==0x22||opcode[i]==0x26)
1720     {
1721       alloc_reg(current,i,FTEMP);
1722       alloc_reg_temp(current,i,-1);
1723       minimum_free_regs[i]=1;
1724     }
1725   }
1726   else
1727   {
1728     // Load to r0 or unneeded register (dummy load)
1729     // but we still need a register to calculate the address
1730     if(opcode[i]==0x22||opcode[i]==0x26)
1731     {
1732       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1733     }
1734     // If using TLB, need a register for pointer to the mapping table
1735     if(using_tlb) alloc_reg(current,i,TLREG);
1736     alloc_reg_temp(current,i,-1);
1737     minimum_free_regs[i]=1;
1738     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1739     {
1740       alloc_all(current,i);
1741       alloc_reg64(current,i,FTEMP);
1742       minimum_free_regs[i]=HOST_REGS;
1743     }
1744   }
1745 }
1746
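// Register allocation for stores (SB/SH/SW/SD and SWL/SWR/SDL/SDR).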
1747 void store_alloc(struct regstat *current,int i)
1748 {
1749   clear_const(current,rs2[i]);
1750   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1751   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1752   alloc_reg(current,i,rs2[i]);
1753   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1754     alloc_reg64(current,i,rs2[i]);
1755     if(rs2[i]) alloc_reg(current,i,FTEMP);
1756   }
1757   // If using TLB, need a register for pointer to the mapping table
1758   if(using_tlb) alloc_reg(current,i,TLREG);
1759   #if defined(HOST_IMM8)
1760   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1761   else alloc_reg(current,i,INVCP);
1762   #endif
1763   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1764     alloc_reg(current,i,FTEMP);
1765   }
1766   // We need a temporary register for address generation
1767   alloc_reg_temp(current,i,-1);
1768   minimum_free_regs[i]=1;
1769 }
1770
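// Register allocation for COP1 (FPU) loads and stores (LWC1/LDC1/SWC1/SDC1).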
1771 void c1ls_alloc(struct regstat *current,int i)
1772 {
1773   //clear_const(current,rs1[i]); // FIXME
1774   clear_const(current,rt1[i]);
1775   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1776   alloc_reg(current,i,CSREG); // Status
1777   alloc_reg(current,i,FTEMP);
1778   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1779     alloc_reg64(current,i,FTEMP);
1780   }
1781   // If using TLB, need a register for pointer to the mapping table
1782   if(using_tlb) alloc_reg(current,i,TLREG);
1783   #if defined(HOST_IMM8)
1784   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1785   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1786     alloc_reg(current,i,INVCP);
1787   #endif
1788   // We need a temporary register for address generation
1789   alloc_reg_temp(current,i,-1);
1790 }
1791
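// Register allocation for COP2 (GTE) loads and stores (LWC2/SWC2).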
1792 void c2ls_alloc(struct regstat *current,int i)
1793 {
1794   clear_const(current,rt1[i]);
1795   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1796   alloc_reg(current,i,FTEMP);
1797   // If using TLB, need a register for pointer to the mapping table
1798   if(using_tlb) alloc_reg(current,i,TLREG);
1799   #if defined(HOST_IMM8)
1800   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1801   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1802     alloc_reg(current,i,INVCP);
1803   #endif
1804   // We need a temporary register for address generation
1805   alloc_reg_temp(current,i,-1);
1806   minimum_free_regs[i]=1;
1807 }
1808
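// Generic allocation for MULT/DIV-type ops; an architecture-specific backend
// may supply its own version, hence the #ifndef guard.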
1809 #ifndef multdiv_alloc
1810 void multdiv_alloc(struct regstat *current,int i)
1811 {
1812   //  case 0x18: MULT
1813   //  case 0x19: MULTU
1814   //  case 0x1A: DIV
1815   //  case 0x1B: DIVU
1816   //  case 0x1C: DMULT
1817   //  case 0x1D: DMULTU
1818   //  case 0x1E: DDIV
1819   //  case 0x1F: DDIVU
1820   clear_const(current,rs1[i]);
1821   clear_const(current,rs2[i]);
1822   if(rs1[i]&&rs2[i])
1823   {
1824     if((opcode2[i]&4)==0) // 32-bit
1825     {
1826       current->u&=~(1LL<<HIREG);
1827       current->u&=~(1LL<<LOREG);
1828       alloc_reg(current,i,HIREG);
1829       alloc_reg(current,i,LOREG);
1830       alloc_reg(current,i,rs1[i]);
1831       alloc_reg(current,i,rs2[i]);
1832       current->is32|=1LL<<HIREG;
1833       current->is32|=1LL<<LOREG;
1834       dirty_reg(current,HIREG);
1835       dirty_reg(current,LOREG);
1836     }
1837     else // 64-bit
1838     {
1839       current->u&=~(1LL<<HIREG);
1840       current->u&=~(1LL<<LOREG);
1841       current->uu&=~(1LL<<HIREG);
1842       current->uu&=~(1LL<<LOREG);
1843       alloc_reg64(current,i,HIREG);
1844       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1845       alloc_reg64(current,i,rs1[i]);
1846       alloc_reg64(current,i,rs2[i]);
1847       alloc_all(current,i);
1848       current->is32&=~(1LL<<HIREG);
1849       current->is32&=~(1LL<<LOREG);
1850       dirty_reg(current,HIREG);
1851       dirty_reg(current,LOREG);
1852       minimum_free_regs[i]=HOST_REGS;
1853     }
1854   }
1855   else
1856   {
1857     // Multiply by zero is zero.
1858     // MIPS does not have a divide by zero exception.
1859     // The result is undefined, so we just return zero.
1860     alloc_reg(current,i,HIREG);
1861     alloc_reg(current,i,LOREG);
1862     current->is32|=1LL<<HIREG;
1863     current->is32|=1LL<<LOREG;
1864     dirty_reg(current,HIREG);
1865     dirty_reg(current,LOREG);
1866   }
1867 }
1868 #endif
1869
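// COP0 moves and TLB/ERET ops can touch privileged state,
// so all host registers are flushed around them.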
1870 void cop0_alloc(struct regstat *current,int i)
1871 {
1872   if(opcode2[i]==0) // MFC0
1873   {
1874     if(rt1[i]) {
1875       clear_const(current,rt1[i]);
1876       alloc_all(current,i);
1877       alloc_reg(current,i,rt1[i]);
1878       current->is32|=1LL<<rt1[i];
1879       dirty_reg(current,rt1[i]);
1880     }
1881   }
1882   else if(opcode2[i]==4) // MTC0
1883   {
1884     if(rs1[i]){
1885       clear_const(current,rs1[i]);
1886       alloc_reg(current,i,rs1[i]);
1887       alloc_all(current,i);
1888     }
1889     else {
1890       alloc_all(current,i); // FIXME: Keep r0
1891       current->u&=~1LL;
1892       alloc_reg(current,i,0);
1893     }
1894   }
1895   else
1896   {
1897     // TLBR/TLBWI/TLBWR/TLBP/ERET
1898     assert(opcode2[i]==0x10);
1899     alloc_all(current,i);
1900   }
1901   minimum_free_regs[i]=HOST_REGS;
1902 }
1903
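// COP1 (and COP2) register moves and control ops:
// MFC1/DMFC1/CFC1 and MTC1/DMTC1/CTC1.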
1904 void cop1_alloc(struct regstat *current,int i)
1905 {
1906   alloc_reg(current,i,CSREG); // Load status
1907   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1908   {
1909     if(rt1[i]){
1910       clear_const(current,rt1[i]);
1911       if(opcode2[i]==1) {
1912         alloc_reg64(current,i,rt1[i]); // DMFC1
1913         current->is32&=~(1LL<<rt1[i]);
1914       }else{
1915         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1916         current->is32|=1LL<<rt1[i];
1917       }
1918       dirty_reg(current,rt1[i]);
1919     }
1920     alloc_reg_temp(current,i,-1);
1921   }
1922   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1923   {
1924     if(rs1[i]){
1925       clear_const(current,rs1[i]);
1926       if(opcode2[i]==5)
1927         alloc_reg64(current,i,rs1[i]); // DMTC1
1928       else
1929         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1930       alloc_reg_temp(current,i,-1);
1931     }
1932     else {
1933       current->u&=~1LL;
1934       alloc_reg(current,i,0);
1935       alloc_reg_temp(current,i,-1);
1936     }
1937   }
1938   minimum_free_regs[i]=1;
1939 }
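// The FPU ops below only need the COP1 status register and a temporary;
// GTE operations (c2op) just need a temporary.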
1940 void fconv_alloc(struct regstat *current,int i)
1941 {
1942   alloc_reg(current,i,CSREG); // Load status
1943   alloc_reg_temp(current,i,-1);
1944   minimum_free_regs[i]=1;
1945 }
1946 void float_alloc(struct regstat *current,int i)
1947 {
1948   alloc_reg(current,i,CSREG); // Load status
1949   alloc_reg_temp(current,i,-1);
1950   minimum_free_regs[i]=1;
1951 }
1952 void c2op_alloc(struct regstat *current,int i)
1953 {
1954   alloc_reg_temp(current,i,-1);
1955 }
1956 void fcomp_alloc(struct regstat *current,int i)
1957 {
1958   alloc_reg(current,i,CSREG); // Load status
1959   alloc_reg(current,i,FSREG); // Load flags
1960   dirty_reg(current,FSREG); // Flag will be modified
1961   alloc_reg_temp(current,i,-1);
1962   minimum_free_regs[i]=1;
1963 }
1964
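// SYSCALL-type traps leave the block: allocate the cycle counter and flush everything.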
1965 void syscall_alloc(struct regstat *current,int i)
1966 {
1967   alloc_cc(current,i);
1968   dirty_reg(current,CCREG);
1969   alloc_all(current,i);
1970   minimum_free_regs[i]=HOST_REGS;
1971   current->isconst=0;
1972 }
1973
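// Dispatch register allocation for the instruction in a branch delay slot.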
1974 void delayslot_alloc(struct regstat *current,int i)
1975 {
1976   switch(itype[i]) {
1977     case UJUMP:
1978     case CJUMP:
1979     case SJUMP:
1980     case RJUMP:
1981     case FJUMP:
1982     case SYSCALL:
1983     case HLECALL:
1984     case SPAN:
1985       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1986       printf("Disabled speculative precompilation\n");
1987       stop_after_jal=1;
1988       break;
1989     case IMM16:
1990       imm16_alloc(current,i);
1991       break;
1992     case LOAD:
1993     case LOADLR:
1994       load_alloc(current,i);
1995       break;
1996     case STORE:
1997     case STORELR:
1998       store_alloc(current,i);
1999       break;
2000     case ALU:
2001       alu_alloc(current,i);
2002       break;
2003     case SHIFT:
2004       shift_alloc(current,i);
2005       break;
2006     case MULTDIV:
2007       multdiv_alloc(current,i);
2008       break;
2009     case SHIFTIMM:
2010       shiftimm_alloc(current,i);
2011       break;
2012     case MOV:
2013       mov_alloc(current,i);
2014       break;
2015     case COP0:
2016       cop0_alloc(current,i);
2017       break;
2018     case COP1:
2019     case COP2:
2020       cop1_alloc(current,i);
2021       break;
2022     case C1LS:
2023       c1ls_alloc(current,i);
2024       break;
2025     case C2LS:
2026       c2ls_alloc(current,i);
2027       break;
2028     case FCONV:
2029       fconv_alloc(current,i);
2030       break;
2031     case FLOAT:
2032       float_alloc(current,i);
2033       break;
2034     case FCOMP:
2035       fcomp_alloc(current,i);
2036       break;
2037     case C2OP:
2038       c2op_alloc(current,i);
2039       break;
2040   }
2041 }
2042
2043 // Special case where a branch and delay slot span two pages in virtual memory
2044 static void pagespan_alloc(struct regstat *current,int i)
2045 {
2046   current->isconst=0;
2047   current->wasconst=0;
2048   regs[i].wasconst=0;
2049   minimum_free_regs[i]=HOST_REGS;
2050   alloc_all(current,i);
2051   alloc_cc(current,i);
2052   dirty_reg(current,CCREG);
2053   if(opcode[i]==3) // JAL
2054   {
2055     alloc_reg(current,i,31);
2056     dirty_reg(current,31);
2057   }
2058   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2059   {
2060     alloc_reg(current,i,rs1[i]);
2061     if (rt1[i]!=0) {
2062       alloc_reg(current,i,rt1[i]);
2063       dirty_reg(current,rt1[i]);
2064     }
2065   }
2066   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2067   {
2068     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2069     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2070     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2071     {
2072       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2073       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2074     }
2075   }
2076   else
2077   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2078   {
2079     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2080     if(!((current->is32>>rs1[i])&1))
2081     {
2082       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2083     }
2084   }
2085   else
2086   if(opcode[i]==0x11) // BC1
2087   {
2088     alloc_reg(current,i,FSREG);
2089     alloc_reg(current,i,CSREG);
2090   }
2091   //else ...
2092 }
2093
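// Queue an out-of-line stub (slow path) to be emitted after the block's main code.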
2094 add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2095 {
2096   stubs[stubcount][0]=type;
2097   stubs[stubcount][1]=addr;
2098   stubs[stubcount][2]=retaddr;
2099   stubs[stubcount][3]=a;
2100   stubs[stubcount][4]=b;
2101   stubs[stubcount][5]=c;
2102   stubs[stubcount][6]=d;
2103   stubs[stubcount][7]=e;
2104   stubcount++;
2105 }
2106
2107 // Write out a single register
2108 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2109 {
2110   int hr;
2111   for(hr=0;hr<HOST_REGS;hr++) {
2112     if(hr!=EXCLUDE_REG) {
2113       if((regmap[hr]&63)==r) {
2114         if((dirty>>hr)&1) {
2115           if(regmap[hr]<64) {
2116             emit_storereg(r,hr);
2117 #ifndef FORCE32
2118             if((is32>>regmap[hr])&1) {
2119               emit_sarimm(hr,31,hr);
2120               emit_storereg(r|64,hr);
2121             }
2122 #endif
2123           }else{
2124             emit_storereg(r|64,hr);
2125           }
2126         }
2127       }
2128     }
2129   }
2130 }
2131
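// Debugging helpers: RAM/register checksums and a register dump used by memdebug() below.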
2132 int mchecksum()
2133 {
2134   //if(!tracedebug) return 0;
2135   int i;
2136   int sum=0;
2137   for(i=0;i<2097152;i++) {
2138     unsigned int temp=sum;
2139     sum<<=1;
2140     sum|=(~temp)>>31;
2141     sum^=((u_int *)rdram)[i];
2142   }
2143   return sum;
2144 }
2145 int rchecksum()
2146 {
2147   int i;
2148   int sum=0;
2149   for(i=0;i<64;i++)
2150     sum^=((u_int *)reg)[i];
2151   return sum;
2152 }
2153 void rlist()
2154 {
2155   int i;
2156   printf("TRACE: ");
2157   for(i=0;i<32;i++)
2158     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2159   printf("\n");
2160 #ifndef DISABLE_COP1
2161   printf("TRACE: ");
2162   for(i=0;i<32;i++)
2163     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2164   printf("\n");
2165 #endif
2166 }
2167
2168 void enabletrace()
2169 {
2170   tracedebug=1;
2171 }
2172
2173 void memdebug(int i)
2174 {
2175   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2176   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2177   //rlist();
2178   //if(tracedebug) {
2179   //if(Count>=-2084597794) {
2180   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2181   //if(0) {
2182     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2183     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2184     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2185     rlist();
2186     #ifdef __i386__
2187     printf("TRACE: %x\n",(&i)[-1]);
2188     #endif
2189     #ifdef __arm__
2190     int j;
2191     printf("TRACE: %x \n",(&j)[10]);
2192     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2193     #endif
2194     //fflush(stdout);
2195   }
2196   //printf("TRACE: %x\n",(&i)[-1]);
2197 }
2198
2199 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2200 {
2201   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2202 }
2203
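// Code generation for register-register ALU ops:
// ADD/SUB(U), DADD/DSUB(U), SLT/SLTU and AND/OR/XOR/NOR.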
2204 void alu_assemble(int i,struct regstat *i_regs)
2205 {
2206   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2207     if(rt1[i]) {
2208       signed char s1,s2,t;
2209       t=get_reg(i_regs->regmap,rt1[i]);
2210       if(t>=0) {
2211         s1=get_reg(i_regs->regmap,rs1[i]);
2212         s2=get_reg(i_regs->regmap,rs2[i]);
2213         if(rs1[i]&&rs2[i]) {
2214           assert(s1>=0);
2215           assert(s2>=0);
2216           if(opcode2[i]&2) emit_sub(s1,s2,t);
2217           else emit_add(s1,s2,t);
2218         }
2219         else if(rs1[i]) {
2220           if(s1>=0) emit_mov(s1,t);
2221           else emit_loadreg(rs1[i],t);
2222         }
2223         else if(rs2[i]) {
2224           if(s2>=0) {
2225             if(opcode2[i]&2) emit_neg(s2,t);
2226             else emit_mov(s2,t);
2227           }
2228           else {
2229             emit_loadreg(rs2[i],t);
2230             if(opcode2[i]&2) emit_neg(t,t);
2231           }
2232         }
2233         else emit_zeroreg(t);
2234       }
2235     }
2236   }
2237   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2238     if(rt1[i]) {
2239       signed char s1l,s2l,s1h,s2h,tl,th;
2240       tl=get_reg(i_regs->regmap,rt1[i]);
2241       th=get_reg(i_regs->regmap,rt1[i]|64);
2242       if(tl>=0) {
2243         s1l=get_reg(i_regs->regmap,rs1[i]);
2244         s2l=get_reg(i_regs->regmap,rs2[i]);
2245         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2246         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2247         if(rs1[i]&&rs2[i]) {
2248           assert(s1l>=0);
2249           assert(s2l>=0);
2250           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2251           else emit_adds(s1l,s2l,tl);
2252           if(th>=0) {
2253             #ifdef INVERTED_CARRY
2254             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2255             #else
2256             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2257             #endif
2258             else emit_add(s1h,s2h,th);
2259           }
2260         }
2261         else if(rs1[i]) {
2262           if(s1l>=0) emit_mov(s1l,tl);
2263           else emit_loadreg(rs1[i],tl);
2264           if(th>=0) {
2265             if(s1h>=0) emit_mov(s1h,th);
2266             else emit_loadreg(rs1[i]|64,th);
2267           }
2268         }
2269         else if(rs2[i]) {
2270           if(s2l>=0) {
2271             if(opcode2[i]&2) emit_negs(s2l,tl);
2272             else emit_mov(s2l,tl);
2273           }
2274           else {
2275             emit_loadreg(rs2[i],tl);
2276             if(opcode2[i]&2) emit_negs(tl,tl);
2277           }
2278           if(th>=0) {
2279             #ifdef INVERTED_CARRY
2280             if(s2h>=0) emit_mov(s2h,th);
2281             else emit_loadreg(rs2[i]|64,th);
2282             if(opcode2[i]&2) {
2283               emit_adcimm(-1,th); // x86 has inverted carry flag
2284               emit_not(th,th);
2285             }
2286             #else
2287             if(opcode2[i]&2) {
2288               if(s2h>=0) emit_rscimm(s2h,0,th);
2289               else {
2290                 emit_loadreg(rs2[i]|64,th);
2291                 emit_rscimm(th,0,th);
2292               }
2293             }else{
2294               if(s2h>=0) emit_mov(s2h,th);
2295               else emit_loadreg(rs2[i]|64,th);
2296             }
2297             #endif
2298           }
2299         }
2300         else {
2301           emit_zeroreg(tl);
2302           if(th>=0) emit_zeroreg(th);
2303         }
2304       }
2305     }
2306   }
2307   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2308     if(rt1[i]) {
2309       signed char s1l,s1h,s2l,s2h,t;
2310       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2311       {
2312         t=get_reg(i_regs->regmap,rt1[i]);
2313         //assert(t>=0);
2314         if(t>=0) {
2315           s1l=get_reg(i_regs->regmap,rs1[i]);
2316           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2317           s2l=get_reg(i_regs->regmap,rs2[i]);
2318           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2319           if(rs2[i]==0) // rx<r0
2320           {
2321             assert(s1h>=0);
2322             if(opcode2[i]==0x2a) // SLT
2323               emit_shrimm(s1h,31,t);
2324             else // SLTU (unsigned cannot be less than zero)
2325               emit_zeroreg(t);
2326           }
2327           else if(rs1[i]==0) // r0<rx
2328           {
2329             assert(s2h>=0);
2330             if(opcode2[i]==0x2a) // SLT
2331               emit_set_gz64_32(s2h,s2l,t);
2332             else // SLTU (set if not zero)
2333               emit_set_nz64_32(s2h,s2l,t);
2334           }
2335           else {
2336             assert(s1l>=0);assert(s1h>=0);
2337             assert(s2l>=0);assert(s2h>=0);
2338             if(opcode2[i]==0x2a) // SLT
2339               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2340             else // SLTU
2341               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2342           }
2343         }
2344       } else {
2345         t=get_reg(i_regs->regmap,rt1[i]);
2346         //assert(t>=0);
2347         if(t>=0) {
2348           s1l=get_reg(i_regs->regmap,rs1[i]);
2349           s2l=get_reg(i_regs->regmap,rs2[i]);
2350           if(rs2[i]==0) // rx<r0
2351           {
2352             assert(s1l>=0);
2353             if(opcode2[i]==0x2a) // SLT
2354               emit_shrimm(s1l,31,t);
2355             else // SLTU (unsigned cannot be less than zero)
2356               emit_zeroreg(t);
2357           }
2358           else if(rs1[i]==0) // r0<rx
2359           {
2360             assert(s2l>=0);
2361             if(opcode2[i]==0x2a) // SLT
2362               emit_set_gz32(s2l,t);
2363             else // SLTU (set if not zero)
2364               emit_set_nz32(s2l,t);
2365           }
2366           else{
2367             assert(s1l>=0);assert(s2l>=0);
2368             if(opcode2[i]==0x2a) // SLT
2369               emit_set_if_less32(s1l,s2l,t);
2370             else // SLTU
2371               emit_set_if_carry32(s1l,s2l,t);
2372           }
2373         }
2374       }
2375     }
2376   }
2377   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2378     if(rt1[i]) {
2379       signed char s1l,s1h,s2l,s2h,th,tl;
2380       tl=get_reg(i_regs->regmap,rt1[i]);
2381       th=get_reg(i_regs->regmap,rt1[i]|64);
2382       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2383       {
2384         assert(tl>=0);
2385         if(tl>=0) {
2386           s1l=get_reg(i_regs->regmap,rs1[i]);
2387           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2388           s2l=get_reg(i_regs->regmap,rs2[i]);
2389           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2390           if(rs1[i]&&rs2[i]) {
2391             assert(s1l>=0);assert(s1h>=0);
2392             assert(s2l>=0);assert(s2h>=0);
2393             if(opcode2[i]==0x24) { // AND
2394               emit_and(s1l,s2l,tl);
2395               emit_and(s1h,s2h,th);
2396             } else
2397             if(opcode2[i]==0x25) { // OR
2398               emit_or(s1l,s2l,tl);
2399               emit_or(s1h,s2h,th);
2400             } else
2401             if(opcode2[i]==0x26) { // XOR
2402               emit_xor(s1l,s2l,tl);
2403               emit_xor(s1h,s2h,th);
2404             } else
2405             if(opcode2[i]==0x27) { // NOR
2406               emit_or(s1l,s2l,tl);
2407               emit_or(s1h,s2h,th);
2408               emit_not(tl,tl);
2409               emit_not(th,th);
2410             }
2411           }
2412           else
2413           {
2414             if(opcode2[i]==0x24) { // AND
2415               emit_zeroreg(tl);
2416               emit_zeroreg(th);
2417             } else
2418             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2419               if(rs1[i]){
2420                 if(s1l>=0) emit_mov(s1l,tl);
2421                 else emit_loadreg(rs1[i],tl);
2422                 if(s1h>=0) emit_mov(s1h,th);
2423                 else emit_loadreg(rs1[i]|64,th);
2424               }
2425               else
2426               if(rs2[i]){
2427                 if(s2l>=0) emit_mov(s2l,tl);
2428                 else emit_loadreg(rs2[i],tl);
2429                 if(s2h>=0) emit_mov(s2h,th);
2430                 else emit_loadreg(rs2[i]|64,th);
2431               }
2432               else{
2433                 emit_zeroreg(tl);
2434                 emit_zeroreg(th);
2435               }
2436             } else
2437             if(opcode2[i]==0x27) { // NOR
2438               if(rs1[i]){
2439                 if(s1l>=0) emit_not(s1l,tl);
2440                 else{
2441                   emit_loadreg(rs1[i],tl);
2442                   emit_not(tl,tl);
2443                 }
2444                 if(s1h>=0) emit_not(s1h,th);
2445                 else{
2446                   emit_loadreg(rs1[i]|64,th);
2447                   emit_not(th,th);
2448                 }
2449               }
2450               else
2451               if(rs2[i]){
2452                 if(s2l>=0) emit_not(s2l,tl);
2453                 else{
2454                   emit_loadreg(rs2[i],tl);
2455                   emit_not(tl,tl);
2456                 }
2457                 if(s2h>=0) emit_not(s2h,th);
2458                 else{
2459                   emit_loadreg(rs2[i]|64,th);
2460                   emit_not(th,th);
2461                 }
2462               }
2463               else {
2464                 emit_movimm(-1,tl);
2465                 emit_movimm(-1,th);
2466               }
2467             }
2468           }
2469         }
2470       }
2471       else
2472       {
2473         // 32 bit
2474         if(tl>=0) {
2475           s1l=get_reg(i_regs->regmap,rs1[i]);
2476           s2l=get_reg(i_regs->regmap,rs2[i]);
2477           if(rs1[i]&&rs2[i]) {
2478             assert(s1l>=0);
2479             assert(s2l>=0);
2480             if(opcode2[i]==0x24) { // AND
2481               emit_and(s1l,s2l,tl);
2482             } else
2483             if(opcode2[i]==0x25) { // OR
2484               emit_or(s1l,s2l,tl);
2485             } else
2486             if(opcode2[i]==0x26) { // XOR
2487               emit_xor(s1l,s2l,tl);
2488             } else
2489             if(opcode2[i]==0x27) { // NOR
2490               emit_or(s1l,s2l,tl);
2491               emit_not(tl,tl);
2492             }
2493           }
2494           else
2495           {
2496             if(opcode2[i]==0x24) { // AND
2497               emit_zeroreg(tl);
2498             } else
2499             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2500               if(rs1[i]){
2501                 if(s1l>=0) emit_mov(s1l,tl);
2502                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2503               }
2504               else
2505               if(rs2[i]){
2506                 if(s2l>=0) emit_mov(s2l,tl);
2507                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2508               }
2509               else emit_zeroreg(tl);
2510             } else
2511             if(opcode2[i]==0x27) { // NOR
2512               if(rs1[i]){
2513                 if(s1l>=0) emit_not(s1l,tl);
2514                 else {
2515                   emit_loadreg(rs1[i],tl);
2516                   emit_not(tl,tl);
2517                 }
2518               }
2519               else
2520               if(rs2[i]){
2521                 if(s2l>=0) emit_not(s2l,tl);
2522                 else {
2523                   emit_loadreg(rs2[i],tl);
2524                   emit_not(tl,tl);
2525                 }
2526               }
2527               else emit_movimm(-1,tl);
2528             }
2529           }
2530         }
2531       }
2532     }
2533   }
2534 }
2535
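// Code generation for the immediate-format ops allocated by imm16_alloc above.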
2536 void imm16_assemble(int i,struct regstat *i_regs)
2537 {
2538   if (opcode[i]==0x0f) { // LUI
2539     if(rt1[i]) {
2540       signed char t;
2541       t=get_reg(i_regs->regmap,rt1[i]);
2542       //assert(t>=0);
2543       if(t>=0) {
2544         if(!((i_regs->isconst>>t)&1))
2545           emit_movimm(imm[i]<<16,t);
2546       }
2547     }
2548   }
2549   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2550     if(rt1[i]) {
2551       signed char s,t;
2552       t=get_reg(i_regs->regmap,rt1[i]);
2553       s=get_reg(i_regs->regmap,rs1[i]);
2554       if(rs1[i]) {
2555         //assert(t>=0);
2556         //assert(s>=0);
2557         if(t>=0) {
2558           if(!((i_regs->isconst>>t)&1)) {
2559             if(s<0) {
2560               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2561               emit_addimm(t,imm[i],t);
2562             }else{
2563               if(!((i_regs->wasconst>>s)&1))
2564                 emit_addimm(s,imm[i],t);
2565               else
2566                 emit_movimm(constmap[i][s]+imm[i],t);
2567             }
2568           }
2569         }
2570       } else {
2571         if(t>=0) {
2572           if(!((i_regs->isconst>>t)&1))
2573             emit_movimm(imm[i],t);
2574         }
2575       }
2576     }
2577   }
2578   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2579     if(rt1[i]) {
2580       signed char sh,sl,th,tl;
2581       th=get_reg(i_regs->regmap,rt1[i]|64);
2582       tl=get_reg(i_regs->regmap,rt1[i]);
2583       sh=get_reg(i_regs->regmap,rs1[i]|64);
2584       sl=get_reg(i_regs->regmap,rs1[i]);
2585       if(tl>=0) {
2586         if(rs1[i]) {
2587           assert(sh>=0);
2588           assert(sl>=0);
2589           if(th>=0) {
2590             emit_addimm64_32(sh,sl,imm[i],th,tl);
2591           }
2592           else {
2593             emit_addimm(sl,imm[i],tl);
2594           }
2595         } else {
2596           emit_movimm(imm[i],tl);
2597           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2598         }
2599       }
2600     }
2601   }
2602   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2603     if(rt1[i]) {
2604       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2605       signed char sh,sl,t;
2606       t=get_reg(i_regs->regmap,rt1[i]);
2607       sh=get_reg(i_regs->regmap,rs1[i]|64);
2608       sl=get_reg(i_regs->regmap,rs1[i]);
2609       //assert(t>=0);
2610       if(t>=0) {
2611         if(rs1[i]>0) {
2612           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2613           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2614             if(opcode[i]==0x0a) { // SLTI
2615               if(sl<0) {
2616                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2617                 emit_slti32(t,imm[i],t);
2618               }else{
2619                 emit_slti32(sl,imm[i],t);
2620               }
2621             }
2622             else { // SLTIU
2623               if(sl<0) {
2624                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2625                 emit_sltiu32(t,imm[i],t);
2626               }else{
2627                 emit_sltiu32(sl,imm[i],t);
2628               }
2629             }
2630           }else{ // 64-bit
2631             assert(sl>=0);
2632             if(opcode[i]==0x0a) // SLTI
2633               emit_slti64_32(sh,sl,imm[i],t);
2634             else // SLTIU
2635               emit_sltiu64_32(sh,sl,imm[i],t);
2636           }
2637         }else{
2638           // SLTI(U) with r0 is just stupid,
2639           // nonetheless examples can be found
2640           if(opcode[i]==0x0a) { // SLTI
2641             if(0<imm[i]) emit_movimm(1,t);
2642             else emit_zeroreg(t);
2643           } else // SLTIU
2644           {
2645             if(imm[i]) emit_movimm(1,t);
2646             else emit_zeroreg(t);
2647           }
2648         }
2649       }
2650     }
2651   }
2652   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2653     if(rt1[i]) {
2654       signed char sh,sl,th,tl;
2655       th=get_reg(i_regs->regmap,rt1[i]|64);
2656       tl=get_reg(i_regs->regmap,rt1[i]);
2657       sh=get_reg(i_regs->regmap,rs1[i]|64);
2658       sl=get_reg(i_regs->regmap,rs1[i]);
2659       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2660         if(opcode[i]==0x0c) //ANDI
2661         {
2662           if(rs1[i]) {
2663             if(sl<0) {
2664               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2665               emit_andimm(tl,imm[i],tl);
2666             }else{
2667               if(!((i_regs->wasconst>>sl)&1))
2668                 emit_andimm(sl,imm[i],tl);
2669               else
2670                 emit_movimm(constmap[i][sl]&imm[i],tl);
2671             }
2672           }
2673           else
2674             emit_zeroreg(tl);
2675           if(th>=0) emit_zeroreg(th);
2676         }
2677         else
2678         {
2679           if(rs1[i]) {
2680             if(sl<0) {
2681               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2682             }
2683             if(th>=0) {
2684               if(sh<0) {
2685                 emit_loadreg(rs1[i]|64,th);
2686               }else{
2687                 emit_mov(sh,th);
2688               }
2689             }
2690             if(opcode[i]==0x0d) //ORI
2691             if(sl<0) {
2692               emit_orimm(tl,imm[i],tl);
2693             }else{
2694               if(!((i_regs->wasconst>>sl)&1))
2695                 emit_orimm(sl,imm[i],tl);
2696               else
2697                 emit_movimm(constmap[i][sl]|imm[i],tl);
2698             }
2699             if(opcode[i]==0x0e) //XORI
2700             if(sl<0) {
2701               emit_xorimm(tl,imm[i],tl);
2702             }else{
2703               if(!((i_regs->wasconst>>sl)&1))
2704                 emit_xorimm(sl,imm[i],tl);
2705               else
2706                 emit_movimm(constmap[i][sl]^imm[i],tl);
2707             }
2708           }
2709           else {
2710             emit_movimm(imm[i],tl);
2711             if(th>=0) emit_zeroreg(th);
2712           }
2713         }
2714       }
2715     }
2716   }
2717 }
2718
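// Code generation for shift-by-immediate ops, including the
// 64-bit DSLL/DSRL/DSRA and DSLL32/DSRL32/DSRA32 forms.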
2719 void shiftimm_assemble(int i,struct regstat *i_regs)
2720 {
2721   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2722   {
2723     if(rt1[i]) {
2724       signed char s,t;
2725       t=get_reg(i_regs->regmap,rt1[i]);
2726       s=get_reg(i_regs->regmap,rs1[i]);
2727       //assert(t>=0);
2728       if(t>=0&&!((i_regs->isconst>>t)&1)){
2729         if(rs1[i]==0)
2730         {
2731           emit_zeroreg(t);
2732         }
2733         else
2734         {
2735           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2736           if(imm[i]) {
2737             if(opcode2[i]==0) // SLL
2738             {
2739               emit_shlimm(s<0?t:s,imm[i],t);
2740             }
2741             if(opcode2[i]==2) // SRL
2742             {
2743               emit_shrimm(s<0?t:s,imm[i],t);
2744             }
2745             if(opcode2[i]==3) // SRA
2746             {
2747               emit_sarimm(s<0?t:s,imm[i],t);
2748             }
2749           }else{
2750             // Shift by zero
2751             if(s>=0 && s!=t) emit_mov(s,t);
2752           }
2753         }
2754       }
2755       //emit_storereg(rt1[i],t); //DEBUG
2756     }
2757   }
2758   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2759   {
2760     if(rt1[i]) {
2761       signed char sh,sl,th,tl;
2762       th=get_reg(i_regs->regmap,rt1[i]|64);
2763       tl=get_reg(i_regs->regmap,rt1[i]);
2764       sh=get_reg(i_regs->regmap,rs1[i]|64);
2765       sl=get_reg(i_regs->regmap,rs1[i]);
2766       if(tl>=0) {
2767         if(rs1[i]==0)
2768         {
2769           emit_zeroreg(tl);
2770           if(th>=0) emit_zeroreg(th);
2771         }
2772         else
2773         {
2774           assert(sl>=0);
2775           assert(sh>=0);
2776           if(imm[i]) {
2777             if(opcode2[i]==0x38) // DSLL
2778             {
2779               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2780               emit_shlimm(sl,imm[i],tl);
2781             }
2782             if(opcode2[i]==0x3a) // DSRL
2783             {
2784               emit_shrdimm(sl,sh,imm[i],tl);
2785               if(th>=0) emit_shrimm(sh,imm[i],th);
2786             }
2787             if(opcode2[i]==0x3b) // DSRA
2788             {
2789               emit_shrdimm(sl,sh,imm[i],tl);
2790               if(th>=0) emit_sarimm(sh,imm[i],th);
2791             }
2792           }else{
2793             // Shift by zero
2794             if(sl!=tl) emit_mov(sl,tl);
2795             if(th>=0&&sh!=th) emit_mov(sh,th);
2796           }
2797         }
2798       }
2799     }
2800   }
2801   if(opcode2[i]==0x3c) // DSLL32
2802   {
2803     if(rt1[i]) {
2804       signed char sl,tl,th;
2805       tl=get_reg(i_regs->regmap,rt1[i]);
2806       th=get_reg(i_regs->regmap,rt1[i]|64);
2807       sl=get_reg(i_regs->regmap,rs1[i]);
2808       if(th>=0||tl>=0){
2809         assert(tl>=0);
2810         assert(th>=0);
2811         assert(sl>=0);
2812         emit_mov(sl,th);
2813         emit_zeroreg(tl);
2814         if(imm[i]>32)
2815         {
2816           emit_shlimm(th,imm[i]&31,th);
2817         }
2818       }
2819     }
2820   }
2821   if(opcode2[i]==0x3e) // DSRL32
2822   {
2823     if(rt1[i]) {
2824       signed char sh,tl,th;
2825       tl=get_reg(i_regs->regmap,rt1[i]);
2826       th=get_reg(i_regs->regmap,rt1[i]|64);
2827       sh=get_reg(i_regs->regmap,rs1[i]|64);
2828       if(tl>=0){
2829         assert(sh>=0);
2830         emit_mov(sh,tl);
2831         if(th>=0) emit_zeroreg(th);
2832         if(imm[i]>32)
2833         {
2834           emit_shrimm(tl,imm[i]&31,tl);
2835         }
2836       }
2837     }
2838   }
2839   if(opcode2[i]==0x3f) // DSRA32
2840   {
2841     if(rt1[i]) {
2842       signed char sh,tl;
2843       tl=get_reg(i_regs->regmap,rt1[i]);
2844       sh=get_reg(i_regs->regmap,rs1[i]|64);
2845       if(tl>=0){
2846         assert(sh>=0);
2847         emit_mov(sh,tl);
2848         if(imm[i]>32)
2849         {
2850           emit_sarimm(tl,imm[i]&31,tl);
2851         }
2852       }
2853     }
2854   }
2855 }
2856
2857 #ifndef shift_assemble
2858 void shift_assemble(int i,struct regstat *i_regs)
2859 {
2860   printf("Need shift_assemble for this architecture.\n");
2861   exit(1);
2862 }
2863 #endif
2864
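// Code generation for loads: an inline fast path for RAM plus a stub
// (or inlined slow call) for addresses that may hit I/O or invalid memory.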
2865 void load_assemble(int i,struct regstat *i_regs)
2866 {
2867   int s,th,tl,addr,map=-1;
2868   int offset;
2869   int jaddr=0;
2870   int memtarget=0,c=0;
2871   int fastload_reg_override=0;
2872   u_int hr,reglist=0;
2873   th=get_reg(i_regs->regmap,rt1[i]|64);
2874   tl=get_reg(i_regs->regmap,rt1[i]);
2875   s=get_reg(i_regs->regmap,rs1[i]);
2876   offset=imm[i];
2877   for(hr=0;hr<HOST_REGS;hr++) {
2878     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2879   }
2880   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2881   if(s>=0) {
2882     c=(i_regs->wasconst>>s)&1;
2883     if (c) {
2884       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2885       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2886     }
2887   }
2888   //printf("load_assemble: c=%d\n",c);
2889   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2890   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2891 #ifdef PCSX
2892   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2893     ||rt1[i]==0) {
2894       // could be a hardware FIFO, so the read must still be performed
2895       // || dummy read (target is r0, result discarded)
2896       assem_debug("(forced read)\n");
2897       tl=get_reg(i_regs->regmap,-1);
2898       assert(tl>=0);
2899   }
2900 #endif
2901   if(offset||s<0||c) addr=tl;
2902   else addr=s;
2903   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2904  if(tl>=0) {
2905   //printf("load_assemble: c=%d\n",c);
2906   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2907   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2908   reglist&=~(1<<tl);
2909   if(th>=0) reglist&=~(1<<th);
2910   if(!using_tlb) {
2911     if(!c) {
2912       #ifdef RAM_OFFSET
2913       map=get_reg(i_regs->regmap,ROREG);
2914       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2915       #endif
2916 //#define R29_HACK 1
2917       #ifdef R29_HACK
2918       // Strmnnrmn's speed hack
2919       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2920       #endif
2921       {
2922         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2923       }
2924     }
2925     else if(ram_offset&&memtarget) {
2926       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2927       fastload_reg_override=HOST_TEMPREG;
2928     }
2929   }else{ // using tlb
2930     int x=0;
2931     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2932     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2933     map=get_reg(i_regs->regmap,TLREG);
2934     assert(map>=0);
2935     reglist&=~(1<<map);
2936     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2937     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2938   }
2939   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2940   if (opcode[i]==0x20) { // LB
2941     if(!c||memtarget) {
2942       if(!dummy) {
2943         #ifdef HOST_IMM_ADDR32
2944         if(c)
2945           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2946         else
2947         #endif
2948         {
2949           //emit_xorimm(addr,3,tl);
2950           //gen_tlb_addr_r(tl,map);
2951           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2952           int x=0,a=tl;
2953 #ifdef BIG_ENDIAN_MIPS
2954           if(!c) emit_xorimm(addr,3,tl);
2955           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2956 #else
2957           if(!c) a=addr;
2958 #endif
2959           if(fastload_reg_override) a=fastload_reg_override;
2960
2961           emit_movsbl_indexed_tlb(x,a,map,tl);
2962         }
2963       }
2964       if(jaddr)
2965         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2966     }
2967     else
2968       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2969   }
2970   if (opcode[i]==0x21) { // LH
2971     if(!c||memtarget) {
2972       if(!dummy) {
2973         #ifdef HOST_IMM_ADDR32
2974         if(c)
2975           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2976         else
2977         #endif
2978         {
2979           int x=0,a=tl;
2980 #ifdef BIG_ENDIAN_MIPS
2981           if(!c) emit_xorimm(addr,2,tl);
2982           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2983 #else
2984           if(!c) a=addr;
2985 #endif
2986           if(fastload_reg_override) a=fastload_reg_override;
2987           //#ifdef
2988           //emit_movswl_indexed_tlb(x,tl,map,tl);
2989           //else
2990           if(map>=0) {
2991             gen_tlb_addr_r(a,map);
2992             emit_movswl_indexed(x,a,tl);
2993           }else{
2994             #if 1 //def RAM_OFFSET
2995             emit_movswl_indexed(x,a,tl);
2996             #else
2997             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2998             #endif
2999           }
3000         }
3001       }
3002       if(jaddr)
3003         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3004     }
3005     else
3006       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3007   }
3008   if (opcode[i]==0x23) { // LW
3009     if(!c||memtarget) {
3010       if(!dummy) {
3011         int a=addr;
3012         if(fastload_reg_override) a=fastload_reg_override;
3013         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3014         #ifdef HOST_IMM_ADDR32
3015         if(c)
3016           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3017         else
3018         #endif
3019         emit_readword_indexed_tlb(0,a,map,tl);
3020       }
3021       if(jaddr)
3022         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3023     }
3024     else
3025       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3026   }
3027   if (opcode[i]==0x24) { // LBU
3028     if(!c||memtarget) {
3029       if(!dummy) {
3030         #ifdef HOST_IMM_ADDR32
3031         if(c)
3032           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
3033         else
3034         #endif
3035         {
3036           //emit_xorimm(addr,3,tl);
3037           //gen_tlb_addr_r(tl,map);
3038           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
3039           int x=0,a=tl;
3040 #ifdef BIG_ENDIAN_MIPS
3041           if(!c) emit_xorimm(addr,3,tl);
3042           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3043 #else
3044           if(!c) a=addr;
3045 #endif
3046           if(fastload_reg_override) a=fastload_reg_override;
3047
3048           emit_movzbl_indexed_tlb(x,a,map,tl);
3049         }
3050       }
3051       if(jaddr)
3052         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3053     }
3054     else
3055       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3056   }
3057   if (opcode[i]==0x25) { // LHU
3058     if(!c||memtarget) {
3059       if(!dummy) {
3060         #ifdef HOST_IMM_ADDR32
3061         if(c)
3062           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3063         else
3064         #endif
3065         {
3066           int x=0,a=tl;
3067 #ifdef BIG_ENDIAN_MIPS
3068           if(!c) emit_xorimm(addr,2,tl);
3069           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3070 #else
3071           if(!c) a=addr;
3072 #endif
3073           if(fastload_reg_override) a=fastload_reg_override;
3074           //#ifdef
3075           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3076           //#else
3077           if(map>=0) {
3078             gen_tlb_addr_r(a,map);
3079             emit_movzwl_indexed(x,a,tl);
3080           }else{
3081             #if 1 //def RAM_OFFSET
3082             emit_movzwl_indexed(x,a,tl);
3083             #else
3084             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3085             #endif
3086           }
3087         }
3088       }
3089       if(jaddr)
3090         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3091     }
3092     else
3093       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3094   }
3095   if (opcode[i]==0x27) { // LWU
3096     assert(th>=0);
3097     if(!c||memtarget) {
3098       if(!dummy) {
3099         int a=addr;
3100         if(fastload_reg_override) a=fastload_reg_override;
3101         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3102         #ifdef HOST_IMM_ADDR32
3103         if(c)
3104           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3105         else
3106         #endif
3107         emit_readword_indexed_tlb(0,a,map,tl);
3108       }
3109       if(jaddr)
3110         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3111     }
3112     else {
3113       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3114     }
3115     emit_zeroreg(th);
3116   }
3117   if (opcode[i]==0x37) { // LD
3118     if(!c||memtarget) {
3119       if(!dummy) {
3120         int a=addr;
3121         if(fastload_reg_override) a=fastload_reg_override;
3122         //gen_tlb_addr_r(tl,map);
3123         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3124         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3125         #ifdef HOST_IMM_ADDR32
3126         if(c)
3127           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3128         else
3129         #endif
3130         emit_readdword_indexed_tlb(0,a,map,th,tl);
3131       }
3132       if(jaddr)
3133         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3134     }
3135     else
3136       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3137   }
3138  }
3139   //emit_storereg(rt1[i],tl); // DEBUG
3140   //if(opcode[i]==0x23)
3141   //if(opcode[i]==0x24)
3142   //if(opcode[i]==0x23||opcode[i]==0x24)
3143   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3144   {
3145     //emit_pusha();
3146     save_regs(0x100f);
3147         emit_readword((int)&last_count,ECX);
3148         #ifdef __i386__
3149         if(get_reg(i_regs->regmap,CCREG)<0)
3150           emit_loadreg(CCREG,HOST_CCREG);
3151         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3152         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3153         emit_writeword(HOST_CCREG,(int)&Count);
3154         #endif
3155         #ifdef __arm__
3156         if(get_reg(i_regs->regmap,CCREG)<0)
3157           emit_loadreg(CCREG,0);
3158         else
3159           emit_mov(HOST_CCREG,0);
3160         emit_add(0,ECX,0);
3161         emit_addimm(0,2*ccadj[i],0);
3162         emit_writeword(0,(int)&Count);
3163         #endif
3164     emit_call((int)memdebug);
3165     //emit_popa();
3166     restore_regs(0x100f);
3167   }/**/
3168 }
3169
3170 #ifndef loadlr_assemble
3171 void loadlr_assemble(int i,struct regstat *i_regs)
3172 {
3173   printf("Need loadlr_assemble for this architecture.\n");
3174   exit(1);
3175 }
3176 #endif
3177
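// Code generation for aligned stores; also checks invalid_code so that
// writes into already-translated code invalidate the affected blocks.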
3178 void store_assemble(int i,struct regstat *i_regs)
3179 {
3180   int s,th,tl,map=-1;
3181   int addr,temp;
3182   int offset;
3183   int jaddr=0,jaddr2,type;
3184   int memtarget=0,c=0;
3185   int agr=AGEN1+(i&1);
3186   int faststore_reg_override=0;
3187   u_int hr,reglist=0;
3188   th=get_reg(i_regs->regmap,rs2[i]|64);
3189   tl=get_reg(i_regs->regmap,rs2[i]);
3190   s=get_reg(i_regs->regmap,rs1[i]);
3191   temp=get_reg(i_regs->regmap,agr);
3192   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3193   offset=imm[i];
3194   if(s>=0) {
3195     c=(i_regs->wasconst>>s)&1;
3196     if(c) {
3197       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3198       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3199     }
3200   }
3201   assert(tl>=0);
3202   assert(temp>=0);
3203   for(hr=0;hr<HOST_REGS;hr++) {
3204     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3205   }
3206   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3207   if(offset||s<0||c) addr=temp;
3208   else addr=s;
3209   if(!using_tlb) {
3210     if(!c) {
3211       #ifndef PCSX
3212       #ifdef R29_HACK
3213       // Strmnnrmn's speed hack
3214       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3215       #endif
3216       emit_cmpimm(addr,RAM_SIZE);
3217       #ifdef DESTRUCTIVE_SHIFT
3218       if(s==addr) emit_mov(s,temp);
3219       #endif
3220       #ifdef R29_HACK
3221       memtarget=1;
3222       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3223       #endif
3224       {
3225         jaddr=(int)out;
3226         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3227         // Hint to branch predictor that the branch is unlikely to be taken
3228         if(rs1[i]>=28)
3229           emit_jno_unlikely(0);
3230         else
3231         #endif
3232         emit_jno(0);
3233       }
3234       #else
3235         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3236       #endif
3237     }
3238     else if(ram_offset&&memtarget) {
3239       emit_addimm(addr,ram_offset,HOST_TEMPREG);
3240       faststore_reg_override=HOST_TEMPREG;
3241     }
3242   }else{ // using tlb
3243     int x=0;
3244     if (opcode[i]==0x28) x=3; // SB
3245     if (opcode[i]==0x29) x=2; // SH
3246     map=get_reg(i_regs->regmap,TLREG);
3247     assert(map>=0);
3248     reglist&=~(1<<map);
3249     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3250     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3251   }
3252
3253   if (opcode[i]==0x28) { // SB
3254     if(!c||memtarget) {
3255       int x=0,a=temp;
3256 #ifdef BIG_ENDIAN_MIPS
3257       if(!c) emit_xorimm(addr,3,temp);
3258       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3259 #else
3260       if(!c) a=addr;
3261 #endif
3262       if(faststore_reg_override) a=faststore_reg_override;
3263       //gen_tlb_addr_w(temp,map);
3264       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3265       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3266     }
3267     type=STOREB_STUB;
3268   }
3269   if (opcode[i]==0x29) { // SH
3270     if(!c||memtarget) {
3271       int x=0,a=temp;
3272 #ifdef BIG_ENDIAN_MIPS
3273       if(!c) emit_xorimm(addr,2,temp);
3274       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3275 #else
3276       if(!c) a=addr;
3277 #endif
3278       if(faststore_reg_override) a=faststore_reg_override;
3279       //#ifdef
3280       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3281       //#else
3282       if(map>=0) {
3283         gen_tlb_addr_w(a,map);
3284         emit_writehword_indexed(tl,x,a);
3285       }else
3286         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3287         emit_writehword_indexed(tl,x,a);
3288     }
3289     type=STOREH_STUB;
3290   }
3291   if (opcode[i]==0x2B) { // SW
3292     if(!c||memtarget) {
3293       int a=addr;
3294       if(faststore_reg_override) a=faststore_reg_override;
3295       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3296       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3297     }
3298     type=STOREW_STUB;
3299   }
3300   if (opcode[i]==0x3F) { // SD
3301     if(!c||memtarget) {
3302       int a=addr;
3303       if(faststore_reg_override) a=faststore_reg_override;
3304       if(rs2[i]) {
3305         assert(th>=0);
3306         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3307         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3308         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3309       }else{
3310         // Store zero
3311         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3312         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3313         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3314       }
3315     }
3316     type=STORED_STUB;
3317   }
3318 #ifdef PCSX
3319   if(jaddr) {
3320     // PCSX store handlers don't check invcode again
3321     reglist|=1<<addr;
3322     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3323     jaddr=0;
3324   }
3325 #endif
3326   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3327     if(!c||memtarget) {
3328       #ifdef DESTRUCTIVE_SHIFT
3329       // The x86 shift operation is 'destructive'; it overwrites the
3330       // source register, so we need to make a copy first and use that.
3331       addr=temp;
3332       #endif
3333       #if defined(HOST_IMM8)
3334       int ir=get_reg(i_regs->regmap,INVCP);
3335       assert(ir>=0);
3336       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3337       #else
3338       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3339       #endif
3340       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3341       emit_callne(invalidate_addr_reg[addr]);
3342       #else
3343       jaddr2=(int)out;
3344       emit_jne(0);
3345       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3346       #endif
3347     }
3348   }
3349   u_int addr_val=constmap[i][s]+offset;
3350   if(jaddr) {
3351     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3352   } else if(c&&!memtarget) {
3353     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3354   }
3355   // Basic detection of writes that modify the current block;
3356   // we don't look back, as code behind the pc should already be in the MIPS i-cache.
3357   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3358     printf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3359     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3360     if(i_regs->regmap==regs[i].regmap) {
3361       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3362       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3363       emit_movimm(start+i*4+4,0);
3364       emit_writeword(0,(int)&pcaddr);
3365       emit_jmp((int)do_interrupt);
3366     }
3367   }
3368   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3369   //if(opcode[i]==0x2B || opcode[i]==0x28)
3370   //if(opcode[i]==0x2B || opcode[i]==0x29)
3371   //if(opcode[i]==0x2B)
3372   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3373   {
3374     #ifdef __i386__
3375     emit_pusha();
3376     #endif
3377     #ifdef __arm__
3378     save_regs(0x100f);
3379     #endif
3380         emit_readword((int)&last_count,ECX);
3381         #ifdef __i386__
3382         if(get_reg(i_regs->regmap,CCREG)<0)
3383           emit_loadreg(CCREG,HOST_CCREG);
3384         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3385         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3386         emit_writeword(HOST_CCREG,(int)&Count);
3387         #endif
3388         #ifdef __arm__
3389         if(get_reg(i_regs->regmap,CCREG)<0)
3390           emit_loadreg(CCREG,0);
3391         else
3392           emit_mov(HOST_CCREG,0);
3393         emit_add(0,ECX,0);
3394         emit_addimm(0,2*ccadj[i],0);
3395         emit_writeword(0,(int)&Count);
3396         #endif
3397     emit_call((int)memdebug);
3398     #ifdef __i386__
3399     emit_popa();
3400     #endif
3401     #ifdef __arm__
3402     restore_regs(0x100f);
3403     #endif
3404   }/**/
3405 }
3406
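// storelr_assemble emits the unaligned stores SWL/SWR (and SDL/SDR, left over
// from the 64-bit core). Instead of calling out to a handler it branches on
// the low two address bits and writes exactly the byte/halfword/word pieces
// each alignment requires; for SDL/SDR the second word is built in temp2 and
// stored after the alignment cases merge.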
3407 void storelr_assemble(int i,struct regstat *i_regs)
3408 {
3409   int s,th,tl;
3410   int temp;
3411   int temp2;
3412   int offset;
3413   int jaddr=0,jaddr2;
3414   int case1,case2,case3;
3415   int done0,done1,done2;
3416   int memtarget=0,c=0;
3417   int agr=AGEN1+(i&1);
3418   u_int hr,reglist=0;
3419   th=get_reg(i_regs->regmap,rs2[i]|64);
3420   tl=get_reg(i_regs->regmap,rs2[i]);
3421   s=get_reg(i_regs->regmap,rs1[i]);
3422   temp=get_reg(i_regs->regmap,agr);
3423   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3424   offset=imm[i];
3425   if(s>=0) {
3426     c=(i_regs->isconst>>s)&1;
3427     if(c) {
3428       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3429       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3430     }
3431   }
3432   assert(tl>=0);
3433   for(hr=0;hr<HOST_REGS;hr++) {
3434     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3435   }
3436   assert(temp>=0);
3437   if(!using_tlb) {
3438     if(!c) {
3439       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3440       if(!offset&&s!=temp) emit_mov(s,temp);
3441       jaddr=(int)out;
3442       emit_jno(0);
3443     }
3444     else
3445     {
3446       if(!memtarget||!rs1[i]) {
3447         jaddr=(int)out;
3448         emit_jmp(0);
3449       }
3450     }
3451     #ifdef RAM_OFFSET
3452     int map=get_reg(i_regs->regmap,ROREG);
3453     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3454     gen_tlb_addr_w(temp,map);
3455     #else
3456     if((u_int)rdram!=0x80000000) 
3457       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3458     #endif
3459   }else{ // using tlb
3460     int map=get_reg(i_regs->regmap,TLREG);
3461     assert(map>=0);
3462     reglist&=~(1<<map);
3463     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3464     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3465     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3466     if(!jaddr&&!memtarget) {
3467       jaddr=(int)out;
3468       emit_jmp(0);
3469     }
3470     gen_tlb_addr_w(temp,map);
3471   }
3472
3473   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3474     temp2=get_reg(i_regs->regmap,FTEMP);
3475     if(!rs2[i]) temp2=th=tl;
3476   }
3477
3478 #ifndef BIG_ENDIAN_MIPS
3479     emit_xorimm(temp,3,temp);
3480 #endif
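  // Note: when the emulated CPU is little-endian (BIG_ENDIAN_MIPS undefined,
  // as for the PSX), the XOR above flips the low address bits so that the
  // big-endian-oriented cases below pick the correct bytes; after the flip
  // temp&3 equals 3-(addr&3).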
3481   emit_testimm(temp,2);
3482   case2=(int)out;
3483   emit_jne(0);
3484   emit_testimm(temp,1);
3485   case1=(int)out;
3486   emit_jne(0);
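  // Rough sketch of the semantics being implemented (little-endian view):
  // SWL with addr&3==0 stores only the register's most significant byte at
  // addr, and with addr&3==3 stores the whole word at the aligned base;
  // SWR mirrors this (whole word at addr&3==0, only the LSB at addr&3==3).
  // The four cases below are selected by temp&3 after the dispatch above.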
3487   // 0
3488   if (opcode[i]==0x2A) { // SWL
3489     emit_writeword_indexed(tl,0,temp);
3490   }
3491   if (opcode[i]==0x2E) { // SWR
3492     emit_writebyte_indexed(tl,3,temp);
3493   }
3494   if (opcode[i]==0x2C) { // SDL
3495     emit_writeword_indexed(th,0,temp);
3496     if(rs2[i]) emit_mov(tl,temp2);
3497   }
3498   if (opcode[i]==0x2D) { // SDR
3499     emit_writebyte_indexed(tl,3,temp);
3500     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3501   }
3502   done0=(int)out;
3503   emit_jmp(0);
3504   // 1
3505   set_jump_target(case1,(int)out);
3506   if (opcode[i]==0x2A) { // SWL
3507     // Write 3 msb into three least significant bytes
3508     if(rs2[i]) emit_rorimm(tl,8,tl);
3509     emit_writehword_indexed(tl,-1,temp);
3510     if(rs2[i]) emit_rorimm(tl,16,tl);
3511     emit_writebyte_indexed(tl,1,temp);
3512     if(rs2[i]) emit_rorimm(tl,8,tl);
3513   }
3514   if (opcode[i]==0x2E) { // SWR
3515     // Write two lsb into two most significant bytes
3516     emit_writehword_indexed(tl,1,temp);
3517   }
3518   if (opcode[i]==0x2C) { // SDL
3519     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3520     // Write 3 msb into three least significant bytes
3521     if(rs2[i]) emit_rorimm(th,8,th);
3522     emit_writehword_indexed(th,-1,temp);
3523     if(rs2[i]) emit_rorimm(th,16,th);
3524     emit_writebyte_indexed(th,1,temp);
3525     if(rs2[i]) emit_rorimm(th,8,th);
3526   }
3527   if (opcode[i]==0x2D) { // SDR
3528     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3529     // Write two lsb into two most significant bytes
3530     emit_writehword_indexed(tl,1,temp);
3531   }
3532   done1=(int)out;
3533   emit_jmp(0);
3534   // 2
3535   set_jump_target(case2,(int)out);
3536   emit_testimm(temp,1);
3537   case3=(int)out;
3538   emit_jne(0);
3539   if (opcode[i]==0x2A) { // SWL
3540     // Write two msb into two least significant bytes
3541     if(rs2[i]) emit_rorimm(tl,16,tl);
3542     emit_writehword_indexed(tl,-2,temp);
3543     if(rs2[i]) emit_rorimm(tl,16,tl);
3544   }
3545   if (opcode[i]==0x2E) { // SWR
3546     // Write 3 lsb into three most significant bytes
3547     emit_writebyte_indexed(tl,-1,temp);
3548     if(rs2[i]) emit_rorimm(tl,8,tl);
3549     emit_writehword_indexed(tl,0,temp);
3550     if(rs2[i]) emit_rorimm(tl,24,tl);
3551   }
3552   if (opcode[i]==0x2C) { // SDL
3553     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3554     // Write two msb into two least significant bytes
3555     if(rs2[i]) emit_rorimm(th,16,th);
3556     emit_writehword_indexed(th,-2,temp);
3557     if(rs2[i]) emit_rorimm(th,16,th);
3558   }
3559   if (opcode[i]==0x2D) { // SDR
3560     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3561     // Write 3 lsb into three most significant bytes
3562     emit_writebyte_indexed(tl,-1,temp);
3563     if(rs2[i]) emit_rorimm(tl,8,tl);
3564     emit_writehword_indexed(tl,0,temp);
3565     if(rs2[i]) emit_rorimm(tl,24,tl);
3566   }
3567   done2=(int)out;
3568   emit_jmp(0);
3569   // 3
3570   set_jump_target(case3,(int)out);
3571   if (opcode[i]==0x2A) { // SWL
3572     // Write msb into least significant byte
3573     if(rs2[i]) emit_rorimm(tl,24,tl);
3574     emit_writebyte_indexed(tl,-3,temp);
3575     if(rs2[i]) emit_rorimm(tl,8,tl);
3576   }
3577   if (opcode[i]==0x2E) { // SWR
3578     // Write entire word
3579     emit_writeword_indexed(tl,-3,temp);
3580   }
3581   if (opcode[i]==0x2C) { // SDL
3582     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3583     // Write msb into least significant byte
3584     if(rs2[i]) emit_rorimm(th,24,th);
3585     emit_writebyte_indexed(th,-3,temp);
3586     if(rs2[i]) emit_rorimm(th,8,th);
3587   }
3588   if (opcode[i]==0x2D) { // SDR
3589     if(rs2[i]) emit_mov(th,temp2);
3590     // Write entire word
3591     emit_writeword_indexed(tl,-3,temp);
3592   }
3593   set_jump_target(done0,(int)out);
3594   set_jump_target(done1,(int)out);
3595   set_jump_target(done2,(int)out);
3596   if (opcode[i]==0x2C) { // SDL
3597     emit_testimm(temp,4);
3598     done0=(int)out;
3599     emit_jne(0);
3600     emit_andimm(temp,~3,temp);
3601     emit_writeword_indexed(temp2,4,temp);
3602     set_jump_target(done0,(int)out);
3603   }
3604   if (opcode[i]==0x2D) { // SDR
3605     emit_testimm(temp,4);
3606     done0=(int)out;
3607     emit_jeq(0);
3608     emit_andimm(temp,~3,temp);
3609     emit_writeword_indexed(temp2,-4,temp);
3610     set_jump_target(done0,(int)out);
3611   }
3612   if(!c||!memtarget)
3613     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3614   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3615     #ifdef RAM_OFFSET
3616     int map=get_reg(i_regs->regmap,ROREG);
3617     if(map<0) map=HOST_TEMPREG;
3618     gen_orig_addr_w(temp,map);
3619     #else
3620     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3621     #endif
3622     #if defined(HOST_IMM8)
3623     int ir=get_reg(i_regs->regmap,INVCP);
3624     assert(ir>=0);
3625     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3626     #else
3627     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3628     #endif
3629     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3630     emit_callne(invalidate_addr_reg[temp]);
3631     #else
3632     jaddr2=(int)out;
3633     emit_jne(0);
3634     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3635     #endif
3636   }
3637   /*
3638     emit_pusha();
3639     //save_regs(0x100f);
3640         emit_readword((int)&last_count,ECX);
3641         if(get_reg(i_regs->regmap,CCREG)<0)
3642           emit_loadreg(CCREG,HOST_CCREG);
3643         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3644         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3645         emit_writeword(HOST_CCREG,(int)&Count);
3646     emit_call((int)memdebug);
3647     emit_popa();
3648     //restore_regs(0x100f);
3649   /**/
3650 }
3651
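// c1ls_assemble emits the FPU loads/stores LWC1/LDC1/SWC1/SDC1 inherited from
// the N64 dynarec: the FPR address is fetched from reg_cop1_simple or
// reg_cop1_double, a coprocessor-usable check (FP_STUB) is emitted if one has
// not been emitted yet, and the access itself goes through the usual fast
// path / stub machinery. When DISABLE_COP1 is defined (the PSX has no FPU)
// the whole function reduces to cop1_unusable().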
3652 void c1ls_assemble(int i,struct regstat *i_regs)
3653 {
3654 #ifndef DISABLE_COP1
3655   int s,th,tl;
3656   int temp,ar;
3657   int map=-1;
3658   int offset;
3659   int c=0;
3660   int jaddr,jaddr2=0,jaddr3,type;
3661   int agr=AGEN1+(i&1);
3662   u_int hr,reglist=0;
3663   th=get_reg(i_regs->regmap,FTEMP|64);
3664   tl=get_reg(i_regs->regmap,FTEMP);
3665   s=get_reg(i_regs->regmap,rs1[i]);
3666   temp=get_reg(i_regs->regmap,agr);
3667   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3668   offset=imm[i];
3669   assert(tl>=0);
3670   assert(rs1[i]>0);
3671   assert(temp>=0);
3672   for(hr=0;hr<HOST_REGS;hr++) {
3673     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3674   }
3675   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3676   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3677   {
3678     // Loads use a temporary register which we need to save
3679     reglist|=1<<temp;
3680   }
3681   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3682     ar=temp;
3683   else // LWC1/LDC1
3684     ar=tl;
3685   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3686   //else c=(i_regs->wasconst>>s)&1;
3687   if(s>=0) c=(i_regs->wasconst>>s)&1;
3688   // Check that cop1 is usable (CU1 bit set in Status)
3689   if(!cop1_usable) {
3690     signed char rs=get_reg(i_regs->regmap,CSREG);
3691     assert(rs>=0);
3692     emit_testimm(rs,0x20000000);
3693     jaddr=(int)out;
3694     emit_jeq(0);
3695     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3696     cop1_usable=1;
3697   }
3698   if (opcode[i]==0x39) { // SWC1 (get float address)
3699     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3700   }
3701   if (opcode[i]==0x3D) { // SDC1 (get double address)
3702     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3703   }
3704   // Generate address + offset
3705   if(!using_tlb) {
3706     if(!c)
3707       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3708   }
3709   else
3710   {
3711     map=get_reg(i_regs->regmap,TLREG);
3712     assert(map>=0);
3713     reglist&=~(1<<map);
3714     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3715       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3716     }
3717     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3718       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3719     }
3720   }
3721   if (opcode[i]==0x39) { // SWC1 (read float)
3722     emit_readword_indexed(0,tl,tl);
3723   }
3724   if (opcode[i]==0x3D) { // SDC1 (read double)
3725     emit_readword_indexed(4,tl,th);
3726     emit_readword_indexed(0,tl,tl);
3727   }
3728   if (opcode[i]==0x31) { // LWC1 (get target address)
3729     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3730   }
3731   if (opcode[i]==0x35) { // LDC1 (get target address)
3732     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3733   }
3734   if(!using_tlb) {
3735     if(!c) {
3736       jaddr2=(int)out;
3737       emit_jno(0);
3738     }
3739     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3740       jaddr2=(int)out;
3741       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3742     }
3743     #ifdef DESTRUCTIVE_SHIFT
3744     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3745       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3746     }
3747     #endif
3748   }else{
3749     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3750       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3751     }
3752     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3753       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3754     }
3755   }
3756   if (opcode[i]==0x31) { // LWC1
3757     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3758     //gen_tlb_addr_r(ar,map);
3759     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3760     #ifdef HOST_IMM_ADDR32
3761     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3762     else
3763     #endif
3764     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3765     type=LOADW_STUB;
3766   }
3767   if (opcode[i]==0x35) { // LDC1
3768     assert(th>=0);
3769     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3770     //gen_tlb_addr_r(ar,map);
3771     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3772     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3773     #ifdef HOST_IMM_ADDR32
3774     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3775     else
3776     #endif
3777     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3778     type=LOADD_STUB;
3779   }
3780   if (opcode[i]==0x39) { // SWC1
3781     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3782     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3783     type=STOREW_STUB;
3784   }
3785   if (opcode[i]==0x3D) { // SDC1
3786     assert(th>=0);
3787     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3788     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3789     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3790     type=STORED_STUB;
3791   }
3792   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3793     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3794       #ifndef DESTRUCTIVE_SHIFT
3795       temp=offset||c||s<0?ar:s;
3796       #endif
3797       #if defined(HOST_IMM8)
3798       int ir=get_reg(i_regs->regmap,INVCP);
3799       assert(ir>=0);
3800       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3801       #else
3802       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3803       #endif
3804       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3805       emit_callne(invalidate_addr_reg[temp]);
3806       #else
3807       jaddr3=(int)out;
3808       emit_jne(0);
3809       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3810       #endif
3811     }
3812   }
3813   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3814   if (opcode[i]==0x31) { // LWC1 (write float)
3815     emit_writeword_indexed(tl,0,temp);
3816   }
3817   if (opcode[i]==0x35) { // LDC1 (write double)
3818     emit_writeword_indexed(th,4,temp);
3819     emit_writeword_indexed(tl,0,temp);
3820   }
3821   //if(opcode[i]==0x39)
3822   /*if(opcode[i]==0x39||opcode[i]==0x31)
3823   {
3824     emit_pusha();
3825         emit_readword((int)&last_count,ECX);
3826         if(get_reg(i_regs->regmap,CCREG)<0)
3827           emit_loadreg(CCREG,HOST_CCREG);
3828         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3829         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3830         emit_writeword(HOST_CCREG,(int)&Count);
3831     emit_call((int)memdebug);
3832     emit_popa();
3833   }/**/
3834 #else
3835   cop1_unusable(i, i_regs);
3836 #endif
3837 }
3838
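// c2ls_assemble emits LWC2/SWC2, which move one word between memory and a GTE
// (COP2) data register via cop2_get_dreg/cop2_put_dreg. This path is PSX
// specific: the TLB is never used (see the assert), SWC2 gets the usual
// invalid_code check, and constant addresses outside RAM fall back to the
// read/write stubs.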
3839 void c2ls_assemble(int i,struct regstat *i_regs)
3840 {
3841   int s,tl;
3842   int ar;
3843   int offset;
3844   int memtarget=0,c=0;
3845   int jaddr2=0,jaddr3,type;
3846   int agr=AGEN1+(i&1);
3847   int fastio_reg_override=0;
3848   u_int hr,reglist=0;
3849   u_int copr=(source[i]>>16)&0x1f;
3850   s=get_reg(i_regs->regmap,rs1[i]);
3851   tl=get_reg(i_regs->regmap,FTEMP);
3852   offset=imm[i];
3853   assert(rs1[i]>0);
3854   assert(tl>=0);
3855   assert(!using_tlb);
3856
3857   for(hr=0;hr<HOST_REGS;hr++) {
3858     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3859   }
3860   if(i_regs->regmap[HOST_CCREG]==CCREG)
3861     reglist&=~(1<<HOST_CCREG);
3862
3863   // get the address
3864   if (opcode[i]==0x3a) { // SWC2
3865     ar=get_reg(i_regs->regmap,agr);
3866     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3867     reglist|=1<<ar;
3868   } else { // LWC2
3869     ar=tl;
3870   }
3871   if(s>=0) c=(i_regs->wasconst>>s)&1;
3872   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3873   if (!offset&&!c&&s>=0) ar=s;
3874   assert(ar>=0);
3875
3876   if (opcode[i]==0x3a) { // SWC2
3877     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3878     type=STOREW_STUB;
3879   }
3880   else
3881     type=LOADW_STUB;
3882
3883   if(c&&!memtarget) {
3884     jaddr2=(int)out;
3885     emit_jmp(0); // inline_readstub/inline_writestub?
3886   }
3887   else {
3888     if(!c) {
3889       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3890     }
3891     else if(ram_offset&&memtarget) {
3892       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3893       fastio_reg_override=HOST_TEMPREG;
3894     }
3895     if (opcode[i]==0x32) { // LWC2
3896       #ifdef HOST_IMM_ADDR32
3897       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3898       else
3899       #endif
3900       int a=ar;
3901       if(fastio_reg_override) a=fastio_reg_override;
3902       emit_readword_indexed(0,a,tl);
3903     }
3904     if (opcode[i]==0x3a) { // SWC2
3905       #ifdef DESTRUCTIVE_SHIFT
3906       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3907       #endif
3908       int a=ar;
3909       if(fastio_reg_override) a=fastio_reg_override;
3910       emit_writeword_indexed(tl,0,a);
3911     }
3912   }
3913   if(jaddr2)
3914     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3915   if(opcode[i]==0x3a) // SWC2
3916   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3917 #if defined(HOST_IMM8)
3918     int ir=get_reg(i_regs->regmap,INVCP);
3919     assert(ir>=0);
3920     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3921 #else
3922     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3923 #endif
3924     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3925     emit_callne(invalidate_addr_reg[ar]);
3926     #else
3927     jaddr3=(int)out;
3928     emit_jne(0);
3929     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3930     #endif
3931   }
3932   if (opcode[i]==0x32) { // LWC2
3933     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3934   }
3935 }
3936
3937 #ifndef multdiv_assemble
3938 void multdiv_assemble(int i,struct regstat *i_regs)
3939 {
3940   printf("Need multdiv_assemble for this architecture.\n");
3941   exit(1);
3942 }
3943 #endif
3944
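// mov_assemble copies one guest register to another (MFHI/MFLO/MTHI/MTLO are
// handled this way, per the notes below): the low word is moved between host
// registers or reloaded from memory, and the upper half is copied too when
// the destination has an upper-half host register mapped.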
3945 void mov_assemble(int i,struct regstat *i_regs)
3946 {
3947   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3948   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3949   if(rt1[i]) {
3950     signed char sh,sl,th,tl;
3951     th=get_reg(i_regs->regmap,rt1[i]|64);
3952     tl=get_reg(i_regs->regmap,rt1[i]);
3953     //assert(tl>=0);
3954     if(tl>=0) {
3955       sh=get_reg(i_regs->regmap,rs1[i]|64);
3956       sl=get_reg(i_regs->regmap,rs1[i]);
3957       if(sl>=0) emit_mov(sl,tl);
3958       else emit_loadreg(rs1[i],tl);
3959       if(th>=0) {
3960         if(sh>=0) emit_mov(sh,th);
3961         else emit_loadreg(rs1[i]|64,th);
3962       }
3963     }
3964   }
3965 }
3966
3967 #ifndef fconv_assemble
3968 void fconv_assemble(int i,struct regstat *i_regs)
3969 {
3970   printf("Need fconv_assemble for this architecture.\n");
3971   exit(1);
3972 }
3973 #endif
3974
3975 #if 0
3976 void float_assemble(int i,struct regstat *i_regs)
3977 {
3978   printf("Need float_assemble for this architecture.\n");
3979   exit(1);
3980 }
3981 #endif
3982
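// The next three helpers assemble instructions that exit the translated
// block: each loads the return/exception PC into a host register, adds the
// cycles executed so far to the cycle counter, and jumps to the matching C
// handler (jump_syscall_hle, jump_hlecall, jump_intcall).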
3983 void syscall_assemble(int i,struct regstat *i_regs)
3984 {
3985   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3986   assert(ccreg==HOST_CCREG);
3987   assert(!is_delayslot);
3988   emit_movimm(start+i*4,EAX); // Get PC
3989   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3990   emit_jmp((int)jump_syscall_hle); // XXX
3991 }
3992
3993 void hlecall_assemble(int i,struct regstat *i_regs)
3994 {
3995   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3996   assert(ccreg==HOST_CCREG);
3997   assert(!is_delayslot);
3998   emit_movimm(start+i*4+4,0); // Get PC
3999   emit_movimm((int)psxHLEt[source[i]&7],1);
4000   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
4001   emit_jmp((int)jump_hlecall);
4002 }
4003
4004 void intcall_assemble(int i,struct regstat *i_regs)
4005 {
4006   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4007   assert(ccreg==HOST_CCREG);
4008   assert(!is_delayslot);
4009   emit_movimm(start+i*4,0); // Get PC
4010   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
4011   emit_jmp((int)jump_intcall);
4012 }
4013
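// ds_assemble emits the instruction sitting in a branch delay slot.
// is_delayslot is set around the dispatch so the per-type emitters take
// their delay-slot code paths, and a jump type showing up here is reported
// as a probable bug.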
4014 void ds_assemble(int i,struct regstat *i_regs)
4015 {
4016   speculate_register_values(i);
4017   is_delayslot=1;
4018   switch(itype[i]) {
4019     case ALU:
4020       alu_assemble(i,i_regs);break;
4021     case IMM16:
4022       imm16_assemble(i,i_regs);break;
4023     case SHIFT:
4024       shift_assemble(i,i_regs);break;
4025     case SHIFTIMM:
4026       shiftimm_assemble(i,i_regs);break;
4027     case LOAD:
4028       load_assemble(i,i_regs);break;
4029     case LOADLR:
4030       loadlr_assemble(i,i_regs);break;
4031     case STORE:
4032       store_assemble(i,i_regs);break;
4033     case STORELR:
4034       storelr_assemble(i,i_regs);break;
4035     case COP0:
4036       cop0_assemble(i,i_regs);break;
4037     case COP1:
4038       cop1_assemble(i,i_regs);break;
4039     case C1LS:
4040       c1ls_assemble(i,i_regs);break;
4041     case COP2:
4042       cop2_assemble(i,i_regs);break;
4043     case C2LS:
4044       c2ls_assemble(i,i_regs);break;
4045     case C2OP:
4046       c2op_assemble(i,i_regs);break;
4047     case FCONV:
4048       fconv_assemble(i,i_regs);break;
4049     case FLOAT:
4050       float_assemble(i,i_regs);break;
4051     case FCOMP:
4052       fcomp_assemble(i,i_regs);break;
4053     case MULTDIV:
4054       multdiv_assemble(i,i_regs);break;
4055     case MOV:
4056       mov_assemble(i,i_regs);break;
4057     case SYSCALL:
4058     case HLECALL:
4059     case INTCALL:
4060     case SPAN:
4061     case UJUMP:
4062     case RJUMP:
4063     case CJUMP:
4064     case SJUMP:
4065     case FJUMP:
4066       printf("Jump in the delay slot.  This is probably a bug.\n");
4067   }
4068   is_delayslot=0;
4069 }
4070
4071 // Is the branch target a valid internal jump?
4072 int internal_branch(uint64_t i_is32,int addr)
4073 {
4074   if(addr&1) return 0; // Indirect (register) jump
4075   if(addr>=start && addr<start+slen*4-4)
4076   {
4077     int t=(addr-start)>>2;
4078     // Delay slots are not valid branch targets
4079     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4080     // 64 -> 32 bit transition requires a recompile
4081     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4082     {
4083       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4084       else printf("optimizable: yes\n");
4085     }*/
4086     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4087 #ifndef FORCE32
4088     if(requires_32bit[t]&~i_is32) return 0;
4089     else
4090 #endif
4091       return 1;
4092   }
4093   return 0;
4094 }
4095
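// wb_invalidate is emitted where the host register mapping changes between
// instructions: dirty values whose guest register is about to lose its host
// register are written back (unless unneeded), and values that merely move
// to a different host register are copied in the second loop without a
// writeback.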
4096 #ifndef wb_invalidate
4097 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4098   uint64_t u,uint64_t uu)
4099 {
4100   int hr;
4101   for(hr=0;hr<HOST_REGS;hr++) {
4102     if(hr!=EXCLUDE_REG) {
4103       if(pre[hr]!=entry[hr]) {
4104         if(pre[hr]>=0) {
4105           if((dirty>>hr)&1) {
4106             if(get_reg(entry,pre[hr])<0) {
4107               if(pre[hr]<64) {
4108                 if(!((u>>pre[hr])&1)) {
4109                   emit_storereg(pre[hr],hr);
4110                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4111                     emit_sarimm(hr,31,hr);
4112                     emit_storereg(pre[hr]|64,hr);
4113                   }
4114                 }
4115               }else{
4116                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4117                   emit_storereg(pre[hr],hr);
4118                 }
4119               }
4120             }
4121           }
4122         }
4123       }
4124     }
4125   }
4126   // Move from one register to another (no writeback)
4127   for(hr=0;hr<HOST_REGS;hr++) {
4128     if(hr!=EXCLUDE_REG) {
4129       if(pre[hr]!=entry[hr]) {
4130         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4131           int nr;
4132           if((nr=get_reg(entry,pre[hr]))>=0) {
4133             emit_mov(hr,nr);
4134           }
4135         }
4136       }
4137     }
4138   }
4139 }
4140 #endif
4141
4142 // Load the specified registers
4143 // This only loads the registers given as arguments because
4144 // we don't want to load things that will be overwritten
4145 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4146 {
4147   int hr;
4148   // Load 32-bit regs
4149   for(hr=0;hr<HOST_REGS;hr++) {
4150     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4151       if(entry[hr]!=regmap[hr]) {
4152         if(regmap[hr]==rs1||regmap[hr]==rs2)
4153         {
4154           if(regmap[hr]==0) {
4155             emit_zeroreg(hr);
4156           }
4157           else
4158           {
4159             emit_loadreg(regmap[hr],hr);
4160           }
4161         }
4162       }
4163     }
4164   }
4165   //Load 64-bit regs
4166   for(hr=0;hr<HOST_REGS;hr++) {
4167     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4168       if(entry[hr]!=regmap[hr]) {
4169         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4170         {
4171           assert(regmap[hr]!=64);
4172           if((is32>>(regmap[hr]&63))&1) {
4173             int lr=get_reg(regmap,regmap[hr]-64);
4174             if(lr>=0)
4175               emit_sarimm(lr,31,hr);
4176             else
4177               emit_loadreg(regmap[hr],hr);
4178           }
4179           else
4180           {
4181             emit_loadreg(regmap[hr],hr);
4182           }
4183         }
4184       }
4185     }
4186   }
4187 }
4188
4189 // Load registers prior to the start of a loop
4190 // so that they are not loaded within the loop
4191 static void loop_preload(signed char pre[],signed char entry[])
4192 {
4193   int hr;
4194   for(hr=0;hr<HOST_REGS;hr++) {
4195     if(hr!=EXCLUDE_REG) {
4196       if(pre[hr]!=entry[hr]) {
4197         if(entry[hr]>=0) {
4198           if(get_reg(pre,entry[hr])<0) {
4199             assem_debug("loop preload:\n");
4200             //printf("loop preload: %d\n",hr);
4201             if(entry[hr]==0) {
4202               emit_zeroreg(hr);
4203             }
4204             else if(entry[hr]<TEMPREG)
4205             {
4206               emit_loadreg(entry[hr],hr);
4207             }
4208             else if(entry[hr]-64<TEMPREG)
4209             {
4210               emit_loadreg(entry[hr],hr);
4211             }
4212           }
4213         }
4214       }
4215     }
4216   }
4217 }
4218
4219 // Generate address for load/store instruction
4220 // The address goes to AGEN for writes, and to FTEMP for LOADLR and cop1/cop2 loads.
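// For constant base registers the effective address is computed at compile
// time (and, on HOST_IMM_ADDR32 targets, may be folded straight into the
// access); the second half of the function preloads the mapper/address
// registers for the following instruction so the work overlaps.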
4221 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4222 {
4223   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4224     int ra=-1;
4225     int agr=AGEN1+(i&1);
4226     int mgr=MGEN1+(i&1);
4227     if(itype[i]==LOAD) {
4228       ra=get_reg(i_regs->regmap,rt1[i]);
4229       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4230       assert(ra>=0);
4231     }
4232     if(itype[i]==LOADLR) {
4233       ra=get_reg(i_regs->regmap,FTEMP);
4234     }
4235     if(itype[i]==STORE||itype[i]==STORELR) {
4236       ra=get_reg(i_regs->regmap,agr);
4237       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4238     }
4239     if(itype[i]==C1LS||itype[i]==C2LS) {
4240       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4241         ra=get_reg(i_regs->regmap,FTEMP);
4242       else { // SWC1/SDC1/SWC2/SDC2
4243         ra=get_reg(i_regs->regmap,agr);
4244         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4245       }
4246     }
4247     int rs=get_reg(i_regs->regmap,rs1[i]);
4248     int rm=get_reg(i_regs->regmap,TLREG);
4249     if(ra>=0) {
4250       int offset=imm[i];
4251       int c=(i_regs->wasconst>>rs)&1;
4252       if(rs1[i]==0) {
4253         // Using r0 as a base address
4254         /*if(rm>=0) {
4255           if(!entry||entry[rm]!=mgr) {
4256             generate_map_const(offset,rm);
4257           } // else did it in the previous cycle
4258         }*/
4259         if(!entry||entry[ra]!=agr) {
4260           if (opcode[i]==0x22||opcode[i]==0x26) {
4261             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4262           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4263             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4264           }else{
4265             emit_movimm(offset,ra);
4266           }
4267         } // else did it in the previous cycle
4268       }
4269       else if(rs<0) {
4270         if(!entry||entry[ra]!=rs1[i])
4271           emit_loadreg(rs1[i],ra);
4272         //if(!entry||entry[ra]!=rs1[i])
4273         //  printf("poor load scheduling!\n");
4274       }
4275       else if(c) {
4276 #ifndef DISABLE_TLB
4277         if(rm>=0) {
4278           if(!entry||entry[rm]!=mgr) {
4279             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4280               // Stores to memory go through the mapper to detect self-modifying
4281               // code, loads don't.
4282               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4283                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4284                 generate_map_const(constmap[i][rs]+offset,rm);
4285             }else{
4286               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4287                 generate_map_const(constmap[i][rs]+offset,rm);
4288             }
4289           }
4290         }
4291 #endif
4292         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4293           if(!entry||entry[ra]!=agr) {
4294             if (opcode[i]==0x22||opcode[i]==0x26) {
4295               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4296             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4297               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4298             }else{
4299               #ifdef HOST_IMM_ADDR32
4300               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4301                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4302               #endif
4303               emit_movimm(constmap[i][rs]+offset,ra);
4304               regs[i].loadedconst|=1<<ra;
4305             }
4306           } // else did it in the previous cycle
4307         } // else load_consts already did it
4308       }
4309       if(offset&&!c&&rs1[i]) {
4310         if(rs>=0) {
4311           emit_addimm(rs,offset,ra);
4312         }else{
4313           emit_addimm(ra,offset,ra);
4314         }
4315       }
4316     }
4317   }
4318   // Preload constants for next instruction
4319   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4320     int agr,ra;
4321     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4322     // Mapper entry
4323     agr=MGEN1+((i+1)&1);
4324     ra=get_reg(i_regs->regmap,agr);
4325     if(ra>=0) {
4326       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4327       int offset=imm[i+1];
4328       int c=(regs[i+1].wasconst>>rs)&1;
4329       if(c) {
4330         if(itype[i+1]==STORE||itype[i+1]==STORELR
4331            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4332           // Stores to memory go through the mapper to detect self-modifying
4333           // code, loads don't.
4334           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4335              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4336             generate_map_const(constmap[i+1][rs]+offset,ra);
4337         }else{
4338           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4339             generate_map_const(constmap[i+1][rs]+offset,ra);
4340         }
4341       }
4342       /*else if(rs1[i]==0) {
4343         generate_map_const(offset,ra);
4344       }*/
4345     }
4346     #endif
4347     // Actual address
4348     agr=AGEN1+((i+1)&1);
4349     ra=get_reg(i_regs->regmap,agr);
4350     if(ra>=0) {
4351       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4352       int offset=imm[i+1];
4353       int c=(regs[i+1].wasconst>>rs)&1;
4354       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4355         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4356           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4357         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4358           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4359         }else{
4360           #ifdef HOST_IMM_ADDR32
4361           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4362              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4363           #endif
4364           emit_movimm(constmap[i+1][rs]+offset,ra);
4365           regs[i+1].loadedconst|=1<<ra;
4366         }
4367       }
4368       else if(rs1[i+1]==0) {
4369         // Using r0 as a base address
4370         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4371           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4372         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4373           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4374         }else{
4375           emit_movimm(offset,ra);
4376         }
4377       }
4378     }
4379   }
4380 }
4381
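// get_final_value scans forward from instruction i while host register hr
// keeps holding the same constant guest register, and returns the value that
// register must eventually contain, so load_consts can emit one immediate
// load instead of one per intermediate step. It returns 0 when the constant
// does not actually need to be materialized (e.g. the value is unneeded, or
// the address can be folded into the access on HOST_IMM_ADDR32 targets).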
4382 int get_final_value(int hr, int i, int *value)
4383 {
4384   int reg=regs[i].regmap[hr];
4385   while(i<slen-1) {
4386     if(regs[i+1].regmap[hr]!=reg) break;
4387     if(!((regs[i+1].isconst>>hr)&1)) break;
4388     if(bt[i+1]) break;
4389     i++;
4390   }
4391   if(i<slen-1) {
4392     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4393       *value=constmap[i][hr];
4394       return 1;
4395     }
4396     if(!bt[i+1]) {
4397       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4398         // Load in delay slot, out-of-order execution
4399         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4400         {
4401           #ifdef HOST_IMM_ADDR32
4402           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4403           #endif
4404           // Precompute load address
4405           *value=constmap[i][hr]+imm[i+2];
4406           return 1;
4407         }
4408       }
4409       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4410       {
4411         #ifdef HOST_IMM_ADDR32
4412         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4413         #endif
4414         // Precompute load address
4415         *value=constmap[i][hr]+imm[i+1];
4416         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4417         return 1;
4418       }
4419     }
4420   }
4421   *value=constmap[i][hr];
4422   //printf("c=%x\n",(int)constmap[i][hr]);
4423   if(i==slen-1) return 1;
4424   if(reg<64) {
4425     return !((unneeded_reg[i+1]>>reg)&1);
4426   }else{
4427     return !((unneeded_reg_upper[i+1]>>reg)&1);
4428   }
4429 }
4430
4431 // Load registers with known constants
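// The loadedconst bits carried over from the previous instruction mark host
// registers that already hold their constant, so they are not reloaded.
// When another host register already holds a similar value, emit_movimm_from
// derives the new constant from it, presumably because that is cheaper than
// a full immediate load on the target architecture.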
4432 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4433 {
4434   int hr,hr2;
4435   // propagate loaded constant flags
4436   if(i==0||bt[i])
4437     regs[i].loadedconst=0;
4438   else {
4439     for(hr=0;hr<HOST_REGS;hr++) {
4440       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4441          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4442       {
4443         regs[i].loadedconst|=1<<hr;
4444       }
4445     }
4446   }
4447   // Load 32-bit regs
4448   for(hr=0;hr<HOST_REGS;hr++) {
4449     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4450       //if(entry[hr]!=regmap[hr]) {
4451       if(!((regs[i].loadedconst>>hr)&1)) {
4452         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4453           int value,similar=0;
4454           if(get_final_value(hr,i,&value)) {
4455             // see if some other register has similar value
4456             for(hr2=0;hr2<HOST_REGS;hr2++) {
4457               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4458                 if(is_similar_value(value,constmap[i][hr2])) {
4459                   similar=1;
4460                   break;
4461                 }
4462               }
4463             }
4464             if(similar) {
4465               int value2;
4466               if(get_final_value(hr2,i,&value2)) // is this needed?
4467                 emit_movimm_from(value2,hr2,value,hr);
4468               else
4469                 emit_movimm(value,hr);
4470             }
4471             else if(value==0) {
4472               emit_zeroreg(hr);
4473             }
4474             else {
4475               emit_movimm(value,hr);
4476             }
4477           }
4478           regs[i].loadedconst|=1<<hr;
4479         }
4480       }
4481     }
4482   }
4483   // Load 64-bit regs
4484   for(hr=0;hr<HOST_REGS;hr++) {
4485     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4486       //if(entry[hr]!=regmap[hr]) {
4487       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4488         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4489           if((is32>>(regmap[hr]&63))&1) {
4490             int lr=get_reg(regmap,regmap[hr]-64);
4491             assert(lr>=0);
4492             emit_sarimm(lr,31,hr);
4493           }
4494           else
4495           {
4496             int value;
4497             if(get_final_value(hr,i,&value)) {
4498               if(value==0) {
4499                 emit_zeroreg(hr);
4500               }
4501               else {
4502                 emit_movimm(value,hr);
4503               }
4504             }
4505           }
4506         }
4507       }
4508     }
4509   }
4510 }
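// load_all_consts is the unconditional variant: it materializes every dirty
// constant in the given map, and is used on paths where the full register
// state must be in place, such as the in-block self-modifying-code bailout
// in store_assemble.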
4511 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4512 {
4513   int hr;
4514   // Load 32-bit regs
4515   for(hr=0;hr<HOST_REGS;hr++) {
4516     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4517       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4518         int value=constmap[i][hr];
4519         if(value==0) {
4520           emit_zeroreg(hr);
4521         }
4522         else {
4523           emit_movimm(value,hr);
4524         }
4525       }
4526     }
4527   }
4528   // Load 64-bit regs
4529   for(hr=0;hr<HOST_REGS;hr++) {
4530     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4531       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4532         if((is32>>(regmap[hr]&63))&1) {
4533           int lr=get_reg(regmap,regmap[hr]-64);
4534           assert(lr>=0);
4535           emit_sarimm(lr,31,hr);
4536         }
4537         else
4538         {
4539           int value=constmap[i][hr];
4540           if(value==0) {
4541             emit_zeroreg(hr);
4542           }
4543           else {
4544             emit_movimm(value,hr);
4545           }
4546         }
4547       }
4548     }
4549   }
4550 }
4551
4552 // Write out all dirty registers (except cycle count)
4553 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4554 {
4555   int hr;
4556   for(hr=0;hr<HOST_REGS;hr++) {
4557     if(hr!=EXCLUDE_REG) {
4558       if(i_regmap[hr]>0) {
4559         if(i_regmap[hr]!=CCREG) {
4560           if((i_dirty>>hr)&1) {
4561             if(i_regmap[hr]<64) {
4562               emit_storereg(i_regmap[hr],hr);
4563 #ifndef FORCE32
4564               if( ((i_is32>>i_regmap[hr])&1) ) {
4565                 #ifdef DESTRUCTIVE_WRITEBACK
4566                 emit_sarimm(hr,31,hr);
4567                 emit_storereg(i_regmap[hr]|64,hr);
4568                 #else
4569                 emit_sarimm(hr,31,HOST_TEMPREG);
4570                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4571                 #endif
4572               }
4573 #endif
4574             }else{
4575               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4576                 emit_storereg(i_regmap[hr],hr);
4577               }
4578             }
4579           }
4580         }
4581       }
4582     }
4583   }
4584 }
4585 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4586 // This writes the registers not written by store_regs_bt
4587 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4588 {
4589   int hr;
4590   int t=(addr-start)>>2;
4591   for(hr=0;hr<HOST_REGS;hr++) {
4592     if(hr!=EXCLUDE_REG) {
4593       if(i_regmap[hr]>0) {
4594         if(i_regmap[hr]!=CCREG) {
4595           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4596             if((i_dirty>>hr)&1) {
4597               if(i_regmap[hr]<64) {
4598                 emit_storereg(i_regmap[hr],hr);
4599 #ifndef FORCE32
4600                 if( ((i_is32>>i_regmap[hr])&1) ) {
4601                   #ifdef DESTRUCTIVE_WRITEBACK
4602                   emit_sarimm(hr,31,hr);
4603                   emit_storereg(i_regmap[hr]|64,hr);
4604                   #else
4605                   emit_sarimm(hr,31,HOST_TEMPREG);
4606                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4607                   #endif
4608                 }
4609 #endif
4610               }else{
4611                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4612                   emit_storereg(i_regmap[hr],hr);
4613                 }
4614               }
4615             }
4616           }
4617         }
4618       }
4619     }
4620   }
4621 }
4622
4623 // Load all registers (except cycle count)
4624 void load_all_regs(signed char i_regmap[])
4625 {
4626   int hr;
4627   for(hr=0;hr<HOST_REGS;hr++) {
4628     if(hr!=EXCLUDE_REG) {
4629       if(i_regmap[hr]==0) {
4630         emit_zeroreg(hr);
4631       }
4632       else
4633       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4634       {
4635         emit_loadreg(i_regmap[hr],hr);
4636       }
4637     }
4638   }
4639 }
4640
4641 // Load all current registers also needed by next instruction
4642 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4643 {
4644   int hr;
4645   for(hr=0;hr<HOST_REGS;hr++) {
4646     if(hr!=EXCLUDE_REG) {
4647       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4648         if(i_regmap[hr]==0) {
4649           emit_zeroreg(hr);
4650         }
4651         else
4652         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4653         {
4654           emit_loadreg(i_regmap[hr],hr);
4655         }
4656       }
4657     }
4658   }
4659 }
4660
4661 // Load all regs, storing cycle count if necessary
4662 void load_regs_entry(int t)
4663 {
4664   int hr;
4665   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4666   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4667   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4668     emit_storereg(CCREG,HOST_CCREG);
4669   }
4670   // Load 32-bit regs
4671   for(hr=0;hr<HOST_REGS;hr++) {
4672     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4673       if(regs[t].regmap_entry[hr]==0) {
4674         emit_zeroreg(hr);
4675       }
4676       else if(regs[t].regmap_entry[hr]!=CCREG)
4677       {
4678         emit_loadreg(regs[t].regmap_entry[hr],hr);
4679       }
4680     }
4681   }
4682   // Load 64-bit regs
4683   for(hr=0;hr<HOST_REGS;hr++) {
4684     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4685       assert(regs[t].regmap_entry[hr]!=64);
4686       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4687         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4688         if(lr<0) {
4689           emit_loadreg(regs[t].regmap_entry[hr],hr);
4690         }
4691         else
4692         {
4693           emit_sarimm(lr,31,hr);
4694         }
4695       }
4696       else
4697       {
4698         emit_loadreg(regs[t].regmap_entry[hr],hr);
4699       }
4700     }
4701   }
4702 }
4703
4704 // Store dirty registers prior to branch
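// For internal branches, registers the target already expects to find dirty
// in the same host register are skipped, as are values the target marks
// unneeded; branches that leave the block fall back to wb_dirtys and write
// everything back.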
4705 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4706 {
4707   if(internal_branch(i_is32,addr))
4708   {
4709     int t=(addr-start)>>2;
4710     int hr;
4711     for(hr=0;hr<HOST_REGS;hr++) {
4712       if(hr!=EXCLUDE_REG) {
4713         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4714           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4715             if((i_dirty>>hr)&1) {
4716               if(i_regmap[hr]<64) {
4717                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4718                   emit_storereg(i_regmap[hr],hr);
4719                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4720                     #ifdef DESTRUCTIVE_WRITEBACK
4721                     emit_sarimm(hr,31,hr);
4722                     emit_storereg(i_regmap[hr]|64,hr);
4723                     #else
4724                     emit_sarimm(hr,31,HOST_TEMPREG);
4725                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4726                     #endif
4727                   }
4728                 }
4729               }else{
4730                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4731                   emit_storereg(i_regmap[hr],hr);
4732                 }
4733               }
4734             }
4735           }
4736         }
4737       }
4738     }
4739   }
4740   else
4741   {
4742     // Branch out of this block, write out all dirty regs
4743     wb_dirtys(i_regmap,i_is32,i_dirty);
4744   }
4745 }
4746
4747 // Load all needed registers for branch target
4748 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4749 {
4750   //if(addr>=start && addr<(start+slen*4))
4751   if(internal_branch(i_is32,addr))
4752   {
4753     int t=(addr-start)>>2;
4754     int hr;
4755     // Store the cycle count before loading something else
4756     if(i_regmap[HOST_CCREG]!=CCREG) {
4757       assert(i_regmap[HOST_CCREG]==-1);
4758     }
4759     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4760       emit_storereg(CCREG,HOST_CCREG);
4761     }
4762     // Load 32-bit regs
4763     for(hr=0;hr<HOST_REGS;hr++) {
4764       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4765         #ifdef DESTRUCTIVE_WRITEBACK
4766         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4767         #else
4768         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4769         #endif
4770           if(regs[t].regmap_entry[hr]==0) {
4771             emit_zeroreg(hr);
4772           }
4773           else if(regs[t].regmap_entry[hr]!=CCREG)
4774           {
4775             emit_loadreg(regs[t].regmap_entry[hr],hr);
4776           }
4777         }
4778       }
4779     }
4780     //Load 64-bit regs
4781     for(hr=0;hr<HOST_REGS;hr++) {
4782       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4783         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4784           assert(regs[t].regmap_entry[hr]!=64);
4785           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4786             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4787             if(lr<0) {
4788               emit_loadreg(regs[t].regmap_entry[hr],hr);
4789             }
4790             else
4791             {
4792               emit_sarimm(lr,31,hr);
4793             }
4794           }
4795           else
4796           {
4797             emit_loadreg(regs[t].regmap_entry[hr],hr);
4798           }
4799         }
4800         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4801           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4802           assert(lr>=0);
4803           emit_sarimm(lr,31,hr);
4804         }
4805       }
4806     }
4807   }
4808 }
4809
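// match_bt reports whether the current register state is compatible with the
// state expected at the branch target: same host mapping for every register
// the target needs, no dirty value the target does not expect (unless it is
// unneeded there), matching 32/64-bit status, and the target must not be a
// delay slot. When it returns 1 the branch can jump straight to the existing
// code without writeback/reload glue.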
4810 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4811 {
4812   if(addr>=start && addr<start+slen*4-4)
4813   {
4814     int t=(addr-start)>>2;
4815     int hr;
4816     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4817     for(hr=0;hr<HOST_REGS;hr++)
4818     {
4819       if(hr!=EXCLUDE_REG)
4820       {
4821         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4822         {
4823           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4824           {
4825             return 0;
4826           }
4827           else 
4828           if((i_dirty>>hr)&1)
4829           {
4830             if(i_regmap[hr]<TEMPREG)
4831             {
4832               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4833                 return 0;
4834             }
4835             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4836             {
4837               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4838                 return 0;
4839             }
4840           }
4841         }
4842         else // Same register but is it 32-bit or dirty?
4843         if(i_regmap[hr]>=0)
4844         {
4845           if(!((regs[t].dirty>>hr)&1))
4846           {
4847             if((i_dirty>>hr)&1)
4848             {
4849               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4850               {
4851                 //printf("%x: dirty no match\n",addr);
4852                 return 0;
4853               }
4854             }
4855           }
4856           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4857           {
4858             //printf("%x: is32 no match\n",addr);
4859             return 0;
4860           }
4861         }
4862       }
4863     }
4864     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4865 #ifndef FORCE32
4866     if(requires_32bit[t]&~i_is32) return 0;
4867 #endif
4868     // Delay slots are not valid branch targets
4869     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4870     // Delay slots require additional processing, so do not match
4871     if(is_ds[t]) return 0;
4872   }
4873   else
4874   {
4875     int hr;
4876     for(hr=0;hr<HOST_REGS;hr++)
4877     {
4878       if(hr!=EXCLUDE_REG)
4879       {
4880         if(i_regmap[hr]>=0)
4881         {
4882           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4883           {
4884             if((i_dirty>>hr)&1)
4885             {
4886               return 0;
4887             }
4888           }
4889         }
4890       }
4891     }
4892   }
4893   return 1;
4894 }
4895
4896 // Used when a branch jumps into the delay slot of another branch
4897 void ds_assemble_entry(int i)
4898 {
4899   int t=(ba[i]-start)>>2;
4900   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4901   assem_debug("Assemble delay slot at %x\n",ba[i]);
4902   assem_debug("<->\n");
4903   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4904     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4905   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4906   address_generation(t,&regs[t],regs[t].regmap_entry);
4907   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4908     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4909   cop1_usable=0;
4910   is_delayslot=0;
4911   switch(itype[t]) {
4912     case ALU:
4913       alu_assemble(t,&regs[t]);break;
4914     case IMM16:
4915       imm16_assemble(t,&regs[t]);break;
4916     case SHIFT:
4917       shift_assemble(t,&regs[t]);break;
4918     case SHIFTIMM:
4919       shiftimm_assemble(t,&regs[t]);break;
4920     case LOAD:
4921       load_assemble(t,&regs[t]);break;
4922     case LOADLR:
4923       loadlr_assemble(t,&regs[t]);break;
4924     case STORE:
4925       store_assemble(t,&regs[t]);break;
4926     case STORELR:
4927       storelr_assemble(t,&regs[t]);break;
4928     case COP0:
4929       cop0_assemble(t,&regs[t]);break;
4930     case COP1:
4931       cop1_assemble(t,&regs[t]);break;
4932     case C1LS:
4933       c1ls_assemble(t,&regs[t]);break;
4934     case COP2:
4935       cop2_assemble(t,&regs[t]);break;
4936     case C2LS:
4937       c2ls_assemble(t,&regs[t]);break;
4938     case C2OP:
4939       c2op_assemble(t,&regs[t]);break;
4940     case FCONV:
4941       fconv_assemble(t,&regs[t]);break;
4942     case FLOAT:
4943       float_assemble(t,&regs[t]);break;
4944     case FCOMP:
4945       fcomp_assemble(t,&regs[t]);break;
4946     case MULTDIV:
4947       multdiv_assemble(t,&regs[t]);break;
4948     case MOV:
4949       mov_assemble(t,&regs[t]);break;
4950     case SYSCALL:
4951     case HLECALL:
4952     case INTCALL:
4953     case SPAN:
4954     case UJUMP:
4955     case RJUMP:
4956     case CJUMP:
4957     case SJUMP:
4958     case FJUMP:
4959       printf("Jump in the delay slot.  This is probably a bug.\n");
4960   }
4961   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4962   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4963   if(internal_branch(regs[t].is32,ba[i]+4))
4964     assem_debug("branch: internal\n");
4965   else
4966     assem_debug("branch: external\n");
4967   assert(internal_branch(regs[t].is32,ba[i]+4));
4968   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4969   emit_jmp(0);
4970 }
4971
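// do_cc emits the cycle-count bookkeeping for a branch: *adj reports the
// cycle adjustment already accounted for at an internal target, the counter
// is advanced (or compared against the pending cycles), and a CC_STUB is
// registered to take the interrupt path once the count runs out. A branch
// that targets itself with a NOP in the delay slot is treated as an idle
// loop and handled specially.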
4972 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4973 {
4974   int count;
4975   int jaddr;
4976   int idle=0;
4977   if(itype[i]==RJUMP)
4978   {
4979     *adj=0;
4980   }
4981   //if(ba[i]>=start && ba[i]<(start+slen*4))
4982   if(internal_branch(branch_regs[i].is32,ba[i]))
4983   {
4984     int t=(ba[i]-start)>>2;
4985     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4986     else *adj=ccadj[t];
4987   }
4988   else
4989   {
4990     *adj=0;
4991   }
4992   count=ccadj[i];
4993   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4994     // Idle loop
4995     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4996     idle=(int)out;
4997     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4998     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4999     jaddr=(int)out;
5000     emit_jmp(0);
5001   }
5002   else if(*adj==0||invert) {
5003     emit_addimm_and_set_flags(CLOCK_ADJUST(count+2),HOST_CCREG);
5004     jaddr=(int)out;
5005     emit_jns(0);
5006   }
5007   else
5008   {
5009     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
5010     jaddr=(int)out;
5011     emit_jns(0);
5012   }
5013   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
5014 }
5015
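// do_ccstub generates the out-of-line code for a CC_STUB: dirty registers
// are written back according to whether the branch was taken, not taken, or
// had its delay slot nullified, and the return PC is stored to pcaddr; when
// the return address depends on the branch outcome (stubs[n][5]==-1) the
// comparison is redone here to choose between the taken and not-taken
// addresses.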
5016 void do_ccstub(int n)
5017 {
5018   literal_pool(256);
5019   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
5020   set_jump_target(stubs[n][1],(int)out);
5021   int i=stubs[n][4];
5022   if(stubs[n][6]==NULLDS) {
5023     // Delay slot instruction is nullified ("likely" branch)
5024     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5025   }
5026   else if(stubs[n][6]!=TAKEN) {
5027     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
5028   }
5029   else {
5030     if(internal_branch(branch_regs[i].is32,ba[i]))
5031       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5032   }
5033   if(stubs[n][5]!=-1)
5034   {
5035     // Save PC as return address
5036     emit_movimm(stubs[n][5],EAX);
5037     emit_writeword(EAX,(int)&pcaddr);
5038   }
5039   else
5040   {
5041     // Return address depends on which way the branch goes
5042     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5043     {
5044       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5045       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5046       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5047       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5048       if(rs1[i]==0)
5049       {
5050         s1l=s2l;s1h=s2h;
5051         s2l=s2h=-1;
5052       }
5053       else if(rs2[i]==0)
5054       {
5055         s2l=s2h=-1;
5056       }
5057       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
5058         s1h=s2h=-1;
5059       }
5060       assert(s1l>=0);
5061       #ifdef DESTRUCTIVE_WRITEBACK
5062       if(rs1[i]) {
5063         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
5064           emit_loadreg(rs1[i],s1l);
5065       } 
5066       else {
5067         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
5068           emit_loadreg(rs2[i],s1l);
5069       }
5070       if(s2l>=0)
5071         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
5072           emit_loadreg(rs2[i],s2l);
5073       #endif
5074       int hr=0;
5075       int addr=-1,alt=-1,ntaddr=-1;
5076       while(hr<HOST_REGS)
5077       {
5078         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5079            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5080            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5081         {
5082           addr=hr++;break;
5083         }
5084         hr++;
5085       }
5086       while(hr<HOST_REGS)
5087       {
5088         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5089            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5090            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5091         {
5092           alt=hr++;break;
5093         }
5094         hr++;
5095       }
5096       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5097       {
5098         while(hr<HOST_REGS)
5099         {
5100           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5101              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5102              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5103           {
5104             ntaddr=hr;break;
5105           }
5106           hr++;
5107         }
5108         assert(hr<HOST_REGS);
5109       }
5110       if((opcode[i]&0x2f)==4) // BEQ
5111       {
5112         #ifdef HAVE_CMOV_IMM
5113         if(s1h<0) {
5114           if(s2l>=0) emit_cmp(s1l,s2l);
5115           else emit_test(s1l,s1l);
5116           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5117         }
5118         else
5119         #endif
5120         {
5121           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5122           if(s1h>=0) {
5123             if(s2h>=0) emit_cmp(s1h,s2h);
5124             else emit_test(s1h,s1h);
5125             emit_cmovne_reg(alt,addr);
5126           }
5127           if(s2l>=0) emit_cmp(s1l,s2l);
5128           else emit_test(s1l,s1l);
5129           emit_cmovne_reg(alt,addr);
5130         }
5131       }
5132       if((opcode[i]&0x2f)==5) // BNE
5133       {
5134         #ifdef HAVE_CMOV_IMM
5135         if(s1h<0) {
5136           if(s2l>=0) emit_cmp(s1l,s2l);
5137           else emit_test(s1l,s1l);
5138           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5139         }
5140         else
5141         #endif
5142         {
5143           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5144           if(s1h>=0) {
5145             if(s2h>=0) emit_cmp(s1h,s2h);
5146             else emit_test(s1h,s1h);
5147             emit_cmovne_reg(alt,addr);
5148           }
5149           if(s2l>=0) emit_cmp(s1l,s2l);
5150           else emit_test(s1l,s1l);
5151           emit_cmovne_reg(alt,addr);
5152         }
5153       }
5154       if((opcode[i]&0x2f)==6) // BLEZ
5155       {
5156         //emit_movimm(ba[i],alt);
5157         //emit_movimm(start+i*4+8,addr);
5158         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5159         emit_cmpimm(s1l,1);
5160         if(s1h>=0) emit_mov(addr,ntaddr);
5161         emit_cmovl_reg(alt,addr);
5162         if(s1h>=0) {
5163           emit_test(s1h,s1h);
5164           emit_cmovne_reg(ntaddr,addr);
5165           emit_cmovs_reg(alt,addr);
5166         }
5167       }
5168       if((opcode[i]&0x2f)==7) // BGTZ
5169       {
5170         //emit_movimm(ba[i],addr);
5171         //emit_movimm(start+i*4+8,ntaddr);
5172         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5173         emit_cmpimm(s1l,1);
5174         if(s1h>=0) emit_mov(addr,alt);
5175         emit_cmovl_reg(ntaddr,addr);
5176         if(s1h>=0) {
5177           emit_test(s1h,s1h);
5178           emit_cmovne_reg(alt,addr);
5179           emit_cmovs_reg(ntaddr,addr);
5180         }
5181       }
5182       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5183       {
5184         //emit_movimm(ba[i],alt);
5185         //emit_movimm(start+i*4+8,addr);
5186         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5187         if(s1h>=0) emit_test(s1h,s1h);
5188         else emit_test(s1l,s1l);
5189         emit_cmovs_reg(alt,addr);
5190       }
5191       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5192       {
5193         //emit_movimm(ba[i],addr);
5194         //emit_movimm(start+i*4+8,alt);
5195         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5196         if(s1h>=0) emit_test(s1h,s1h);
5197         else emit_test(s1l,s1l);
5198         emit_cmovs_reg(alt,addr);
5199       }
5200       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5201         if(source[i]&0x10000) // BC1T
5202         {
5203           //emit_movimm(ba[i],alt);
5204           //emit_movimm(start+i*4+8,addr);
5205           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5206           emit_testimm(s1l,0x800000);
5207           emit_cmovne_reg(alt,addr);
5208         }
5209         else // BC1F
5210         {
5211           //emit_movimm(ba[i],addr);
5212           //emit_movimm(start+i*4+8,alt);
5213           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5214           emit_testimm(s1l,0x800000);
5215           emit_cmovne_reg(alt,addr);
5216         }
5217       }
5218       emit_writeword(addr,(int)&pcaddr);
5219     }
5220     else
5221     if(itype[i]==RJUMP)
5222     {
5223       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5224       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5225         r=get_reg(branch_regs[i].regmap,RTEMP);
5226       }
5227       emit_writeword(r,(int)&pcaddr);
5228     }
5229     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
5230   }
5231   // Update cycle count
5232   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5233   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5234   emit_call((int)cc_interrupt);
5235   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5236   if(stubs[n][6]==TAKEN) {
5237     if(internal_branch(branch_regs[i].is32,ba[i]))
5238       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5239     else if(itype[i]==RJUMP) {
5240       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5241         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5242       else
5243         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5244     }
5245   }else if(stubs[n][6]==NOTTAKEN) {
5246     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5247     else load_all_regs(branch_regs[i].regmap);
5248   }else if(stubs[n][6]==NULLDS) {
5249     // Delay slot instruction is nullified ("likely" branch)
5250     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5251     else load_all_regs(regs[i].regmap);
5252   }else{
5253     load_all_regs(branch_regs[i].regmap);
5254   }
5255   emit_jmp(stubs[n][2]); // return address
5256   
5257   /* This works but uses a lot of memory...
5258   emit_readword((int)&last_count,ECX);
5259   emit_add(HOST_CCREG,ECX,EAX);
5260   emit_writeword(EAX,(int)&Count);
5261   emit_call((int)gen_interupt);
5262   emit_readword((int)&Count,HOST_CCREG);
5263   emit_readword((int)&next_interupt,EAX);
5264   emit_readword((int)&pending_exception,EBX);
5265   emit_writeword(EAX,(int)&last_count);
5266   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5267   emit_test(EBX,EBX);
5268   int jne_instr=(int)out;
5269   emit_jne(0);
5270   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5271   load_all_regs(branch_regs[i].regmap);
5272   emit_jmp(stubs[n][2]); // return address
5273   set_jump_target(jne_instr,(int)out);
5274   emit_readword((int)&pcaddr,EAX);
5275   // Call get_addr_ht instead of doing the hash table here.
5276   // This code is executed infrequently and takes up a lot of space
5277   // so smaller is better.
5278   emit_storereg(CCREG,HOST_CCREG);
5279   emit_pushreg(EAX);
5280   emit_call((int)get_addr_ht);
5281   emit_loadreg(CCREG,HOST_CCREG);
5282   emit_addimm(ESP,4,ESP);
5283   emit_jmpreg(EAX);*/
5284 }
5285
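// Record a patch site for the linker: the code address, the branch target and
// a flag indicating whether the target is internal to the current block.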
5286 add_to_linker(int addr,int target,int ext)
5287 {
5288   link_addr[linkcount][0]=addr;
5289   link_addr[linkcount][1]=target;
5290   link_addr[linkcount][2]=ext;  
5291   linkcount++;
5292 }
5293
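// Write the J(AL) return address (PC+8) into the link register ($31),
// using the mini hash table insert when USE_MINI_HT is enabled.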
5294 static void ujump_assemble_write_ra(int i)
5295 {
5296   int rt;
5297   unsigned int return_address;
5298   rt=get_reg(branch_regs[i].regmap,31);
5299   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5300   //assert(rt>=0);
5301   return_address=start+i*4+8;
5302   if(rt>=0) {
5303     #ifdef USE_MINI_HT
5304     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5305       int temp=-1; // note: must be ds-safe
5306       #ifdef HOST_TEMPREG
5307       temp=HOST_TEMPREG;
5308       #endif
5309       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5310       else emit_movimm(return_address,rt);
5311     }
5312     else
5313     #endif
5314     {
5315       #ifdef REG_PREFETCH
5316       if(temp>=0) 
5317       {
5318         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5319       }
5320       #endif
5321       emit_movimm(return_address,rt); // PC into link register
5322       #ifdef IMM_PREFETCH
5323       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5324       #endif
5325     }
5326   }
5327 }
5328
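// Assemble an unconditional jump (J/JAL): assemble the delay slot, write the
// return address, sync registers for the target, do the cycle count check and
// emit the branch to ba[i] (falling into the target's delay slot entry for
// internal branches that land in a delay slot).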
5329 void ujump_assemble(int i,struct regstat *i_regs)
5330 {
5331   signed char *i_regmap=i_regs->regmap;
5332   int ra_done=0;
5333   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5334   address_generation(i+1,i_regs,regs[i].regmap_entry);
5335   #ifdef REG_PREFETCH
5336   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5337   if(rt1[i]==31&&temp>=0) 
5338   {
5339     int return_address=start+i*4+8;
5340     if(get_reg(branch_regs[i].regmap,31)>0) 
5341     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5342   }
5343   #endif
5344   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5345     ujump_assemble_write_ra(i); // writeback ra for DS
5346     ra_done=1;
5347   }
5348   ds_assemble(i+1,i_regs);
5349   uint64_t bc_unneeded=branch_regs[i].u;
5350   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5351   bc_unneeded|=1|(1LL<<rt1[i]);
5352   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5353   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5354                 bc_unneeded,bc_unneeded_upper);
5355   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5356   if(!ra_done&&rt1[i]==31)
5357     ujump_assemble_write_ra(i);
5358   int cc,adj;
5359   cc=get_reg(branch_regs[i].regmap,CCREG);
5360   assert(cc==HOST_CCREG);
5361   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5362   #ifdef REG_PREFETCH
5363   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5364   #endif
5365   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5366   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5367   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5368   if(internal_branch(branch_regs[i].is32,ba[i]))
5369     assem_debug("branch: internal\n");
5370   else
5371     assem_debug("branch: external\n");
5372   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5373     ds_assemble_entry(i);
5374   }
5375   else {
5376     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5377     emit_jmp(0);
5378   }
5379 }
5380
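// Write the JALR return address (PC+8) into the destination register.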
5381 static void rjump_assemble_write_ra(int i)
5382 {
5383   int rt,return_address;
5384   assert(rt1[i+1]!=rt1[i]);
5385   assert(rt2[i+1]!=rt1[i]);
5386   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5387   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5388   assert(rt>=0);
5389   return_address=start+i*4+8;
5390   #ifdef REG_PREFETCH
5391   if(temp>=0) 
5392   {
5393     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5394   }
5395   #endif
5396   emit_movimm(return_address,rt); // PC into link register
5397   #ifdef IMM_PREFETCH
5398   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5399   #endif
5400 }
5401
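// Assemble a register-indirect jump (JR/JALR): copy the target register if the
// delay slot overwrites it, assemble the delay slot, write the return address,
// update the cycle count and dispatch through jump_vaddr_reg (or the mini hash
// table for jr $ra when USE_MINI_HT is enabled).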
5402 void rjump_assemble(int i,struct regstat *i_regs)
5403 {
5404   signed char *i_regmap=i_regs->regmap;
5405   int temp;
5406   int rs,cc,adj;
5407   int ra_done=0;
5408   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5409   assert(rs>=0);
5410   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5411     // Delay slot abuse, make a copy of the branch address register
5412     temp=get_reg(branch_regs[i].regmap,RTEMP);
5413     assert(temp>=0);
5414     assert(regs[i].regmap[temp]==RTEMP);
5415     emit_mov(rs,temp);
5416     rs=temp;
5417   }
5418   address_generation(i+1,i_regs,regs[i].regmap_entry);
5419   #ifdef REG_PREFETCH
5420   if(rt1[i]==31) 
5421   {
5422     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5423       int return_address=start+i*4+8;
5424       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5425     }
5426   }
5427   #endif
5428   #ifdef USE_MINI_HT
5429   if(rs1[i]==31) {
5430     int rh=get_reg(regs[i].regmap,RHASH);
5431     if(rh>=0) do_preload_rhash(rh);
5432   }
5433   #endif
5434   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5435     rjump_assemble_write_ra(i);
5436     ra_done=1;
5437   }
5438   ds_assemble(i+1,i_regs);
5439   uint64_t bc_unneeded=branch_regs[i].u;
5440   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5441   bc_unneeded|=1|(1LL<<rt1[i]);
5442   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5443   bc_unneeded&=~(1LL<<rs1[i]);
5444   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5445                 bc_unneeded,bc_unneeded_upper);
5446   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5447   if(!ra_done&&rt1[i]!=0)
5448     rjump_assemble_write_ra(i);
5449   cc=get_reg(branch_regs[i].regmap,CCREG);
5450   assert(cc==HOST_CCREG);
5451   #ifdef USE_MINI_HT
5452   int rh=get_reg(branch_regs[i].regmap,RHASH);
5453   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5454   if(rs1[i]==31) {
5455     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5456     do_preload_rhtbl(ht);
5457     do_rhash(rs,rh);
5458   }
5459   #endif
5460   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5461   #ifdef DESTRUCTIVE_WRITEBACK
5462   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5463     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5464       emit_loadreg(rs1[i],rs);
5465     }
5466   }
5467   #endif
5468   #ifdef REG_PREFETCH
5469   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5470   #endif
5471   #ifdef USE_MINI_HT
5472   if(rs1[i]==31) {
5473     do_miniht_load(ht,rh);
5474   }
5475   #endif
5476   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5477   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5478   //assert(adj==0);
5479   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5480   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5481 #ifdef PCSX
5482   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5483     // special case for RFE
5484     emit_jmp(0);
5485   else
5486 #endif
5487   emit_jns(0);
5488   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5489   #ifdef USE_MINI_HT
5490   if(rs1[i]==31) {
5491     do_miniht_jump(rs,rh,ht);
5492   }
5493   else
5494   #endif
5495   {
5496     //if(rs!=EAX) emit_mov(rs,EAX);
5497     //emit_jmp((int)jump_vaddr_eax);
5498     emit_jmp(jump_vaddr_reg[rs]);
5499   }
5500   /* Check hash table
5501   temp=!rs;
5502   emit_mov(rs,temp);
5503   emit_shrimm(rs,16,rs);
5504   emit_xor(temp,rs,rs);
5505   emit_movzwl_reg(rs,rs);
5506   emit_shlimm(rs,4,rs);
5507   emit_cmpmem_indexed((int)hash_table,rs,temp);
5508   emit_jne((int)out+14);
5509   emit_readword_indexed((int)hash_table+4,rs,rs);
5510   emit_jmpreg(rs);
5511   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5512   emit_addimm_no_flags(8,rs);
5513   emit_jeq((int)out-17);
5514   // No hit on hash table, call compiler
5515   emit_pushreg(temp);
5516 //DEBUG >
5517 #ifdef DEBUG_CYCLE_COUNT
5518   emit_readword((int)&last_count,ECX);
5519   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5520   emit_readword((int)&next_interupt,ECX);
5521   emit_writeword(HOST_CCREG,(int)&Count);
5522   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5523   emit_writeword(ECX,(int)&last_count);
5524 #endif
5525 //DEBUG <
5526   emit_storereg(CCREG,HOST_CCREG);
5527   emit_call((int)get_addr);
5528   emit_loadreg(CCREG,HOST_CCREG);
5529   emit_addimm(ESP,4,ESP);
5530   emit_jmpreg(EAX);*/
5531   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5532   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5533   #endif
5534 }
5535
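// Assemble BEQ/BNE/BLEZ/BGTZ and their "likely" variants: either out of order
// (delay slot first, then the compare) or in order (compare first, then a
// separate delay slot copy on the taken and not-taken paths), handling 64-bit
// register pairs when the operands are not known to be 32-bit.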
5536 void cjump_assemble(int i,struct regstat *i_regs)
5537 {
5538   signed char *i_regmap=i_regs->regmap;
5539   int cc;
5540   int match;
5541   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5542   assem_debug("match=%d\n",match);
5543   int s1h,s1l,s2h,s2l;
5544   int prev_cop1_usable=cop1_usable;
5545   int unconditional=0,nop=0;
5546   int only32=0;
5547   int invert=0;
5548   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5549   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5550   if(!match) invert=1;
5551   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5552   if(i>(ba[i]-start)>>2) invert=1;
5553   #endif
5554   
5555   if(ooo[i]) {
5556     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5557     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5558     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5559     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5560   }
5561   else {
5562     s1l=get_reg(i_regmap,rs1[i]);
5563     s1h=get_reg(i_regmap,rs1[i]|64);
5564     s2l=get_reg(i_regmap,rs2[i]);
5565     s2h=get_reg(i_regmap,rs2[i]|64);
5566   }
5567   if(rs1[i]==0&&rs2[i]==0)
5568   {
5569     if(opcode[i]&1) nop=1;
5570     else unconditional=1;
5571     //assert(opcode[i]!=5);
5572     //assert(opcode[i]!=7);
5573     //assert(opcode[i]!=0x15);
5574     //assert(opcode[i]!=0x17);
5575   }
5576   else if(rs1[i]==0)
5577   {
5578     s1l=s2l;s1h=s2h;
5579     s2l=s2h=-1;
5580     only32=(regs[i].was32>>rs2[i])&1;
5581   }
5582   else if(rs2[i]==0)
5583   {
5584     s2l=s2h=-1;
5585     only32=(regs[i].was32>>rs1[i])&1;
5586   }
5587   else {
5588     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5589   }
5590
5591   if(ooo[i]) {
5592     // Out of order execution (delay slot first)
5593     //printf("OOOE\n");
5594     address_generation(i+1,i_regs,regs[i].regmap_entry);
5595     ds_assemble(i+1,i_regs);
5596     int adj;
5597     uint64_t bc_unneeded=branch_regs[i].u;
5598     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5599     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5600     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5601     bc_unneeded|=1;
5602     bc_unneeded_upper|=1;
5603     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5604                   bc_unneeded,bc_unneeded_upper);
5605     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5606     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5607     cc=get_reg(branch_regs[i].regmap,CCREG);
5608     assert(cc==HOST_CCREG);
5609     if(unconditional) 
5610       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5611     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5612     //assem_debug("cycle count (adj)\n");
5613     if(unconditional) {
5614       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5615       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5616         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5617         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5618         if(internal)
5619           assem_debug("branch: internal\n");
5620         else
5621           assem_debug("branch: external\n");
5622         if(internal&&is_ds[(ba[i]-start)>>2]) {
5623           ds_assemble_entry(i);
5624         }
5625         else {
5626           add_to_linker((int)out,ba[i],internal);
5627           emit_jmp(0);
5628         }
5629         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5630         if(((u_int)out)&7) emit_addnop(0);
5631         #endif
5632       }
5633     }
5634     else if(nop) {
5635       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5636       int jaddr=(int)out;
5637       emit_jns(0);
5638       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5639     }
5640     else {
5641       int taken=0,nottaken=0,nottaken1=0;
5642       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5643       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5644       if(!only32)
5645       {
5646         assert(s1h>=0);
5647         if(opcode[i]==4) // BEQ
5648         {
5649           if(s2h>=0) emit_cmp(s1h,s2h);
5650           else emit_test(s1h,s1h);
5651           nottaken1=(int)out;
5652           emit_jne(1);
5653         }
5654         if(opcode[i]==5) // BNE
5655         {
5656           if(s2h>=0) emit_cmp(s1h,s2h);
5657           else emit_test(s1h,s1h);
5658           if(invert) taken=(int)out;
5659           else add_to_linker((int)out,ba[i],internal);
5660           emit_jne(0);
5661         }
5662         if(opcode[i]==6) // BLEZ
5663         {
5664           emit_test(s1h,s1h);
5665           if(invert) taken=(int)out;
5666           else add_to_linker((int)out,ba[i],internal);
5667           emit_js(0);
5668           nottaken1=(int)out;
5669           emit_jne(1);
5670         }
5671         if(opcode[i]==7) // BGTZ
5672         {
5673           emit_test(s1h,s1h);
5674           nottaken1=(int)out;
5675           emit_js(1);
5676           if(invert) taken=(int)out;
5677           else add_to_linker((int)out,ba[i],internal);
5678           emit_jne(0);
5679         }
5680       } // if(!only32)
5681           
5682       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5683       assert(s1l>=0);
5684       if(opcode[i]==4) // BEQ
5685       {
5686         if(s2l>=0) emit_cmp(s1l,s2l);
5687         else emit_test(s1l,s1l);
5688         if(invert){
5689           nottaken=(int)out;
5690           emit_jne(1);
5691         }else{
5692           add_to_linker((int)out,ba[i],internal);
5693           emit_jeq(0);
5694         }
5695       }
5696       if(opcode[i]==5) // BNE
5697       {
5698         if(s2l>=0) emit_cmp(s1l,s2l);
5699         else emit_test(s1l,s1l);
5700         if(invert){
5701           nottaken=(int)out;
5702           emit_jeq(1);
5703         }else{
5704           add_to_linker((int)out,ba[i],internal);
5705           emit_jne(0);
5706         }
5707       }
5708       if(opcode[i]==6) // BLEZ
5709       {
5710         emit_cmpimm(s1l,1);
5711         if(invert){
5712           nottaken=(int)out;
5713           emit_jge(1);
5714         }else{
5715           add_to_linker((int)out,ba[i],internal);
5716           emit_jl(0);
5717         }
5718       }
5719       if(opcode[i]==7) // BGTZ
5720       {
5721         emit_cmpimm(s1l,1);
5722         if(invert){
5723           nottaken=(int)out;
5724           emit_jl(1);
5725         }else{
5726           add_to_linker((int)out,ba[i],internal);
5727           emit_jge(0);
5728         }
5729       }
5730       if(invert) {
5731         if(taken) set_jump_target(taken,(int)out);
5732         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5733         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5734           if(adj) {
5735             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5736             add_to_linker((int)out,ba[i],internal);
5737           }else{
5738             emit_addnop(13);
5739             add_to_linker((int)out,ba[i],internal*2);
5740           }
5741           emit_jmp(0);
5742         }else
5743         #endif
5744         {
5745           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5746           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5747           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5748           if(internal)
5749             assem_debug("branch: internal\n");
5750           else
5751             assem_debug("branch: external\n");
5752           if(internal&&is_ds[(ba[i]-start)>>2]) {
5753             ds_assemble_entry(i);
5754           }
5755           else {
5756             add_to_linker((int)out,ba[i],internal);
5757             emit_jmp(0);
5758           }
5759         }
5760         set_jump_target(nottaken,(int)out);
5761       }
5762
5763       if(nottaken1) set_jump_target(nottaken1,(int)out);
5764       if(adj) {
5765         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5766       }
5767     } // (!unconditional)
5768   } // if(ooo)
5769   else
5770   {
5771     // In-order execution (branch first)
5772     //if(likely[i]) printf("IOL\n");
5773     //else
5774     //printf("IOE\n");
5775     int taken=0,nottaken=0,nottaken1=0;
5776     if(!unconditional&&!nop) {
5777       if(!only32)
5778       {
5779         assert(s1h>=0);
5780         if((opcode[i]&0x2f)==4) // BEQ
5781         {
5782           if(s2h>=0) emit_cmp(s1h,s2h);
5783           else emit_test(s1h,s1h);
5784           nottaken1=(int)out;
5785           emit_jne(2);
5786         }
5787         if((opcode[i]&0x2f)==5) // BNE
5788         {
5789           if(s2h>=0) emit_cmp(s1h,s2h);
5790           else emit_test(s1h,s1h);
5791           taken=(int)out;
5792           emit_jne(1);
5793         }
5794         if((opcode[i]&0x2f)==6) // BLEZ
5795         {
5796           emit_test(s1h,s1h);
5797           taken=(int)out;
5798           emit_js(1);
5799           nottaken1=(int)out;
5800           emit_jne(2);
5801         }
5802         if((opcode[i]&0x2f)==7) // BGTZ
5803         {
5804           emit_test(s1h,s1h);
5805           nottaken1=(int)out;
5806           emit_js(2);
5807           taken=(int)out;
5808           emit_jne(1);
5809         }
5810       } // if(!only32)
5811           
5812       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5813       assert(s1l>=0);
5814       if((opcode[i]&0x2f)==4) // BEQ
5815       {
5816         if(s2l>=0) emit_cmp(s1l,s2l);
5817         else emit_test(s1l,s1l);
5818         nottaken=(int)out;
5819         emit_jne(2);
5820       }
5821       if((opcode[i]&0x2f)==5) // BNE
5822       {
5823         if(s2l>=0) emit_cmp(s1l,s2l);
5824         else emit_test(s1l,s1l);
5825         nottaken=(int)out;
5826         emit_jeq(2);
5827       }
5828       if((opcode[i]&0x2f)==6) // BLEZ
5829       {
5830         emit_cmpimm(s1l,1);
5831         nottaken=(int)out;
5832         emit_jge(2);
5833       }
5834       if((opcode[i]&0x2f)==7) // BGTZ
5835       {
5836         emit_cmpimm(s1l,1);
5837         nottaken=(int)out;
5838         emit_jl(2);
5839       }
5840     } // if(!unconditional)
5841     int adj;
5842     uint64_t ds_unneeded=branch_regs[i].u;
5843     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5844     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5845     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5846     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5847     ds_unneeded|=1;
5848     ds_unneeded_upper|=1;
5849     // branch taken
5850     if(!nop) {
5851       if(taken) set_jump_target(taken,(int)out);
5852       assem_debug("1:\n");
5853       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5854                     ds_unneeded,ds_unneeded_upper);
5855       // load regs
5856       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5857       address_generation(i+1,&branch_regs[i],0);
5858       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5859       ds_assemble(i+1,&branch_regs[i]);
5860       cc=get_reg(branch_regs[i].regmap,CCREG);
5861       if(cc==-1) {
5862         emit_loadreg(CCREG,cc=HOST_CCREG);
5863         // CHECK: Is the following instruction (fall thru) allocated ok?
5864       }
5865       assert(cc==HOST_CCREG);
5866       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5867       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5868       assem_debug("cycle count (adj)\n");
5869       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5870       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5871       if(internal)
5872         assem_debug("branch: internal\n");
5873       else
5874         assem_debug("branch: external\n");
5875       if(internal&&is_ds[(ba[i]-start)>>2]) {
5876         ds_assemble_entry(i);
5877       }
5878       else {
5879         add_to_linker((int)out,ba[i],internal);
5880         emit_jmp(0);
5881       }
5882     }
5883     // branch not taken
5884     cop1_usable=prev_cop1_usable;
5885     if(!unconditional) {
5886       if(nottaken1) set_jump_target(nottaken1,(int)out);
5887       set_jump_target(nottaken,(int)out);
5888       assem_debug("2:\n");
5889       if(!likely[i]) {
5890         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5891                       ds_unneeded,ds_unneeded_upper);
5892         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5893         address_generation(i+1,&branch_regs[i],0);
5894         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5895         ds_assemble(i+1,&branch_regs[i]);
5896       }
5897       cc=get_reg(branch_regs[i].regmap,CCREG);
5898       if(cc==-1&&!likely[i]) {
5899         // Cycle count isn't in a register, temporarily load it then write it out
5900         emit_loadreg(CCREG,HOST_CCREG);
5901         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5902         int jaddr=(int)out;
5903         emit_jns(0);
5904         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5905         emit_storereg(CCREG,HOST_CCREG);
5906       }
5907       else{
5908         cc=get_reg(i_regmap,CCREG);
5909         assert(cc==HOST_CCREG);
5910         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5911         int jaddr=(int)out;
5912         emit_jns(0);
5913         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5914       }
5915     }
5916   }
5917 }
5918
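// Assemble the REGIMM branches (BLTZ/BGEZ and their AL/likely variants),
// including writing the return address to $31 for the "and link" forms.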
5919 void sjump_assemble(int i,struct regstat *i_regs)
5920 {
5921   signed char *i_regmap=i_regs->regmap;
5922   int cc;
5923   int match;
5924   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5925   assem_debug("smatch=%d\n",match);
5926   int s1h,s1l;
5927   int prev_cop1_usable=cop1_usable;
5928   int unconditional=0,nevertaken=0;
5929   int only32=0;
5930   int invert=0;
5931   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5932   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5933   if(!match) invert=1;
5934   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5935   if(i>(ba[i]-start)>>2) invert=1;
5936   #endif
5937
5938   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5939   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5940
5941   if(ooo[i]) {
5942     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5943     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5944   }
5945   else {
5946     s1l=get_reg(i_regmap,rs1[i]);
5947     s1h=get_reg(i_regmap,rs1[i]|64);
5948   }
5949   if(rs1[i]==0)
5950   {
5951     if(opcode2[i]&1) unconditional=1;
5952     else nevertaken=1;
5953     // These are never taken (r0 is never less than zero)
5954     //assert(opcode2[i]!=0);
5955     //assert(opcode2[i]!=2);
5956     //assert(opcode2[i]!=0x10);
5957     //assert(opcode2[i]!=0x12);
5958   }
5959   else {
5960     only32=(regs[i].was32>>rs1[i])&1;
5961   }
5962
5963   if(ooo[i]) {
5964     // Out of order execution (delay slot first)
5965     //printf("OOOE\n");
5966     address_generation(i+1,i_regs,regs[i].regmap_entry);
5967     ds_assemble(i+1,i_regs);
5968     int adj;
5969     uint64_t bc_unneeded=branch_regs[i].u;
5970     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5971     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5972     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5973     bc_unneeded|=1;
5974     bc_unneeded_upper|=1;
5975     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5976                   bc_unneeded,bc_unneeded_upper);
5977     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5978     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5979     if(rt1[i]==31) {
5980       int rt,return_address;
5981       rt=get_reg(branch_regs[i].regmap,31);
5982       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5983       if(rt>=0) {
5984         // Save the PC even if the branch is not taken
5985         return_address=start+i*4+8;
5986         emit_movimm(return_address,rt); // PC into link register
5987         #ifdef IMM_PREFETCH
5988         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5989         #endif
5990       }
5991     }
5992     cc=get_reg(branch_regs[i].regmap,CCREG);
5993     assert(cc==HOST_CCREG);
5994     if(unconditional) 
5995       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5996     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5997     assem_debug("cycle count (adj)\n");
5998     if(unconditional) {
5999       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
6000       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
6001         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6002         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6003         if(internal)
6004           assem_debug("branch: internal\n");
6005         else
6006           assem_debug("branch: external\n");
6007         if(internal&&is_ds[(ba[i]-start)>>2]) {
6008           ds_assemble_entry(i);
6009         }
6010         else {
6011           add_to_linker((int)out,ba[i],internal);
6012           emit_jmp(0);
6013         }
6014         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6015         if(((u_int)out)&7) emit_addnop(0);
6016         #endif
6017       }
6018     }
6019     else if(nevertaken) {
6020       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6021       int jaddr=(int)out;
6022       emit_jns(0);
6023       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6024     }
6025     else {
6026       int nottaken=0;
6027       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6028       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6029       if(!only32)
6030       {
6031         assert(s1h>=0);
6032         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6033         {
6034           emit_test(s1h,s1h);
6035           if(invert){
6036             nottaken=(int)out;
6037             emit_jns(1);
6038           }else{
6039             add_to_linker((int)out,ba[i],internal);
6040             emit_js(0);
6041           }
6042         }
6043         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6044         {
6045           emit_test(s1h,s1h);
6046           if(invert){
6047             nottaken=(int)out;
6048             emit_js(1);
6049           }else{
6050             add_to_linker((int)out,ba[i],internal);
6051             emit_jns(0);
6052           }
6053         }
6054       } // if(!only32)
6055       else
6056       {
6057         assert(s1l>=0);
6058         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6059         {
6060           emit_test(s1l,s1l);
6061           if(invert){
6062             nottaken=(int)out;
6063             emit_jns(1);
6064           }else{
6065             add_to_linker((int)out,ba[i],internal);
6066             emit_js(0);
6067           }
6068         }
6069         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6070         {
6071           emit_test(s1l,s1l);
6072           if(invert){
6073             nottaken=(int)out;
6074             emit_js(1);
6075           }else{
6076             add_to_linker((int)out,ba[i],internal);
6077             emit_jns(0);
6078           }
6079         }
6080       } // if(!only32)
6081           
6082       if(invert) {
6083         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6084         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6085           if(adj) {
6086             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6087             add_to_linker((int)out,ba[i],internal);
6088           }else{
6089             emit_addnop(13);
6090             add_to_linker((int)out,ba[i],internal*2);
6091           }
6092           emit_jmp(0);
6093         }else
6094         #endif
6095         {
6096           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6097           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6098           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6099           if(internal)
6100             assem_debug("branch: internal\n");
6101           else
6102             assem_debug("branch: external\n");
6103           if(internal&&is_ds[(ba[i]-start)>>2]) {
6104             ds_assemble_entry(i);
6105           }
6106           else {
6107             add_to_linker((int)out,ba[i],internal);
6108             emit_jmp(0);
6109           }
6110         }
6111         set_jump_target(nottaken,(int)out);
6112       }
6113
6114       if(adj) {
6115         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6116       }
6117     } // (!unconditional)
6118   } // if(ooo)
6119   else
6120   {
6121     // In-order execution (branch first)
6122     //printf("IOE\n");
6123     int nottaken=0;
6124     if(rt1[i]==31) {
6125       int rt,return_address;
6126       rt=get_reg(branch_regs[i].regmap,31);
6127       if(rt>=0) {
6128         // Save the PC even if the branch is not taken
6129         return_address=start+i*4+8;
6130         emit_movimm(return_address,rt); // PC into link register
6131         #ifdef IMM_PREFETCH
6132         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6133         #endif
6134       }
6135     }
6136     if(!unconditional) {
6137       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6138       if(!only32)
6139       {
6140         assert(s1h>=0);
6141         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6142         {
6143           emit_test(s1h,s1h);
6144           nottaken=(int)out;
6145           emit_jns(1);
6146         }
6147         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6148         {
6149           emit_test(s1h,s1h);
6150           nottaken=(int)out;
6151           emit_js(1);
6152         }
6153       } // if(!only32)
6154       else
6155       {
6156         assert(s1l>=0);
6157         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6158         {
6159           emit_test(s1l,s1l);
6160           nottaken=(int)out;
6161           emit_jns(1);
6162         }
6163         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6164         {
6165           emit_test(s1l,s1l);
6166           nottaken=(int)out;
6167           emit_js(1);
6168         }
6169       }
6170     } // if(!unconditional)
6171     int adj;
6172     uint64_t ds_unneeded=branch_regs[i].u;
6173     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6174     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6175     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6176     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6177     ds_unneeded|=1;
6178     ds_unneeded_upper|=1;
6179     // branch taken
6180     if(!nevertaken) {
6181       //assem_debug("1:\n");
6182       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6183                     ds_unneeded,ds_unneeded_upper);
6184       // load regs
6185       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6186       address_generation(i+1,&branch_regs[i],0);
6187       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6188       ds_assemble(i+1,&branch_regs[i]);
6189       cc=get_reg(branch_regs[i].regmap,CCREG);
6190       if(cc==-1) {
6191         emit_loadreg(CCREG,cc=HOST_CCREG);
6192         // CHECK: Is the following instruction (fall thru) allocated ok?
6193       }
6194       assert(cc==HOST_CCREG);
6195       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6196       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6197       assem_debug("cycle count (adj)\n");
6198       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6199       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6200       if(internal)
6201         assem_debug("branch: internal\n");
6202       else
6203         assem_debug("branch: external\n");
6204       if(internal&&is_ds[(ba[i]-start)>>2]) {
6205         ds_assemble_entry(i);
6206       }
6207       else {
6208         add_to_linker((int)out,ba[i],internal);
6209         emit_jmp(0);
6210       }
6211     }
6212     // branch not taken
6213     cop1_usable=prev_cop1_usable;
6214     if(!unconditional) {
6215       set_jump_target(nottaken,(int)out);
6216       assem_debug("1:\n");
6217       if(!likely[i]) {
6218         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6219                       ds_unneeded,ds_unneeded_upper);
6220         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6221         address_generation(i+1,&branch_regs[i],0);
6222         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6223         ds_assemble(i+1,&branch_regs[i]);
6224       }
6225       cc=get_reg(branch_regs[i].regmap,CCREG);
6226       if(cc==-1&&!likely[i]) {
6227         // Cycle count isn't in a register, temporarily load it then write it out
6228         emit_loadreg(CCREG,HOST_CCREG);
6229         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6230         int jaddr=(int)out;
6231         emit_jns(0);
6232         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6233         emit_storereg(CCREG,HOST_CCREG);
6234       }
6235       else{
6236         cc=get_reg(i_regmap,CCREG);
6237         assert(cc==HOST_CCREG);
6238         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6239         int jaddr=(int)out;
6240         emit_jns(0);
6241         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6242       }
6243     }
6244   }
6245 }
6246
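// Assemble BC1F/BC1T (and likely forms): check that cop1 is usable, test the
// FP condition bit (0x800000) in FSREG and branch accordingly.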
6247 void fjump_assemble(int i,struct regstat *i_regs)
6248 {
6249   signed char *i_regmap=i_regs->regmap;
6250   int cc;
6251   int match;
6252   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6253   assem_debug("fmatch=%d\n",match);
6254   int fs,cs;
6255   int eaddr;
6256   int invert=0;
6257   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6258   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6259   if(!match) invert=1;
6260   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6261   if(i>(ba[i]-start)>>2) invert=1;
6262   #endif
6263
6264   if(ooo[i]) {
6265     fs=get_reg(branch_regs[i].regmap,FSREG);
6266     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6267   }
6268   else {
6269     fs=get_reg(i_regmap,FSREG);
6270   }
6271
6272   // Check cop1 unusable
6273   if(!cop1_usable) {
6274     cs=get_reg(i_regmap,CSREG);
6275     assert(cs>=0);
6276     emit_testimm(cs,0x20000000);
6277     eaddr=(int)out;
6278     emit_jeq(0);
6279     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6280     cop1_usable=1;
6281   }
6282
6283   if(ooo[i]) {
6284     // Out of order execution (delay slot first)
6285     //printf("OOOE\n");
6286     ds_assemble(i+1,i_regs);
6287     int adj;
6288     uint64_t bc_unneeded=branch_regs[i].u;
6289     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6290     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6291     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6292     bc_unneeded|=1;
6293     bc_unneeded_upper|=1;
6294     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6295                   bc_unneeded,bc_unneeded_upper);
6296     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6297     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6298     cc=get_reg(branch_regs[i].regmap,CCREG);
6299     assert(cc==HOST_CCREG);
6300     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6301     assem_debug("cycle count (adj)\n");
6302     if(1) {
6303       int nottaken=0;
6304       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6305       if(1) {
6306         assert(fs>=0);
6307         emit_testimm(fs,0x800000);
6308         if(source[i]&0x10000) // BC1T
6309         {
6310           if(invert){
6311             nottaken=(int)out;
6312             emit_jeq(1);
6313           }else{
6314             add_to_linker((int)out,ba[i],internal);
6315             emit_jne(0);
6316           }
6317         }
6318         else // BC1F
6319         {
6320           if(invert){
6321             nottaken=(int)out;
6322             emit_jne(1);
6323           }else{
6324             add_to_linker((int)out,ba[i],internal);
6325             emit_jeq(0);
6326           }
6327         }
6328       } // if(!only32)
6329           
6330       if(invert) {
6331         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6332         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6333         else if(match) emit_addnop(13);
6334         #endif
6335         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6336         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6337         if(internal)
6338           assem_debug("branch: internal\n");
6339         else
6340           assem_debug("branch: external\n");
6341         if(internal&&is_ds[(ba[i]-start)>>2]) {
6342           ds_assemble_entry(i);
6343         }
6344         else {
6345           add_to_linker((int)out,ba[i],internal);
6346           emit_jmp(0);
6347         }
6348         set_jump_target(nottaken,(int)out);
6349       }
6350
6351       if(adj) {
6352         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6353       }
6354     } // (!unconditional)
6355   } // if(ooo)
6356   else
6357   {
6358     // In-order execution (branch first)
6359     //printf("IOE\n");
6360     int nottaken=0;
6361     if(1) {
6362       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6363       if(1) {
6364         assert(fs>=0);
6365         emit_testimm(fs,0x800000);
6366         if(source[i]&0x10000) // BC1T
6367         {
6368           nottaken=(int)out;
6369           emit_jeq(1);
6370         }
6371         else // BC1F
6372         {
6373           nottaken=(int)out;
6374           emit_jne(1);
6375         }
6376       }
6377     } // if(!unconditional)
6378     int adj;
6379     uint64_t ds_unneeded=branch_regs[i].u;
6380     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6381     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6382     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6383     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6384     ds_unneeded|=1;
6385     ds_unneeded_upper|=1;
6386     // branch taken
6387     //assem_debug("1:\n");
6388     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6389                   ds_unneeded,ds_unneeded_upper);
6390     // load regs
6391     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6392     address_generation(i+1,&branch_regs[i],0);
6393     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6394     ds_assemble(i+1,&branch_regs[i]);
6395     cc=get_reg(branch_regs[i].regmap,CCREG);
6396     if(cc==-1) {
6397       emit_loadreg(CCREG,cc=HOST_CCREG);
6398       // CHECK: Is the following instruction (fall thru) allocated ok?
6399     }
6400     assert(cc==HOST_CCREG);
6401     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6402     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6403     assem_debug("cycle count (adj)\n");
6404     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6405     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6406     if(internal)
6407       assem_debug("branch: internal\n");
6408     else
6409       assem_debug("branch: external\n");
6410     if(internal&&is_ds[(ba[i]-start)>>2]) {
6411       ds_assemble_entry(i);
6412     }
6413     else {
6414       add_to_linker((int)out,ba[i],internal);
6415       emit_jmp(0);
6416     }
6417
6418     // branch not taken
6419     if(1) { // <- FIXME (don't need this)
6420       set_jump_target(nottaken,(int)out);
6421       assem_debug("1:\n");
6422       if(!likely[i]) {
6423         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6424                       ds_unneeded,ds_unneeded_upper);
6425         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6426         address_generation(i+1,&branch_regs[i],0);
6427         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6428         ds_assemble(i+1,&branch_regs[i]);
6429       }
6430       cc=get_reg(branch_regs[i].regmap,CCREG);
6431       if(cc==-1&&!likely[i]) {
6432         // Cycle count isn't in a register, temporarily load it then write it out
6433         emit_loadreg(CCREG,HOST_CCREG);
6434         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6435         int jaddr=(int)out;
6436         emit_jns(0);
6437         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6438         emit_storereg(CCREG,HOST_CCREG);
6439       }
6440       else{
6441         cc=get_reg(i_regmap,CCREG);
6442         assert(cc==HOST_CCREG);
6443         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6444         int jaddr=(int)out;
6445         emit_jns(0);
6446         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6447       }
6448     }
6449   }
6450 }
6451
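// Assemble a branch whose delay slot crosses a page boundary: compute the
// branch target into a register (HOST_BTREG when available) so the jump can be
// completed after the delay slot on the next page is executed.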
6452 static void pagespan_assemble(int i,struct regstat *i_regs)
6453 {
6454   int s1l=get_reg(i_regs->regmap,rs1[i]);
6455   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6456   int s2l=get_reg(i_regs->regmap,rs2[i]);
6457   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6458   void *nt_branch=NULL;
6459   int taken=0;
6460   int nottaken=0;
6461   int unconditional=0;
6462   if(rs1[i]==0)
6463   {
6464     s1l=s2l;s1h=s2h;
6465     s2l=s2h=-1;
6466   }
6467   else if(rs2[i]==0)
6468   {
6469     s2l=s2h=-1;
6470   }
6471   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6472     s1h=s2h=-1;
6473   }
6474   int hr=0;
6475   int addr,alt,ntaddr;
6476   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6477   else {
6478     while(hr<HOST_REGS)
6479     {
6480       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6481          (i_regs->regmap[hr]&63)!=rs1[i] &&
6482          (i_regs->regmap[hr]&63)!=rs2[i] )
6483       {
6484         addr=hr++;break;
6485       }
6486       hr++;
6487     }
6488   }
6489   while(hr<HOST_REGS)
6490   {
6491     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6492        (i_regs->regmap[hr]&63)!=rs1[i] &&
6493        (i_regs->regmap[hr]&63)!=rs2[i] )
6494     {
6495       alt=hr++;break;
6496     }
6497     hr++;
6498   }
6499   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6500   {
6501     while(hr<HOST_REGS)
6502     {
6503       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6504          (i_regs->regmap[hr]&63)!=rs1[i] &&
6505          (i_regs->regmap[hr]&63)!=rs2[i] )
6506       {
6507         ntaddr=hr;break;
6508       }
6509       hr++;
6510     }
6511   }
6512   assert(hr<HOST_REGS);
6513   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6514     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6515   }
6516   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6517   if(opcode[i]==2) // J
6518   {
6519     unconditional=1;
6520   }
6521   if(opcode[i]==3) // JAL
6522   {
6523     // TODO: mini_ht
6524     int rt=get_reg(i_regs->regmap,31);
6525     emit_movimm(start+i*4+8,rt);
6526     unconditional=1;
6527   }
6528   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6529   {
6530     emit_mov(s1l,addr);
6531     if(opcode2[i]==9) // JALR
6532     {
6533       int rt=get_reg(i_regs->regmap,rt1[i]);
6534       emit_movimm(start+i*4+8,rt);
6535     }
6536   }
6537   if((opcode[i]&0x3f)==4) // BEQ
6538   {
6539     if(rs1[i]==rs2[i])
6540     {
6541       unconditional=1;
6542     }
6543     else
6544     #ifdef HAVE_CMOV_IMM
6545     if(s1h<0) {
6546       if(s2l>=0) emit_cmp(s1l,s2l);
6547       else emit_test(s1l,s1l);
6548       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6549     }
6550     else
6551     #endif
6552     {
6553       assert(s1l>=0);
6554       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6555       if(s1h>=0) {
6556         if(s2h>=0) emit_cmp(s1h,s2h);
6557         else emit_test(s1h,s1h);
6558         emit_cmovne_reg(alt,addr);
6559       }
6560       if(s2l>=0) emit_cmp(s1l,s2l);
6561       else emit_test(s1l,s1l);
6562       emit_cmovne_reg(alt,addr);
6563     }
6564   }
6565   if((opcode[i]&0x3f)==5) // BNE
6566   {
6567     #ifdef HAVE_CMOV_IMM
6568     if(s1h<0) {
6569       if(s2l>=0) emit_cmp(s1l,s2l);
6570       else emit_test(s1l,s1l);
6571       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6572     }
6573     else
6574     #endif
6575     {
6576       assert(s1l>=0);
6577       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6578       if(s1h>=0) {
6579         if(s2h>=0) emit_cmp(s1h,s2h);
6580         else emit_test(s1h,s1h);
6581         emit_cmovne_reg(alt,addr);
6582       }
6583       if(s2l>=0) emit_cmp(s1l,s2l);
6584       else emit_test(s1l,s1l);
6585       emit_cmovne_reg(alt,addr);
6586     }
6587   }
6588   if((opcode[i]&0x3f)==0x14) // BEQL
6589   {
6590     if(s1h>=0) {
6591       if(s2h>=0) emit_cmp(s1h,s2h);
6592       else emit_test(s1h,s1h);
6593       nottaken=(int)out;
6594       emit_jne(0);
6595     }
6596     if(s2l>=0) emit_cmp(s1l,s2l);
6597     else emit_test(s1l,s1l);
6598     if(nottaken) set_jump_target(nottaken,(int)out);
6599     nottaken=(int)out;
6600     emit_jne(0);
6601   }
6602   if((opcode[i]&0x3f)==0x15) // BNEL
6603   {
6604     if(s1h>=0) {
6605       if(s2h>=0) emit_cmp(s1h,s2h);
6606       else emit_test(s1h,s1h);
6607       taken=(int)out;
6608       emit_jne(0);
6609     }
6610     if(s2l>=0) emit_cmp(s1l,s2l);
6611     else emit_test(s1l,s1l);
6612     nottaken=(int)out;
6613     emit_jeq(0);
6614     if(taken) set_jump_target(taken,(int)out);
6615   }
6616   if((opcode[i]&0x3f)==6) // BLEZ
6617   {
6618     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6619     emit_cmpimm(s1l,1);
6620     if(s1h>=0) emit_mov(addr,ntaddr);
6621     emit_cmovl_reg(alt,addr);
6622     if(s1h>=0) {
6623       emit_test(s1h,s1h);
6624       emit_cmovne_reg(ntaddr,addr);
6625       emit_cmovs_reg(alt,addr);
6626     }
6627   }
6628   if((opcode[i]&0x3f)==7) // BGTZ
6629   {
6630     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6631     emit_cmpimm(s1l,1);
6632     if(s1h>=0) emit_mov(addr,alt);
6633     emit_cmovl_reg(ntaddr,addr);
6634     if(s1h>=0) {
6635       emit_test(s1h,s1h);
6636       emit_cmovne_reg(alt,addr);
6637       emit_cmovs_reg(ntaddr,addr);
6638     }
6639   }
6640   if((opcode[i]&0x3f)==0x16) // BLEZL
6641   {
6642     assert((opcode[i]&0x3f)!=0x16);
6643   }
6644   if((opcode[i]&0x3f)==0x17) // BGTZL
6645   {
6646     assert((opcode[i]&0x3f)!=0x17);
6647   }
6648   assert(opcode[i]!=1); // BLTZ/BGEZ
6649
6650   //FIXME: Check CSREG
6651   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6652     if((source[i]&0x30000)==0) // BC1F
6653     {
6654       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6655       emit_testimm(s1l,0x800000);
6656       emit_cmovne_reg(alt,addr);
6657     }
6658     if((source[i]&0x30000)==0x10000) // BC1T
6659     {
6660       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6661       emit_testimm(s1l,0x800000);
6662       emit_cmovne_reg(alt,addr);
6663     }
6664     if((source[i]&0x30000)==0x20000) // BC1FL
6665     {
6666       emit_testimm(s1l,0x800000);
6667       nottaken=(int)out;
6668       emit_jne(0);
6669     }
6670     if((source[i]&0x30000)==0x30000) // BC1TL
6671     {
6672       emit_testimm(s1l,0x800000);
6673       nottaken=(int)out;
6674       emit_jeq(0);
6675     }
6676   }
6677
6678   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6679   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6680   if(likely[i]||unconditional)
6681   {
6682     emit_movimm(ba[i],HOST_BTREG);
6683   }
6684   else if(addr!=HOST_BTREG)
6685   {
6686     emit_mov(addr,HOST_BTREG);
6687   }
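  // Emit the jump to the delay slot's code (the +1 in the target address
  // appears to mark it as a delay-slot entry).  If that code has already been
  // compiled, link to it directly; otherwise route through an external-jump
  // stub that compiles it on first execution.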
6688   void *branch_addr=out;
6689   emit_jmp(0);
6690   int target_addr=start+i*4+5;
6691   void *stub=out;
6692   void *compiled_target_addr=check_addr(target_addr);
6693   emit_extjump_ds((int)branch_addr,target_addr);
6694   if(compiled_target_addr) {
6695     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6696     add_link(target_addr,stub);
6697   }
6698   else set_jump_target((int)branch_addr,(int)stub);
6699   if(likely[i]) {
6700     // Not-taken path
6701     set_jump_target((int)nottaken,(int)out);
6702     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6703     void *branch_addr=out;
6704     emit_jmp(0);
6705     int target_addr=start+i*4+8;
6706     void *stub=out;
6707     void *compiled_target_addr=check_addr(target_addr);
6708     emit_extjump_ds((int)branch_addr,target_addr);
6709     if(compiled_target_addr) {
6710       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6711       add_link(target_addr,stub);
6712     }
6713     else set_jump_target((int)branch_addr,(int)stub);
6714   }
6715 }
6716
6717 // Assemble the delay slot for the above
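// The delay-slot instruction of the spanning branch is always at index 0 of
// this block.  After it executes, the saved branch target is compared with
// start+4: if they match execution simply continues with the next
// instruction, otherwise we jump indirectly through jump_vaddr to wherever
// the branch actually went.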
6718 static void pagespan_ds()
6719 {
6720   assem_debug("initial delay slot:\n");
6721   u_int vaddr=start+1;
6722   u_int page=get_page(vaddr);
6723   u_int vpage=get_vpage(vaddr);
6724   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6725   do_dirty_stub_ds();
6726   ll_add(jump_in+page,vaddr,(void *)out);
6727   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6728   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6729     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6730   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6731     emit_writeword(HOST_BTREG,(int)&branch_target);
6732   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6733   address_generation(0,&regs[0],regs[0].regmap_entry);
6734   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6735     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6736   cop1_usable=0;
6737   is_delayslot=0;
6738   switch(itype[0]) {
6739     case ALU:
6740       alu_assemble(0,&regs[0]);break;
6741     case IMM16:
6742       imm16_assemble(0,&regs[0]);break;
6743     case SHIFT:
6744       shift_assemble(0,&regs[0]);break;
6745     case SHIFTIMM:
6746       shiftimm_assemble(0,&regs[0]);break;
6747     case LOAD:
6748       load_assemble(0,&regs[0]);break;
6749     case LOADLR:
6750       loadlr_assemble(0,&regs[0]);break;
6751     case STORE:
6752       store_assemble(0,&regs[0]);break;
6753     case STORELR:
6754       storelr_assemble(0,&regs[0]);break;
6755     case COP0:
6756       cop0_assemble(0,&regs[0]);break;
6757     case COP1:
6758       cop1_assemble(0,&regs[0]);break;
6759     case C1LS:
6760       c1ls_assemble(0,&regs[0]);break;
6761     case COP2:
6762       cop2_assemble(0,&regs[0]);break;
6763     case C2LS:
6764       c2ls_assemble(0,&regs[0]);break;
6765     case C2OP:
6766       c2op_assemble(0,&regs[0]);break;
6767     case FCONV:
6768       fconv_assemble(0,&regs[0]);break;
6769     case FLOAT:
6770       float_assemble(0,&regs[0]);break;
6771     case FCOMP:
6772       fcomp_assemble(0,&regs[0]);break;
6773     case MULTDIV:
6774       multdiv_assemble(0,&regs[0]);break;
6775     case MOV:
6776       mov_assemble(0,&regs[0]);break;
6777     case SYSCALL:
6778     case HLECALL:
6779     case INTCALL:
6780     case SPAN:
6781     case UJUMP:
6782     case RJUMP:
6783     case CJUMP:
6784     case SJUMP:
6785     case FJUMP:
6786       printf("Jump in the delay slot.  This is probably a bug.\n");
6787   }
6788   int btaddr=get_reg(regs[0].regmap,BTREG);
6789   if(btaddr<0) {
6790     btaddr=get_reg(regs[0].regmap,-1);
6791     emit_readword((int)&branch_target,btaddr);
6792   }
6793   assert(btaddr!=HOST_CCREG);
6794   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6795 #ifdef HOST_IMM8
6796   emit_movimm(start+4,HOST_TEMPREG);
6797   emit_cmp(btaddr,HOST_TEMPREG);
6798 #else
6799   emit_cmpimm(btaddr,start+4);
6800 #endif
6801   int branch=(int)out;
6802   emit_jeq(0);
6803   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6804   emit_jmp(jump_vaddr_reg[btaddr]);
6805   set_jump_target(branch,(int)out);
6806   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6807   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6808 }
6809
6810 // Basic liveness analysis for MIPS registers
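// Walks the block backwards computing, per instruction, a bit vector of MIPS
// registers whose current values are dead (never read again before being
// overwritten): bit r of unneeded_reg[i] set means register r need not be
// preserved at instruction i, e.g. ((unneeded_reg[i]>>r)&1) tests for it.
// unneeded_reg_upper[] tracks only the upper 32 bits of each register and
// gte_unneeded[] does the same for the GTE data/control registers.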
6811 void unneeded_registers(int istart,int iend,int r)
6812 {
6813   int i;
6814   uint64_t u,uu,gte_u,b,bu,gte_bu;
6815   uint64_t temp_u,temp_uu,temp_gte_u=0;
6816   uint64_t tdep;
6817   uint64_t gte_u_unknown=0;
6818   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6819     gte_u_unknown=~0ll;
6820   if(iend==slen-1) {
6821     u=1;uu=1;
6822     gte_u=gte_u_unknown;
6823   }else{
6824     u=unneeded_reg[iend+1];
6825     uu=unneeded_reg_upper[iend+1];
6826     u=1;uu=1;
6827     gte_u=gte_unneeded[iend+1];
6828   }
6829
6830   for (i=iend;i>=istart;i--)
6831   {
6832     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6833     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6834     {
6835       // If subroutine call, flag return address as a possible branch target
6836       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6837       
6838       if(ba[i]<start || ba[i]>=(start+slen*4))
6839       {
6840         // Branch out of this block, flush all regs
6841         u=1;
6842         uu=1;
6843         gte_u=gte_u_unknown;
6844         /* Hexagon hack 
6845         if(itype[i]==UJUMP&&rt1[i]==31)
6846         {
6847           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6848         }
6849         if(itype[i]==RJUMP&&rs1[i]==31)
6850         {
6851           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6852         }
6853         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6854           if(itype[i]==UJUMP&&rt1[i]==31)
6855           {
6856             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6857             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6858           }
6859           if(itype[i]==RJUMP&&rs1[i]==31)
6860           {
6861             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6862             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6863           }
6864         }*/
6865         branch_unneeded_reg[i]=u;
6866         branch_unneeded_reg_upper[i]=uu;
6867         // Merge in delay slot
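        // The delay slot executes on both paths, so registers it writes
        // become unneeded before it and registers it reads become needed.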
6868         tdep=(~uu>>rt1[i+1])&1;
6869         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6870         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6871         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6872         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6873         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6874         u|=1;uu|=1;
6875         gte_u|=gte_rt[i+1];
6876         gte_u&=~gte_rs[i+1];
6877         // If branch is "likely" (and conditional)
6878         // then we skip the delay slot on the fall-thru path
6879         if(likely[i]) {
6880           if(i<slen-1) {
6881             u&=unneeded_reg[i+2];
6882             uu&=unneeded_reg_upper[i+2];
6883             gte_u&=gte_unneeded[i+2];
6884           }
6885           else
6886           {
6887             u=1;
6888             uu=1;
6889             gte_u=gte_u_unknown;
6890           }
6891         }
6892       }
6893       else
6894       {
6895         // Internal branch, flag target
6896         bt[(ba[i]-start)>>2]=1;
6897         if(ba[i]<=start+i*4) {
6898           // Backward branch
6899           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6900           {
6901             // Unconditional branch
6902             temp_u=1;temp_uu=1;
6903             temp_gte_u=0;
6904           } else {
6905             // Conditional branch (not taken case)
6906             temp_u=unneeded_reg[i+2];
6907             temp_uu=unneeded_reg_upper[i+2];
6908             temp_gte_u&=gte_unneeded[i+2];
6909           }
6910           // Merge in delay slot
6911           tdep=(~temp_uu>>rt1[i+1])&1;
6912           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6913           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6914           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6915           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6916           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6917           temp_u|=1;temp_uu|=1;
6918           temp_gte_u|=gte_rt[i+1];
6919           temp_gte_u&=~gte_rs[i+1];
6920           // If branch is "likely" (and conditional)
6921           // then we skip the delay slot on the fall-thru path
6922           if(likely[i]) {
6923             if(i<slen-1) {
6924               temp_u&=unneeded_reg[i+2];
6925               temp_uu&=unneeded_reg_upper[i+2];
6926               temp_gte_u&=gte_unneeded[i+2];
6927             }
6928             else
6929             {
6930               temp_u=1;
6931               temp_uu=1;
6932               temp_gte_u=gte_u_unknown;
6933             }
6934           }
6935           tdep=(~temp_uu>>rt1[i])&1;
6936           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6937           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6938           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6939           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6940           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6941           temp_u|=1;temp_uu|=1;
6942           temp_gte_u|=gte_rt[i];
6943           temp_gte_u&=~gte_rs[i];
6944           unneeded_reg[i]=temp_u;
6945           unneeded_reg_upper[i]=temp_uu;
6946           gte_unneeded[i]=temp_gte_u;
6947           // Only go three levels deep.  This recursion can take an
6948           // excessive amount of time if there are a lot of nested loops.
6949           if(r<2) {
6950             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6951           }else{
6952             unneeded_reg[(ba[i]-start)>>2]=1;
6953             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6954             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6955           }
6956         } /*else*/ if(1) {
6957           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6958           {
6959             // Unconditional branch
6960             u=unneeded_reg[(ba[i]-start)>>2];
6961             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6962             gte_u=gte_unneeded[(ba[i]-start)>>2];
6963             branch_unneeded_reg[i]=u;
6964             branch_unneeded_reg_upper[i]=uu;
6965         //u=1;
6966         //uu=1;
6967         //branch_unneeded_reg[i]=u;
6968         //branch_unneeded_reg_upper[i]=uu;
6969             // Merge in delay slot
6970             tdep=(~uu>>rt1[i+1])&1;
6971             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6972             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6973             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6974             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6975             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6976             u|=1;uu|=1;
6977             gte_u|=gte_rt[i+1];
6978             gte_u&=~gte_rs[i+1];
6979           } else {
6980             // Conditional branch
6981             b=unneeded_reg[(ba[i]-start)>>2];
6982             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6983             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6984             branch_unneeded_reg[i]=b;
6985             branch_unneeded_reg_upper[i]=bu;
6986         //b=1;
6987         //bu=1;
6988         //branch_unneeded_reg[i]=b;
6989         //branch_unneeded_reg_upper[i]=bu;
6990             // Branch delay slot
6991             tdep=(~uu>>rt1[i+1])&1;
6992             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6993             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6994             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6995             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6996             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6997             b|=1;bu|=1;
6998             gte_bu|=gte_rt[i+1];
6999             gte_bu&=~gte_rs[i+1];
7000             // If branch is "likely" then we skip the
7001             // delay slot on the fall-thru path
7002             if(likely[i]) {
7003               u=b;
7004               uu=bu;
7005               gte_u=gte_bu;
7006               if(i<slen-1) {
7007                 u&=unneeded_reg[i+2];
7008                 uu&=unneeded_reg_upper[i+2];
7009                 gte_u&=gte_unneeded[i+2];
7010         //u=1;
7011         //uu=1;
7012               }
7013             } else {
7014               u&=b;
7015               uu&=bu;
7016               gte_u&=gte_bu;
7017         //u=1;
7018         //uu=1;
7019             }
7020             if(i<slen-1) {
7021               branch_unneeded_reg[i]&=unneeded_reg[i+2];
7022               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
7023         //branch_unneeded_reg[i]=1;
7024         //branch_unneeded_reg_upper[i]=1;
7025             } else {
7026               branch_unneeded_reg[i]=1;
7027               branch_unneeded_reg_upper[i]=1;
7028             }
7029           }
7030         }
7031       }
7032     }
7033     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7034     {
7035       // SYSCALL instruction (software interrupt)
7036       u=1;
7037       uu=1;
7038     }
7039     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7040     {
7041       // ERET instruction (return from interrupt)
7042       u=1;
7043       uu=1;
7044     }
7045     //u=uu=1; // DEBUG
7046     tdep=(~uu>>rt1[i])&1;
7047     // Written registers are unneeded
7048     u|=1LL<<rt1[i];
7049     u|=1LL<<rt2[i];
7050     uu|=1LL<<rt1[i];
7051     uu|=1LL<<rt2[i];
7052     gte_u|=gte_rt[i];
7053     // Accessed registers are needed
7054     u&=~(1LL<<rs1[i]);
7055     u&=~(1LL<<rs2[i]);
7056     uu&=~(1LL<<us1[i]);
7057     uu&=~(1LL<<us2[i]);
7058     gte_u&=~gte_rs[i];
7059     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
7060       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
7061     // Source-target dependencies
7062     uu&=~(tdep<<dep1[i]);
7063     uu&=~(tdep<<dep2[i]);
7064     // R0 is always unneeded
7065     u|=1;uu|=1;
7066     // Save it
7067     unneeded_reg[i]=u;
7068     unneeded_reg_upper[i]=uu;
7069     gte_unneeded[i]=gte_u;
7070     /*
7071     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
7072     printf("U:");
7073     int r;
7074     for(r=1;r<=CCREG;r++) {
7075       if((unneeded_reg[i]>>r)&1) {
7076         if(r==HIREG) printf(" HI");
7077         else if(r==LOREG) printf(" LO");
7078         else printf(" r%d",r);
7079       }
7080     }
7081     printf(" UU:");
7082     for(r=1;r<=CCREG;r++) {
7083       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
7084         if(r==HIREG) printf(" HI");
7085         else if(r==LOREG) printf(" LO");
7086         else printf(" r%d",r);
7087       }
7088     }
7089     printf("\n");*/
7090   }
7091 #ifdef FORCE32
7092   for (i=iend;i>=istart;i--)
7093   {
7094     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7095   }
7096 #endif
7097 }
7098
7099 // Identify registers which are likely to contain 32-bit values
7100 // This is used to predict whether any branches will jump to a
7101 // location with 64-bit values in registers.
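// Forward pass: p32[i] gets a bit set for each MIPS register whose value is
// provisionally believed to be a sign-extended 32-bit value after
// instruction i; 64-bit producing ops clear the bit, 32-bit ops set it, and
// branch targets merge the estimates of all branches that can reach them.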
7102 static void provisional_32bit()
7103 {
7104   int i,j;
7105   uint64_t is32=1;
7106   uint64_t lastbranch=1;
7107   
7108   for(i=0;i<slen;i++)
7109   {
7110     if(i>0) {
7111       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7112         if(i>1) is32=lastbranch;
7113         else is32=1;
7114       }
7115     }
7116     if(i>1)
7117     {
7118       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7119         if(likely[i-2]) {
7120           if(i>2) is32=lastbranch;
7121           else is32=1;
7122         }
7123       }
7124       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7125       {
7126         if(rs1[i-2]==0||rs2[i-2]==0)
7127         {
7128           if(rs1[i-2]) {
7129             is32|=1LL<<rs1[i-2];
7130           }
7131           if(rs2[i-2]) {
7132             is32|=1LL<<rs2[i-2];
7133           }
7134         }
7135       }
7136     }
7137     // If something jumps here with 64-bit values
7138     // then promote those registers to 64 bits
7139     if(bt[i])
7140     {
7141       uint64_t temp_is32=is32;
7142       for(j=i-1;j>=0;j--)
7143       {
7144         if(ba[j]==start+i*4) 
7145           //temp_is32&=branch_regs[j].is32;
7146           temp_is32&=p32[j];
7147       }
7148       for(j=i;j<slen;j++)
7149       {
7150         if(ba[j]==start+i*4) 
7151           temp_is32=1;
7152       }
7153       is32=temp_is32;
7154     }
7155     int type=itype[i];
7156     int op=opcode[i];
7157     int op2=opcode2[i];
7158     int rt=rt1[i];
7159     int s1=rs1[i];
7160     int s2=rs2[i];
7161     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7162       // Branches don't write registers, consider the delay slot instead.
7163       type=itype[i+1];
7164       op=opcode[i+1];
7165       op2=opcode2[i+1];
7166       rt=rt1[i+1];
7167       s1=rs1[i+1];
7168       s2=rs2[i+1];
7169       lastbranch=is32;
7170     }
7171     switch(type) {
7172       case LOAD:
7173         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7174            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7175           is32&=~(1LL<<rt);
7176         else
7177           is32|=1LL<<rt;
7178         break;
7179       case STORE:
7180       case STORELR:
7181         break;
7182       case LOADLR:
7183         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7184         if(op==0x22) is32|=1LL<<rt; // LWL
7185         break;
7186       case IMM16:
7187         if (op==0x08||op==0x09|| // ADDI/ADDIU
7188             op==0x0a||op==0x0b|| // SLTI/SLTIU
7189             op==0x0c|| // ANDI
7190             op==0x0f)  // LUI
7191         {
7192           is32|=1LL<<rt;
7193         }
7194         if(op==0x18||op==0x19) { // DADDI/DADDIU
7195           is32&=~(1LL<<rt);
7196           //if(imm[i]==0)
7197           //  is32|=((is32>>s1)&1LL)<<rt;
7198         }
7199         if(op==0x0d||op==0x0e) { // ORI/XORI
7200           uint64_t sr=((is32>>s1)&1LL);
7201           is32&=~(1LL<<rt);
7202           is32|=sr<<rt;
7203         }
7204         break;
7205       case UJUMP:
7206         break;
7207       case RJUMP:
7208         break;
7209       case CJUMP:
7210         break;
7211       case SJUMP:
7212         break;
7213       case FJUMP:
7214         break;
7215       case ALU:
7216         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7217           is32|=1LL<<rt;
7218         }
7219         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7220           is32|=1LL<<rt;
7221         }
7222         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7223           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7224           is32&=~(1LL<<rt);
7225           is32|=sr<<rt;
7226         }
7227         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7228           if(s1==0&&s2==0) {
7229             is32|=1LL<<rt;
7230           }
7231           else if(s2==0) {
7232             uint64_t sr=((is32>>s1)&1LL);
7233             is32&=~(1LL<<rt);
7234             is32|=sr<<rt;
7235           }
7236           else if(s1==0) {
7237             uint64_t sr=((is32>>s2)&1LL);
7238             is32&=~(1LL<<rt);
7239             is32|=sr<<rt;
7240           }
7241           else {
7242             is32&=~(1LL<<rt);
7243           }
7244         }
7245         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7246           if(s1==0&&s2==0) {
7247             is32|=1LL<<rt;
7248           }
7249           else if(s2==0) {
7250             uint64_t sr=((is32>>s1)&1LL);
7251             is32&=~(1LL<<rt);
7252             is32|=sr<<rt;
7253           }
7254           else {
7255             is32&=~(1LL<<rt);
7256           }
7257         }
7258         break;
7259       case MULTDIV:
7260         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7261           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7262         }
7263         else {
7264           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7265         }
7266         break;
7267       case MOV:
7268         {
7269           uint64_t sr=((is32>>s1)&1LL);
7270           is32&=~(1LL<<rt);
7271           is32|=sr<<rt;
7272         }
7273         break;
7274       case SHIFT:
7275         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7276         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7277         break;
7278       case SHIFTIMM:
7279         is32|=1LL<<rt;
7280         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7281         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7282         break;
7283       case COP0:
7284         if(op2==0) is32|=1LL<<rt; // MFC0
7285         break;
7286       case COP1:
7287       case COP2:
7288         if(op2==0) is32|=1LL<<rt; // MFC1
7289         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7290         if(op2==2) is32|=1LL<<rt; // CFC1
7291         break;
7292       case C1LS:
7293       case C2LS:
7294         break;
7295       case FLOAT:
7296       case FCONV:
7297         break;
7298       case FCOMP:
7299         break;
7300       case C2OP:
7301       case SYSCALL:
7302       case HLECALL:
7303         break;
7304       default:
7305         break;
7306     }
7307     is32|=1;
7308     p32[i]=is32;
7309
7310     if(i>0)
7311     {
7312       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7313       {
7314         if(rt1[i-1]==31) // JAL/JALR
7315         {
7316           // Subroutine call will return here, don't alloc any registers
7317           is32=1;
7318         }
7319         else if(i+1<slen)
7320         {
7321           // Internal branch will jump here, match registers to caller
7322           is32=0x3FFFFFFFFLL;
7323         }
7324       }
7325     }
7326   }
7327 }
7328
7329 // Identify registers which may be assumed to contain 32-bit values
7330 // and where optimizations will rely on this.
7331 // This is used to determine whether backward branches can safely
7332 // jump to a location with 64-bit values in registers.
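// Backward pass: pr32[i] collects the registers that later code relies on
// holding proper 32-bit (sign-extended) values at instruction i, so a branch
// jumping here only needs to guarantee 32-bit-ness for those registers.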
7333 static void provisional_r32()
7334 {
7335   u_int r32=0;
7336   int i;
7337   
7338   for (i=slen-1;i>=0;i--)
7339   {
7340     int hr;
7341     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7342     {
7343       if(ba[i]<start || ba[i]>=(start+slen*4))
7344       {
7345         // Branch out of this block, don't need anything
7346         r32=0;
7347       }
7348       else
7349       {
7350         // Internal branch
7351         // Need whatever matches the target
7352         // (and doesn't get overwritten by the delay slot instruction)
7353         r32=0;
7354         int t=(ba[i]-start)>>2;
7355         if(ba[i]>start+i*4) {
7356           // Forward branch
7357           //if(!(requires_32bit[t]&~regs[i].was32))
7358           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7359           if(!(pr32[t]&~regs[i].was32))
7360             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7361         }else{
7362           // Backward branch
7363           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7364             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7365         }
7366       }
7367       // Conditional branch may need registers for following instructions
7368       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7369       {
7370         if(i<slen-2) {
7371           //r32|=requires_32bit[i+2];
7372           r32|=pr32[i+2];
7373           r32&=regs[i].was32;
7374           // Mark this address as a branch target since it may be called
7375           // upon return from interrupt
7376           //bt[i+2]=1;
7377         }
7378       }
7379       // Merge in delay slot
7380       if(!likely[i]) {
7381         // These are overwritten unless the branch is "likely"
7382         // and the delay slot is nullified if not taken
7383         r32&=~(1LL<<rt1[i+1]);
7384         r32&=~(1LL<<rt2[i+1]);
7385       }
7386       // Assume these are needed (delay slot)
7387       if(us1[i+1]>0)
7388       {
7389         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7390       }
7391       if(us2[i+1]>0)
7392       {
7393         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7394       }
7395       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7396       {
7397         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7398       }
7399       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7400       {
7401         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7402       }
7403     }
7404     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7405     {
7406       // SYSCALL instruction (software interrupt)
7407       r32=0;
7408     }
7409     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7410     {
7411       // ERET instruction (return from interrupt)
7412       r32=0;
7413     }
7414     // Check 32 bits
7415     r32&=~(1LL<<rt1[i]);
7416     r32&=~(1LL<<rt2[i]);
7417     if(us1[i]>0)
7418     {
7419       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7420     }
7421     if(us2[i]>0)
7422     {
7423       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7424     }
7425     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7426     {
7427       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7428     }
7429     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7430     {
7431       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7432     }
7433     //requires_32bit[i]=r32;
7434     pr32[i]=r32;
7435     
7436     // Dirty registers which are 32-bit require 32-bit input,
7437     // as they will be written as 32-bit values
7438     for(hr=0;hr<HOST_REGS;hr++)
7439     {
7440       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7441         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7442           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7443           pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7444           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7445         }
7446       }
7447     }
7448   }
7449 }
7450
7451 // Write back dirty registers as soon as we will no longer modify them,
7452 // so that we don't end up with lots of writes at the branches.
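// will_dirty[]/wont_dirty[] are per-host-register bitmasks propagated
// backwards through the block (and, with limited recursion, across internal
// branches); when wr is nonzero the result is folded into regs[].dirty and
// branch_regs[].dirty so registers are flushed at the points this pass picks.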
7453 void clean_registers(int istart,int iend,int wr)
7454 {
7455   int i;
7456   int r;
7457   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7458   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7459   if(iend==slen-1) {
7460     will_dirty_i=will_dirty_next=0;
7461     wont_dirty_i=wont_dirty_next=0;
7462   }else{
7463     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7464     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7465   }
7466   for (i=iend;i>=istart;i--)
7467   {
7468     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7469     {
7470       if(ba[i]<start || ba[i]>=(start+slen*4))
7471       {
7472         // Branch out of this block, flush all regs
7473         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7474         {
7475           // Unconditional branch
7476           will_dirty_i=0;
7477           wont_dirty_i=0;
7478           // Merge in delay slot (will dirty)
7479           for(r=0;r<HOST_REGS;r++) {
7480             if(r!=EXCLUDE_REG) {
7481               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7482               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7483               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7484               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7485               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7486               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7487               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7488               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7489               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7490               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7491               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7492               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7493               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7494               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7495             }
7496           }
7497         }
7498         else
7499         {
7500           // Conditional branch
7501           will_dirty_i=0;
7502           wont_dirty_i=wont_dirty_next;
7503           // Merge in delay slot (will dirty)
7504           for(r=0;r<HOST_REGS;r++) {
7505             if(r!=EXCLUDE_REG) {
7506               if(!likely[i]) {
7507                 // Might not dirty if likely branch is not taken
7508                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7509                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7510                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7511                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7512                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7513                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7514                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7515                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7516                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7517                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7518                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7519                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7520                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7521                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7522               }
7523             }
7524           }
7525         }
7526         // Merge in delay slot (wont dirty)
7527         for(r=0;r<HOST_REGS;r++) {
7528           if(r!=EXCLUDE_REG) {
7529             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7530             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7531             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7532             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7533             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7534             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7535             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7536             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7537             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7538             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7539           }
7540         }
7541         if(wr) {
7542           #ifndef DESTRUCTIVE_WRITEBACK
7543           branch_regs[i].dirty&=wont_dirty_i;
7544           #endif
7545           branch_regs[i].dirty|=will_dirty_i;
7546         }
7547       }
7548       else
7549       {
7550         // Internal branch
7551         if(ba[i]<=start+i*4) {
7552           // Backward branch
7553           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7554           {
7555             // Unconditional branch
7556             temp_will_dirty=0;
7557             temp_wont_dirty=0;
7558             // Merge in delay slot (will dirty)
7559             for(r=0;r<HOST_REGS;r++) {
7560               if(r!=EXCLUDE_REG) {
7561                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7562                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7563                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7564                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7565                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7566                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7567                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7568                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7569                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7570                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7571                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7572                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7573                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7574                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7575               }
7576             }
7577           } else {
7578             // Conditional branch (not taken case)
7579             temp_will_dirty=will_dirty_next;
7580             temp_wont_dirty=wont_dirty_next;
7581             // Merge in delay slot (will dirty)
7582             for(r=0;r<HOST_REGS;r++) {
7583               if(r!=EXCLUDE_REG) {
7584                 if(!likely[i]) {
7585                   // Will not dirty if likely branch is not taken
7586                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7587                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7588                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7589                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7590                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7591                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7592                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7593                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7594                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7595                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7596                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7597                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7598                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7599                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7600                 }
7601               }
7602             }
7603           }
7604           // Merge in delay slot (wont dirty)
7605           for(r=0;r<HOST_REGS;r++) {
7606             if(r!=EXCLUDE_REG) {
7607               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7608               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7609               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7610               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7611               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7612               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7613               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7614               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7615               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7616               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7617             }
7618           }
7619           // Deal with changed mappings
7620           if(i<iend) {
7621             for(r=0;r<HOST_REGS;r++) {
7622               if(r!=EXCLUDE_REG) {
7623                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7624                   temp_will_dirty&=~(1<<r);
7625                   temp_wont_dirty&=~(1<<r);
7626                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7627                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7628                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7629                   } else {
7630                     temp_will_dirty|=1<<r;
7631                     temp_wont_dirty|=1<<r;
7632                   }
7633                 }
7634               }
7635             }
7636           }
7637           if(wr) {
7638             will_dirty[i]=temp_will_dirty;
7639             wont_dirty[i]=temp_wont_dirty;
7640             clean_registers((ba[i]-start)>>2,i-1,0);
7641           }else{
7642             // Limit recursion.  It can take an excessive amount
7643             // of time if there are a lot of nested loops.
7644             will_dirty[(ba[i]-start)>>2]=0;
7645             wont_dirty[(ba[i]-start)>>2]=-1;
7646           }
7647         }
7648         /*else*/ if(1)
7649         {
7650           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7651           {
7652             // Unconditional branch
7653             will_dirty_i=0;
7654             wont_dirty_i=0;
7655           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7656             for(r=0;r<HOST_REGS;r++) {
7657               if(r!=EXCLUDE_REG) {
7658                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7659                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7660                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7661                 }
7662                 if(branch_regs[i].regmap[r]>=0) {
7663                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7664                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7665                 }
7666               }
7667             }
7668           //}
7669             // Merge in delay slot
7670             for(r=0;r<HOST_REGS;r++) {
7671               if(r!=EXCLUDE_REG) {
7672                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7673                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7674                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7675                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7676                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7677                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7678                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7679                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7680                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7681                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7682                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7683                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7684                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7685                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7686               }
7687             }
7688           } else {
7689             // Conditional branch
7690             will_dirty_i=will_dirty_next;
7691             wont_dirty_i=wont_dirty_next;
7692           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7693             for(r=0;r<HOST_REGS;r++) {
7694               if(r!=EXCLUDE_REG) {
7695                 signed char target_reg=branch_regs[i].regmap[r];
7696                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7697                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7698                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7699                 }
7700                 else if(target_reg>=0) {
7701                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7702                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7703                 }
7704                 // Treat delay slot as part of branch too
7705                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7706                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7707                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7708                 }
7709                 else
7710                 {
7711                   will_dirty[i+1]&=~(1<<r);
7712                 }*/
7713               }
7714             }
7715           //}
7716             // Merge in delay slot
7717             for(r=0;r<HOST_REGS;r++) {
7718               if(r!=EXCLUDE_REG) {
7719                 if(!likely[i]) {
7720                   // Might not dirty if likely branch is not taken
7721                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7722                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7723                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7724                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7725                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7726                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7727                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7728                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7729                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7730                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7731                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7732                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7733                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7734                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7735                 }
7736               }
7737             }
7738           }
7739           // Merge in delay slot (won't dirty)
7740           for(r=0;r<HOST_REGS;r++) {
7741             if(r!=EXCLUDE_REG) {
7742               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7743               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7744               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7745               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7746               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7747               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7748               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7749               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7750               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7751               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7752             }
7753           }
7754           if(wr) {
7755             #ifndef DESTRUCTIVE_WRITEBACK
7756             branch_regs[i].dirty&=wont_dirty_i;
7757             #endif
7758             branch_regs[i].dirty|=will_dirty_i;
7759           }
7760         }
7761       }
7762     }
7763     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7764     {
7765       // SYSCALL instruction (software interrupt)
7766       will_dirty_i=0;
7767       wont_dirty_i=0;
7768     }
7769     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7770     {
7771       // ERET instruction (return from interrupt)
7772       will_dirty_i=0;
7773       wont_dirty_i=0;
7774     }
7775     will_dirty_next=will_dirty_i;
7776     wont_dirty_next=wont_dirty_i;
7777     for(r=0;r<HOST_REGS;r++) {
7778       if(r!=EXCLUDE_REG) {
7779         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7780         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7781         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7782         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7783         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7784         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7785         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7786         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7787         if(i>istart) {
7788           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7789           {
7790             // Don't store a register immediately after writing it,
7791             // as doing so may prevent dual-issue.
7792             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7793             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7794           }
7795         }
7796       }
7797     }
7798     // Save it
7799     will_dirty[i]=will_dirty_i;
7800     wont_dirty[i]=wont_dirty_i;
7801     // Mark registers that won't be dirtied as not dirty
7802     if(wr) {
7803       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7804       for(r=0;r<HOST_REGS;r++) {
7805         if((will_dirty_i>>r)&1) {
7806           printf(" r%d",r);
7807         }
7808       }
7809       printf("\n");*/
7810
7811       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7812         regs[i].dirty|=will_dirty_i;
7813         #ifndef DESTRUCTIVE_WRITEBACK
7814         regs[i].dirty&=wont_dirty_i;
7815         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7816         {
7817           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7818             for(r=0;r<HOST_REGS;r++) {
7819               if(r!=EXCLUDE_REG) {
7820                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7821                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7822                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7823               }
7824             }
7825           }
7826         }
7827         else
7828         {
7829           if(i<iend) {
7830             for(r=0;r<HOST_REGS;r++) {
7831               if(r!=EXCLUDE_REG) {
7832                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7833                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7834                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7835               }
7836             }
7837           }
7838         }
7839         #endif
7840       //}
7841     }
7842     // Deal with changed mappings
7843     temp_will_dirty=will_dirty_i;
7844     temp_wont_dirty=wont_dirty_i;
7845     for(r=0;r<HOST_REGS;r++) {
7846       if(r!=EXCLUDE_REG) {
7847         int nr;
7848         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7849           if(wr) {
7850             #ifndef DESTRUCTIVE_WRITEBACK
7851             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7852             #endif
7853             regs[i].wasdirty|=will_dirty_i&(1<<r);
7854           }
7855         }
7856         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7857           // Register moved to a different register
7858           will_dirty_i&=~(1<<r);
7859           wont_dirty_i&=~(1<<r);
7860           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7861           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7862           if(wr) {
7863             #ifndef DESTRUCTIVE_WRITEBACK
7864             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7865             #endif
7866             regs[i].wasdirty|=will_dirty_i&(1<<r);
7867           }
7868         }
7869         else {
7870           will_dirty_i&=~(1<<r);
7871           wont_dirty_i&=~(1<<r);
7872           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7873             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7874             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7875           } else {
7876             wont_dirty_i|=1<<r;
7877             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7878           }
7879         }
7880       }
7881     }
7882   }
7883 }
7884
7885 #ifdef DISASM
7886   /* disassembly */
7887 void disassemble_inst(int i)
7888 {
7889     if (bt[i]) printf("*"); else printf(" ");
7890     switch(itype[i]) {
7891       case UJUMP:
7892         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7893       case CJUMP:
7894         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7895       case SJUMP:
7896         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7897       case FJUMP:
7898         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7899       case RJUMP:
7900         if (opcode[i]==0x9&&rt1[i]!=31)
7901           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7902         else
7903           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7904         break;
7905       case SPAN:
7906         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7907       case IMM16:
7908         if(opcode[i]==0xf) //LUI
7909           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7910         else
7911           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7912         break;
7913       case LOAD:
7914       case LOADLR:
7915         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7916         break;
7917       case STORE:
7918       case STORELR:
7919         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7920         break;
7921       case ALU:
7922       case SHIFT:
7923         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7924         break;
7925       case MULTDIV:
7926         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7927         break;
7928       case SHIFTIMM:
7929         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7930         break;
7931       case MOV:
7932         if((opcode2[i]&0x1d)==0x10)
7933           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7934         else if((opcode2[i]&0x1d)==0x11)
7935           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7936         else
7937           printf (" %x: %s\n",start+i*4,insn[i]);
7938         break;
7939       case COP0:
7940         if(opcode2[i]==0)
7941           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7942         else if(opcode2[i]==4)
7943           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7944         else printf (" %x: %s\n",start+i*4,insn[i]);
7945         break;
7946       case COP1:
7947         if(opcode2[i]<3)
7948           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7949         else if(opcode2[i]>3)
7950           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7951         else printf (" %x: %s\n",start+i*4,insn[i]);
7952         break;
7953       case COP2:
7954         if(opcode2[i]<3)
7955           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7956         else if(opcode2[i]>3)
7957           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7958         else printf (" %x: %s\n",start+i*4,insn[i]);
7959         break;
7960       case C1LS:
7961         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7962         break;
7963       case C2LS:
7964         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7965         break;
7966       case INTCALL:
7967         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7968         break;
7969       default:
7970         //printf (" %s %8x\n",insn[i],source[i]);
7971         printf (" %x: %s\n",start+i*4,insn[i]);
7972     }
7973 }
7974 #else
7975 static void disassemble_inst(int i) {}
7976 #endif // DISASM
7977
7978 // clear the state completely, instead of just marking
7979 // things invalid like invalidate_all_pages() does
7980 void new_dynarec_clear_full()
7981 {
7982   int n;
7983   out=(u_char *)BASE_ADDR;
7984   memset(invalid_code,1,sizeof(invalid_code));
7985   memset(hash_table,0xff,sizeof(hash_table));
7986   memset(mini_ht,-1,sizeof(mini_ht));
7987   memset(restore_candidate,0,sizeof(restore_candidate));
7988   memset(shadow,0,sizeof(shadow));
7989   copy=shadow;
7990   expirep=16384; // Expiry pointer, +2 blocks
7991   pending_exception=0;
7992   literalcount=0;
7993   stop_after_jal=0;
7994   inv_code_start=inv_code_end=~0;
7995   // TLB
7996 #ifndef DISABLE_TLB
7997   using_tlb=0;
7998   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7999     memory_map[n]=-1;
8000   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
8001     memory_map[n]=((u_int)rdram-0x80000000)>>2;
8002   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
8003     memory_map[n]=-1;
8004 #endif
8005   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8006   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8007   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8008 }
8009
8010 void new_dynarec_init()
8011 {
8012   printf("Init new dynarec\n");
8013   out=(u_char *)BASE_ADDR;
8014 #if BASE_ADDR_FIXED
8015   if (mmap (out, 1<<TARGET_SIZE_2,
8016             PROT_READ | PROT_WRITE | PROT_EXEC,
8017             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
8018             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
8019 #else
8020   // not all systems allow execute in data segment by default
8021   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
8022     printf("mprotect() failed\n");
8023 #endif
8024 #ifdef MUPEN64
8025   rdword=&readmem_dword;
8026   fake_pc.f.r.rs=&readmem_dword;
8027   fake_pc.f.r.rt=&readmem_dword;
8028   fake_pc.f.r.rd=&readmem_dword;
8029 #endif
8030   int n;
8031   cycle_multiplier=200;
8032   new_dynarec_clear_full();
8033 #ifdef HOST_IMM8
8034   // Copy this into local area so we don't have to put it in every literal pool
8035   invc_ptr=invalid_code;
8036 #endif
8037 #ifdef MUPEN64
8038   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
8039     writemem[n] = write_nomem_new;
8040     writememb[n] = write_nomemb_new;
8041     writememh[n] = write_nomemh_new;
8042 #ifndef FORCE32
8043     writememd[n] = write_nomemd_new;
8044 #endif
8045     readmem[n] = read_nomem_new;
8046     readmemb[n] = read_nomemb_new;
8047     readmemh[n] = read_nomemh_new;
8048 #ifndef FORCE32
8049     readmemd[n] = read_nomemd_new;
8050 #endif
8051   }
8052   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
8053     writemem[n] = write_rdram_new;
8054     writememb[n] = write_rdramb_new;
8055     writememh[n] = write_rdramh_new;
8056 #ifndef FORCE32
8057     writememd[n] = write_rdramd_new;
8058 #endif
8059   }
8060   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
8061     writemem[n] = write_nomem_new;
8062     writememb[n] = write_nomemb_new;
8063     writememh[n] = write_nomemh_new;
8064 #ifndef FORCE32
8065     writememd[n] = write_nomemd_new;
8066 #endif
8067     readmem[n] = read_nomem_new;
8068     readmemb[n] = read_nomemb_new;
8069     readmemh[n] = read_nomemh_new;
8070 #ifndef FORCE32
8071     readmemd[n] = read_nomemd_new;
8072 #endif
8073   }
8074 #endif
8075   tlb_hacks();
8076   arch_init();
8077 #ifndef RAM_FIXED
8078   ram_offset=(u_int)rdram-0x80000000;
8079 #endif
8080 }
8081
8082 void new_dynarec_cleanup()
8083 {
8084   int n;
8085   #if BASE_ADDR_FIXED
8086   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
8087   #endif
8088   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8089   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8090   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8091   #ifdef ROM_COPY
8092   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
8093   #endif
8094 }
8095
8096 int new_recompile_block(int addr)
8097 {
8098 /*
8099   if(addr==0x800cd050) {
8100     int block;
8101     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
8102     int n;
8103     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
8104   }
8105 */
8106   //if(Count==365117028) tracedebug=1;
8107   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8108   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8109   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8110   //if(debug) 
8111   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8112   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8113   /*if(Count>=312978186) {
8114     rlist();
8115   }*/
8116   //rlist();
8117   start = (u_int)addr&~3;
8118   //assert(((u_int)addr&1)==0);
8119   new_dynarec_did_compile=1;
8120 #ifdef PCSX
8121   if (Config.HLE && start == 0x80001000) // hlecall
8122   {
8123     // XXX: is this enough? Maybe check hleSoftCall?
8124     u_int beginning=(u_int)out;
8125     u_int page=get_page(start);
8126     invalid_code[start>>12]=0;
8127     emit_movimm(start,0);
8128     emit_writeword(0,(int)&pcaddr);
8129     emit_jmp((int)new_dyna_leave);
8130     literal_pool(0);
8131 #ifdef __arm__
8132     __clear_cache((void *)beginning,out);
8133 #endif
8134     ll_add(jump_in+page,start,(void *)beginning);
8135     return 0;
8136   }
8137   else if ((u_int)addr < 0x00200000 ||
8138     (0xa0000000 <= addr && addr < 0xa0200000)) {
8139     // used for BIOS calls mostly?
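    // PSX RAM is mirrored at 0x00000000 (KUSEG) and 0xa0000000 (KSEG1) as well
    // as 0x80000000 (KSEG0, handled further below), so masking with 0x1fffff
    // gives the offset into the 2MB rdram buffer for either mirror.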
8140     source = (u_int *)((u_int)rdram+(start&0x1fffff));
8141     pagelimit = (addr&0xa0000000)|0x00200000;
8142   }
8143   else if (!Config.HLE && (
8144 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8145     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8146     // BIOS
8147     source = (u_int *)((u_int)psxR+(start&0x7ffff));
8148     pagelimit = (addr&0xfff00000)|0x80000;
8149   }
8150   else
8151 #endif
8152 #ifdef MUPEN64
8153   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
8154     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
8155     pagelimit = 0xa4001000;
8156   }
8157   else
8158 #endif
8159   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
8160     source = (u_int *)((u_int)rdram+start-0x80000000);
8161     pagelimit = 0x80000000+RAM_SIZE;
8162   }
8163 #ifndef DISABLE_TLB
8164   else if ((signed int)addr >= (signed int)0xC0000000) {
8165     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
8166     //if(tlb_LUT_r[start>>12])
8167       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
8168     if((signed int)memory_map[start>>12]>=0) {
8169       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
8170       pagelimit=(start+4096)&0xFFFFF000;
8171       int map=memory_map[start>>12];
8172       int i;
8173       for(i=0;i<5;i++) {
8174         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
8175         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
8176       }
8177       assem_debug("pagelimit=%x\n",pagelimit);
8178       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
8179     }
8180     else {
8181       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
8182       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
8183       return -1; // Caller will invoke exception handler
8184     }
8185     //printf("source= %x\n",(int)source);
8186   }
8187 #endif
8188   else {
8189     printf("Compile at bogus memory address: %x \n", (int)addr);
8190     exit(1);
8191   }
8192
8193   /* Pass 1: disassemble */
8194   /* Pass 2: register dependencies, branch targets */
8195   /* Pass 3: register allocation */
8196   /* Pass 4: branch dependencies */
8197   /* Pass 5: pre-alloc */
8198   /* Pass 6: optimize clean/dirty state */
8199   /* Pass 7: flag 32-bit registers */
8200   /* Pass 8: assembly */
8201   /* Pass 9: linker */
8202   /* Pass 10: garbage collection / free memory */
8203
8204   int i,j;
8205   int done=0;
8206   unsigned int type,op,op2;
8207
8208   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8209   
8210   /* Pass 1 disassembly */
8211
8212   for(i=0;!done;i++) {
8213     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8214     minimum_free_regs[i]=0;
8215     opcode[i]=op=source[i]>>26;
8216     switch(op)
8217     {
8218       case 0x00: strcpy(insn[i],"special"); type=NI;
8219         op2=source[i]&0x3f;
8220         switch(op2)
8221         {
8222           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8223           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8224           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8225           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8226           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8227           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8228           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8229           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8230           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8231           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8232           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8233           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8234           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8235           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8236           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8237           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8238           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8239           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8240           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8241           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8242           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8243           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8244           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8245           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8246           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8247           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8248           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8249           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8250           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8251           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8252           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8253           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8254           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8255           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8256           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8257 #ifndef FORCE32
8258           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8259           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8260           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8261           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8262           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8263           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8264           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8265           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8266           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8267           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8268           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8269           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8270           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8271           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8272           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8273           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8274           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8275 #endif
8276         }
8277         break;
8278       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8279         op2=(source[i]>>16)&0x1f;
8280         switch(op2)
8281         {
8282           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8283           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8284           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8285           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8286           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8287           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8288           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8289           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8290           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8291           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8292           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8293           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8294           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8295           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8296         }
8297         break;
8298       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8299       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8300       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8301       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8302       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8303       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8304       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8305       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8306       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8307       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8308       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8309       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8310       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8311       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8312       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8313         op2=(source[i]>>21)&0x1f;
8314         switch(op2)
8315         {
8316           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8317           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8318           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8319           switch(source[i]&0x3f)
8320           {
8321             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8322             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8323             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8324             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8325 #ifdef PCSX
8326             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8327 #else
8328             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8329 #endif
8330           }
8331         }
8332         break;
8333       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8334         op2=(source[i]>>21)&0x1f;
8335         switch(op2)
8336         {
8337           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8338           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8339           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8340           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8341           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8342           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8343           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8344           switch((source[i]>>16)&0x3)
8345           {
8346             case 0x00: strcpy(insn[i],"BC1F"); break;
8347             case 0x01: strcpy(insn[i],"BC1T"); break;
8348             case 0x02: strcpy(insn[i],"BC1FL"); break;
8349             case 0x03: strcpy(insn[i],"BC1TL"); break;
8350           }
8351           break;
8352           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8353           switch(source[i]&0x3f)
8354           {
8355             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8356             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8357             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8358             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8359             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8360             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8361             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8362             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8363             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8364             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8365             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8366             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8367             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8368             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8369             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8370             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8371             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8372             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8373             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8374             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8375             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8376             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8377             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8378             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8379             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8380             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8381             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8382             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8383             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8384             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8385             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8386             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8387             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8388             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8389             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8390           }
8391           break;
8392           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8393           switch(source[i]&0x3f)
8394           {
8395             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8396             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8397             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8398             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8399             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8400             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8401             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8402             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8403             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8404             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8405             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8406             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8407             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8408             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8409             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8410             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8411             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8412             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8413             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8414             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8415             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8416             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8417             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8418             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8419             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8420             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8421             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8422             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8423             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8424             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8425             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8426             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8427             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8428             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8429             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8430           }
8431           break;
8432           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8433           switch(source[i]&0x3f)
8434           {
8435             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8436             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8437           }
8438           break;
8439           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8440           switch(source[i]&0x3f)
8441           {
8442             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8443             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8444           }
8445           break;
8446         }
8447         break;
8448 #ifndef FORCE32
8449       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8450       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8451       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8452       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8453       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8454       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8455       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8456       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8457 #endif
8458       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8459       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8460       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8461       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8462       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8463       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8464       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8465 #ifndef FORCE32
8466       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8467 #endif
8468       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8469       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8470       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8471       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8472 #ifndef FORCE32
8473       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8474       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8475 #endif
8476       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8477       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8478       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8479       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8480 #ifndef FORCE32
8481       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8482       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8483       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8484 #endif
8485       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8486       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8487 #ifndef FORCE32
8488       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8489       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8490       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8491 #endif
8492 #ifdef PCSX
8493       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8494         op2=(source[i]>>21)&0x1f;
8495         //if (op2 & 0x10) {
8496         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8497           if (gte_handlers[source[i]&0x3f]!=NULL) {
8498             if (gte_regnames[source[i]&0x3f]!=NULL)
8499               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8500             else
8501               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8502             type=C2OP;
8503           }
8504         }
8505         else switch(op2)
8506         {
8507           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8508           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8509           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8510           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8511         }
8512         break;
8513       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8514       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8515       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8516 #endif
8517       default: strcpy(insn[i],"???"); type=NI;
8518         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8519         break;
8520     }
8521     itype[i]=type;
8522     opcode2[i]=op2;
8523     /* Get registers/immediates */
8524     lt1[i]=0;
8525     us1[i]=0;
8526     us2[i]=0;
8527     dep1[i]=0;
8528     dep2[i]=0;
8529     gte_rs[i]=gte_rt[i]=0;
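    // MIPS keeps its operands in fixed fields: rs in bits 25:21, rt in bits
    // 20:16, rd in bits 15:11, shamt in bits 10:6 and the 16-bit immediate in
    // bits 15:0, which is why the extraction below is the same shift-and-mask
    // pattern for every instruction type.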
8530     switch(type) {
8531       case LOAD:
8532         rs1[i]=(source[i]>>21)&0x1f;
8533         rs2[i]=0;
8534         rt1[i]=(source[i]>>16)&0x1f;
8535         rt2[i]=0;
8536         imm[i]=(short)source[i];
8537         break;
8538       case STORE:
8539       case STORELR:
8540         rs1[i]=(source[i]>>21)&0x1f;
8541         rs2[i]=(source[i]>>16)&0x1f;
8542         rt1[i]=0;
8543         rt2[i]=0;
8544         imm[i]=(short)source[i];
8545         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8546         break;
8547       case LOADLR:
8548         // LWL/LWR only load part of the register,
8549         // therefore the target register must be treated as a source too
8550         rs1[i]=(source[i]>>21)&0x1f;
8551         rs2[i]=(source[i]>>16)&0x1f;
8552         rt1[i]=(source[i]>>16)&0x1f;
8553         rt2[i]=0;
8554         imm[i]=(short)source[i];
8555         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8556         if(op==0x26) dep1[i]=rt1[i]; // LWR
8557         break;
8558       case IMM16:
8559         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8560         else rs1[i]=(source[i]>>21)&0x1f;
8561         rs2[i]=0;
8562         rt1[i]=(source[i]>>16)&0x1f;
8563         rt2[i]=0;
8564         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8565           imm[i]=(unsigned short)source[i];
8566         }else{
8567           imm[i]=(short)source[i];
8568         }
8569         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8570         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8571         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8572         break;
8573       case UJUMP:
8574         rs1[i]=0;
8575         rs2[i]=0;
8576         rt1[i]=0;
8577         rt2[i]=0;
8578         // The JAL instruction writes to r31.
8579         if (op&1) {
8580           rt1[i]=31;
8581         }
8582         rs2[i]=CCREG;
8583         break;
8584       case RJUMP:
8585         rs1[i]=(source[i]>>21)&0x1f;
8586         rs2[i]=0;
8587         rt1[i]=0;
8588         rt2[i]=0;
8589         // The JALR instruction writes to rd.
8590         if (op2&1) {
8591           rt1[i]=(source[i]>>11)&0x1f;
8592         }
8593         rs2[i]=CCREG;
8594         break;
8595       case CJUMP:
8596         rs1[i]=(source[i]>>21)&0x1f;
8597         rs2[i]=(source[i]>>16)&0x1f;
8598         rt1[i]=0;
8599         rt2[i]=0;
8600         if(op&2) { // BGTZ/BLEZ
8601           rs2[i]=0;
8602         }
8603         us1[i]=rs1[i];
8604         us2[i]=rs2[i];
8605         likely[i]=op>>4;
8606         break;
8607       case SJUMP:
8608         rs1[i]=(source[i]>>21)&0x1f;
8609         rs2[i]=CCREG;
8610         rt1[i]=0;
8611         rt2[i]=0;
8612         us1[i]=rs1[i];
8613         if(op2&0x10) { // BxxAL
8614           rt1[i]=31;
8615           // NOTE: If the branch is not taken, r31 is still overwritten
8616         }
8617         likely[i]=(op2&2)>>1;
8618         break;
8619       case FJUMP:
8620         rs1[i]=FSREG;
8621         rs2[i]=CSREG;
8622         rt1[i]=0;
8623         rt2[i]=0;
8624         likely[i]=((source[i])>>17)&1;
8625         break;
8626       case ALU:
8627         rs1[i]=(source[i]>>21)&0x1f; // source
8628         rs2[i]=(source[i]>>16)&0x1f; // second source operand
8629         rt1[i]=(source[i]>>11)&0x1f; // destination
8630         rt2[i]=0;
8631         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8632           us1[i]=rs1[i];us2[i]=rs2[i];
8633         }
8634         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8635           dep1[i]=rs1[i];dep2[i]=rs2[i];
8636         }
8637         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8638           dep1[i]=rs1[i];dep2[i]=rs2[i];
8639         }
8640         break;
8641       case MULTDIV:
8642         rs1[i]=(source[i]>>21)&0x1f; // source
8643         rs2[i]=(source[i]>>16)&0x1f; // divisor
8644         rt1[i]=HIREG;
8645         rt2[i]=LOREG;
8646         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8647           us1[i]=rs1[i];us2[i]=rs2[i];
8648         }
8649         break;
8650       case MOV:
8651         rs1[i]=0;
8652         rs2[i]=0;
8653         rt1[i]=0;
8654         rt2[i]=0;
8655         if(op2==0x10) rs1[i]=HIREG; // MFHI
8656         if(op2==0x11) rt1[i]=HIREG; // MTHI
8657         if(op2==0x12) rs1[i]=LOREG; // MFLO
8658         if(op2==0x13) rt1[i]=LOREG; // MTLO
8659         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8660         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8661         dep1[i]=rs1[i];
8662         break;
8663       case SHIFT:
8664         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8665         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8666         rt1[i]=(source[i]>>11)&0x1f; // destination
8667         rt2[i]=0;
8668         // DSLLV/DSRLV/DSRAV are 64-bit
8669         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8670         break;
8671       case SHIFTIMM:
8672         rs1[i]=(source[i]>>16)&0x1f;
8673         rs2[i]=0;
8674         rt1[i]=(source[i]>>11)&0x1f;
8675         rt2[i]=0;
8676         imm[i]=(source[i]>>6)&0x1f;
8677         // DSxx32 instructions
8678         if(op2>=0x3c) imm[i]|=0x20;
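        // The DSxx32 forms shift by shamt+32, e.g. DSRL32 with shamt=4 really
        // shifts by 36, so the effective amount is stored with bit 5 set.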
8679         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8680         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8681         break;
8682       case COP0:
8683         rs1[i]=0;
8684         rs2[i]=0;
8685         rt1[i]=0;
8686         rt2[i]=0;
8687         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8688         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8689         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8690         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8691         break;
8692       case COP1:
8693         rs1[i]=0;
8694         rs2[i]=0;
8695         rt1[i]=0;
8696         rt2[i]=0;
8697         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8698         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8699         if(op2==5) us1[i]=rs1[i]; // DMTC1
8700         rs2[i]=CSREG;
8701         break;
8702       case COP2:
8703         rs1[i]=0;
8704         rs2[i]=0;
8705         rt1[i]=0;
8706         rt2[i]=0;
8707         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8708         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8709         rs2[i]=CSREG;
8710         int gr=(source[i]>>11)&0x1F;
8711         switch(op2)
8712         {
8713           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8714           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8715           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
8716           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8717         }
8718         break;
8719       case C1LS:
8720         rs1[i]=(source[i]>>21)&0x1F;
8721         rs2[i]=CSREG;
8722         rt1[i]=0;
8723         rt2[i]=0;
8724         imm[i]=(short)source[i];
8725         break;
8726       case C2LS:
8727         rs1[i]=(source[i]>>21)&0x1F;
8728         rs2[i]=0;
8729         rt1[i]=0;
8730         rt2[i]=0;
8731         imm[i]=(short)source[i];
8732         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8733         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8734         break;
8735       case C2OP:
8736         rs1[i]=0;
8737         rs2[i]=0;
8738         rt1[i]=0;
8739         rt2[i]=0;
8740         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
8741         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
8742         gte_rt[i]|=1ll<<63; // every op changes flags
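        // MVMVA selects its multiply vector with bits 16:15 of the opcode:
        // 0..2 pick V0/V1/V2 (GTE data regs 0-5), 3 picks IR1-IR3 (regs 9-11).
        // The table entry lists all of them, so the mask below (0xe3f = regs
        // 0-5 and 9-11) is cleared and only the selected vector is re-added.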
8743         if((source[i]&0x3f)==GTE_MVMVA) {
8744           int v = (source[i] >> 15) & 3;
8745           gte_rs[i]&=~0xe3fll;
8746           if(v==3) gte_rs[i]|=0xe00ll;
8747           else gte_rs[i]|=3ll<<(v*2);
8748         }
8749         break;
8750       case FLOAT:
8751       case FCONV:
8752         rs1[i]=0;
8753         rs2[i]=CSREG;
8754         rt1[i]=0;
8755         rt2[i]=0;
8756         break;
8757       case FCOMP:
8758         rs1[i]=FSREG;
8759         rs2[i]=CSREG;
8760         rt1[i]=FSREG;
8761         rt2[i]=0;
8762         break;
8763       case SYSCALL:
8764       case HLECALL:
8765       case INTCALL:
8766         rs1[i]=CCREG;
8767         rs2[i]=0;
8768         rt1[i]=0;
8769         rt2[i]=0;
8770         break;
8771       default:
8772         rs1[i]=0;
8773         rs2[i]=0;
8774         rt1[i]=0;
8775         rt2[i]=0;
8776     }
8777     /* Calculate branch target addresses */
8778     if(type==UJUMP)
8779       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8780     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8781       ba[i]=start+i*4+8; // Ignore never taken branch
8782     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8783       ba[i]=start+i*4+8; // Ignore never taken branch
8784     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8785       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8786     else ba[i]=-1;
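    // J/JAL targets: the 26-bit index is shifted left by 2 (<<6 then >>4) and
    // combined with the top 4 bits of the delay-slot address.  Conditional
    // branches sign-extend the 16-bit offset and multiply it by 4 (<<16 then
    // arithmetic >>14) relative to the delay-slot address, e.g. an offset of 3
    // gives delay-slot address + 12.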
8787 #ifdef PCSX
8788     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8789       int do_in_intrp=0;
8790       // branch in delay slot?
8791       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8792         // not handled; stop at the first branch and call the interpreter if it's hit
8793         printf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8794         do_in_intrp=1;
8795       }
8796       // basic load delay detection
8797       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8798         int t=(ba[i-1]-start)/4;
8799         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8800           // jump target wants DS result - potential load delay effect
8801           printf("load delay @%08x (%08x)\n", addr + i*4, addr);
8802           do_in_intrp=1;
8803           bt[t+1]=1; // expected return from interpreter
8804         }
8805         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8806               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8807           // v0 overwrite like this is a sign of trouble, bail out
8808           printf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8809           do_in_intrp=1;
8810         }
8811       }
8812       if(do_in_intrp) {
8813         rs1[i-1]=CCREG;
8814         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8815         ba[i-1]=-1;
8816         itype[i-1]=INTCALL;
8817         done=2;
8818         i--; // don't compile the DS
8819       }
8820     }
8821 #endif
8822     /* Is this the end of the block? */
8823     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8824       if(rt1[i-1]==0) { // no link register written (J/JR), so this ends the block
8825         done=2;
8826       }
8827       else { // JAL/JALR: keep compiling past the subroutine call
8828         if(stop_after_jal) done=1;
8829         // Stop on BREAK
8830         if((source[i+1]&0xfc00003f)==0x0d) done=1;
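        // 0xfc00003f keeps the opcode (bits 31:26) and funct (bits 5:0)
        // fields; a result of 0x0d means SPECIAL with funct BREAK.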
8831       }
8832       // Don't recompile stuff that's already compiled
8833       if(check_addr(start+i*4+4)) done=1;
8834       // Don't get too close to the limit
8835       if(i>MAXBLOCK/2) done=1;
8836     }
8837     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8838     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8839     if(done==2) {
8840       // Does the block continue due to a branch?
8841       for(j=i-1;j>=0;j--)
8842       {
8843         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8844         if(ba[j]==start+i*4+4) done=j=0;
8845         if(ba[j]==start+i*4+8) done=j=0;
8846       }
8847     }
8848     //assert(i<MAXBLOCK-1);
8849     if(start+i*4==pagelimit-4) done=1;
8850     assert(start+i*4<pagelimit);
8851     if (i==MAXBLOCK-1) done=1;
8852     // Stop if we're compiling junk
8853     if(itype[i]==NI&&opcode[i]==0x11) {
8854       done=stop_after_jal=1;
8855       printf("Disabled speculative precompilation\n");
8856     }
8857   }
8858   slen=i;
8859   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8860     if(start+i*4==pagelimit) {
8861       itype[i-1]=SPAN;
8862     }
8863   }
8864   assert(slen>0);
8865
8866   /* Pass 2 - Register dependencies and branch targets */
8867
8868   unneeded_registers(0,slen-1,0);
8869   
8870   /* Pass 3 - Register allocation */
8871
8872   struct regstat current; // Current register allocations/status
8873   current.is32=1;
8874   current.dirty=0;
8875   current.u=unneeded_reg[0];
8876   current.uu=unneeded_reg_upper[0];
8877   clear_all_regs(current.regmap);
8878   alloc_reg(&current,0,CCREG);
8879   dirty_reg(&current,CCREG);
8880   current.isconst=0;
8881   current.wasconst=0;
8882   current.waswritten=0;
8883   int ds=0;
8884   int cc=0;
8885   int hr=-1;
8886
8887 #ifndef FORCE32
8888   provisional_32bit();
8889 #endif
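  // A requested address with bit 0 set marks a block that begins in a branch
  // delay slot; the branch target is then expected in BTREG (mapped below)
  // rather than being known at compile time.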
8890   if((u_int)addr&1) {
8891     // First instruction is delay slot
8892     cc=-1;
8893     bt[1]=1;
8894     ds=1;
8895     unneeded_reg[0]=1;
8896     unneeded_reg_upper[0]=1;
8897     current.regmap[HOST_BTREG]=BTREG;
8898   }
8899   
8900   for(i=0;i<slen;i++)
8901   {
8902     if(bt[i])
8903     {
8904       int hr;
8905       for(hr=0;hr<HOST_REGS;hr++)
8906       {
8907         // Is this really necessary?
8908         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8909       }
8910       current.isconst=0;
8911       current.waswritten=0;
8912     }
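    // Two instructions back (the branch before its delay slot) a BNE/BNEL
    // against $zero that was not taken implies the tested register is zero on
    // this fall-through path, so it can be treated as 32-bit and any mapping
    // of its upper half dropped.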
8913     if(i>1)
8914     {
8915       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8916       {
8917         if(rs1[i-2]==0||rs2[i-2]==0)
8918         {
8919           if(rs1[i-2]) {
8920             current.is32|=1LL<<rs1[i-2];
8921             int hr=get_reg(current.regmap,rs1[i-2]|64);
8922             if(hr>=0) current.regmap[hr]=-1;
8923           }
8924           if(rs2[i-2]) {
8925             current.is32|=1LL<<rs2[i-2];
8926             int hr=get_reg(current.regmap,rs2[i-2]|64);
8927             if(hr>=0) current.regmap[hr]=-1;
8928           }
8929         }
8930       }
8931     }
8932 #ifndef FORCE32
8933     // If something jumps here with 64-bit values
8934     // then promote those registers to 64 bits
8935     if(bt[i])
8936     {
8937       uint64_t temp_is32=current.is32;
8938       for(j=i-1;j>=0;j--)
8939       {
8940         if(ba[j]==start+i*4) 
8941           temp_is32&=branch_regs[j].is32;
8942       }
8943       for(j=i;j<slen;j++)
8944       {
8945         if(ba[j]==start+i*4) 
8946           //temp_is32=1;
8947           temp_is32&=p32[j];
8948       }
8949       if(temp_is32!=current.is32) {
8950         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8951         #ifndef DESTRUCTIVE_WRITEBACK
8952         if(ds)
8953         #endif
8954         for(hr=0;hr<HOST_REGS;hr++)
8955         {
8956           int r=current.regmap[hr];
8957           if(r>0&&r<64)
8958           {
8959             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8960               temp_is32|=1LL<<r;
8961               //printf("restore %d\n",r);
8962             }
8963           }
8964         }
8965         current.is32=temp_is32;
8966       }
8967     }
8968 #else
8969     current.is32=-1LL;
8970 #endif
8971
8972     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8973     regs[i].wasconst=current.isconst;
8974     regs[i].was32=current.is32;
8975     regs[i].wasdirty=current.dirty;
8976     regs[i].loadedconst=0;
8977     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
8978     // To change a dirty register from 32 to 64 bits, we must write
8979     // it out during the previous cycle (for branches, 2 cycles)
8980     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8981     {
8982       uint64_t temp_is32=current.is32;
8983       for(j=i-1;j>=0;j--)
8984       {
8985         if(ba[j]==start+i*4+4) 
8986           temp_is32&=branch_regs[j].is32;
8987       }
8988       for(j=i;j<slen;j++)
8989       {
8990         if(ba[j]==start+i*4+4) 
8991           //temp_is32=1;
8992           temp_is32&=p32[j];
8993       }
8994       if(temp_is32!=current.is32) {
8995         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8996         for(hr=0;hr<HOST_REGS;hr++)
8997         {
8998           int r=current.regmap[hr];
8999           if(r>0)
9000           {
9001             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9002               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
9003               {
9004                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
9005                 {
9006                   //printf("dump %d/r%d\n",hr,r);
9007                   current.regmap[hr]=-1;
9008                   if(get_reg(current.regmap,r|64)>=0) 
9009                     current.regmap[get_reg(current.regmap,r|64)]=-1;
9010                 }
9011               }
9012             }
9013           }
9014         }
9015       }
9016     }
9017     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
9018     {
9019       uint64_t temp_is32=current.is32;
9020       for(j=i-1;j>=0;j--)
9021       {
9022         if(ba[j]==start+i*4+8) 
9023           temp_is32&=branch_regs[j].is32;
9024       }
9025       for(j=i;j<slen;j++)
9026       {
9027         if(ba[j]==start+i*4+8) 
9028           //temp_is32=1;
9029           temp_is32&=p32[j];
9030       }
9031       if(temp_is32!=current.is32) {
9032         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9033         for(hr=0;hr<HOST_REGS;hr++)
9034         {
9035           int r=current.regmap[hr];
9036           if(r>0)
9037           {
9038             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9039               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
9040               {
9041                 //printf("dump %d/r%d\n",hr,r);
9042                 current.regmap[hr]=-1;
9043                 if(get_reg(current.regmap,r|64)>=0) 
9044                   current.regmap[get_reg(current.regmap,r|64)]=-1;
9045               }
9046             }
9047           }
9048         }
9049       }
9050     }
9051     #endif
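    // current.u / current.uu are bitmasks of MIPS registers whose lower /
    // upper 32-bit halves are dead after this instruction: start from the next
    // instruction's mask and clear the bits of anything this instruction
    // reads; bit 0 ($zero) is always unneeded.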
9052     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9053       if(i+1<slen) {
9054         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9055         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9056         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9057         current.u|=1;
9058         current.uu|=1;
9059       } else {
9060         current.u=1;
9061         current.uu=1;
9062       }
9063     } else {
9064       if(i+1<slen) {
9065         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
9066         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9067         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9068         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9069         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9070         current.u|=1;
9071         current.uu|=1;
9072       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
9073     }
9074     is_ds[i]=ds;
9075     if(ds) {
9076       ds=0; // Skip delay slot, already allocated as part of branch
9077       // ...but we need to alloc it in case something jumps here
9078       if(i+1<slen) {
9079         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
9080         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
9081       }else{
9082         current.u=branch_unneeded_reg[i-1];
9083         current.uu=branch_unneeded_reg_upper[i-1];
9084       }
9085       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9086       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9087       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9088       current.u|=1;
9089       current.uu|=1;
9090       struct regstat temp;
9091       memcpy(&temp,&current,sizeof(current));
9092       temp.wasdirty=temp.dirty;
9093       temp.was32=temp.is32;
9094       // TODO: Take into account unconditional branches, as below
9095       delayslot_alloc(&temp,i);
9096       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
9097       regs[i].wasdirty=temp.wasdirty;
9098       regs[i].was32=temp.was32;
9099       regs[i].dirty=temp.dirty;
9100       regs[i].is32=temp.is32;
9101       regs[i].isconst=0;
9102       regs[i].wasconst=0;
9103       current.isconst=0;
9104       // Create entry (branch target) regmap
9105       for(hr=0;hr<HOST_REGS;hr++)
9106       {
9107         int r=temp.regmap[hr];
9108         if(r>=0) {
9109           if(r!=regmap_pre[i][hr]) {
9110             regs[i].regmap_entry[hr]=-1;
9111           }
9112           else
9113           {
9114             if(r<64){
9115               if((current.u>>r)&1) {
9116                 regs[i].regmap_entry[hr]=-1;
9117                 regs[i].regmap[hr]=-1;
9118                 //Don't clear regs in the delay slot as the branch might need them
9119                 //current.regmap[hr]=-1;
9120               }else
9121                 regs[i].regmap_entry[hr]=r;
9122             }
9123             else {
9124               if((current.uu>>(r&63))&1) {
9125                 regs[i].regmap_entry[hr]=-1;
9126                 regs[i].regmap[hr]=-1;
9127                 //Don't clear regs in the delay slot as the branch might need them
9128                 //current.regmap[hr]=-1;
9129               }else
9130                 regs[i].regmap_entry[hr]=r;
9131             }
9132           }
9133         } else {
9134           // First instruction expects CCREG to be allocated
9135           if(i==0&&hr==HOST_CCREG) 
9136             regs[i].regmap_entry[hr]=CCREG;
9137           else
9138             regs[i].regmap_entry[hr]=-1;
9139         }
9140       }
9141     }
9142     else { // Not delay slot
9143       switch(itype[i]) {
9144         case UJUMP:
9145           //current.isconst=0; // DEBUG
9146           //current.wasconst=0; // DEBUG
9147           //regs[i].wasconst=0; // DEBUG
9148           clear_const(&current,rt1[i]);
9149           alloc_cc(&current,i);
9150           dirty_reg(&current,CCREG);
9151           if (rt1[i]==31) {
9152             alloc_reg(&current,i,31);
9153             dirty_reg(&current,31);
9154             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9155             //assert(rt1[i+1]!=rt1[i]);
9156             #ifdef REG_PREFETCH
9157             alloc_reg(&current,i,PTEMP);
9158             #endif
9159             //current.is32|=1LL<<rt1[i];
9160           }
9161           ooo[i]=1;
9162           delayslot_alloc(&current,i+1);
9163           //current.isconst=0; // DEBUG
9164           ds=1;
9165           //printf("i=%d, isconst=%x\n",i,current.isconst);
9166           break;
9167         case RJUMP:
9168           //current.isconst=0;
9169           //current.wasconst=0;
9170           //regs[i].wasconst=0;
9171           clear_const(&current,rs1[i]);
9172           clear_const(&current,rt1[i]);
9173           alloc_cc(&current,i);
9174           dirty_reg(&current,CCREG);
9175           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9176             alloc_reg(&current,i,rs1[i]);
9177             if (rt1[i]!=0) {
9178               alloc_reg(&current,i,rt1[i]);
9179               dirty_reg(&current,rt1[i]);
9180               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9181               assert(rt1[i+1]!=rt1[i]);
9182               #ifdef REG_PREFETCH
9183               alloc_reg(&current,i,PTEMP);
9184               #endif
9185             }
9186             #ifdef USE_MINI_HT
9187             if(rs1[i]==31) { // JALR
9188               alloc_reg(&current,i,RHASH);
9189               #ifndef HOST_IMM_ADDR32
9190               alloc_reg(&current,i,RHTBL);
9191               #endif
9192             }
9193             #endif
9194             delayslot_alloc(&current,i+1);
9195           } else {
9196             // The delay slot overwrites our source register,
9197             // allocate a temporary register to hold the old value.
9198             current.isconst=0;
9199             current.wasconst=0;
9200             regs[i].wasconst=0;
9201             delayslot_alloc(&current,i+1);
9202             current.isconst=0;
9203             alloc_reg(&current,i,RTEMP);
9204           }
9205           //current.isconst=0; // DEBUG
9206           ooo[i]=1;
9207           ds=1;
9208           break;
9209         case CJUMP:
9210           //current.isconst=0;
9211           //current.wasconst=0;
9212           //regs[i].wasconst=0;
9213           clear_const(&current,rs1[i]);
9214           clear_const(&current,rs2[i]);
9215           if((opcode[i]&0x3E)==4) // BEQ/BNE
9216           {
9217             alloc_cc(&current,i);
9218             dirty_reg(&current,CCREG);
9219             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9220             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9221             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9222             {
9223               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9224               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9225             }
9226             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9227                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9228               // The delay slot overwrites one of our conditions.
9229               // Allocate the branch condition registers instead.
9230               current.isconst=0;
9231               current.wasconst=0;
9232               regs[i].wasconst=0;
9233               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9234               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9235               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9236               {
9237                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9238                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9239               }
9240             }
9241             else
9242             {
9243               ooo[i]=1;
9244               delayslot_alloc(&current,i+1);
9245             }
9246           }
9247           else
9248           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9249           {
9250             alloc_cc(&current,i);
9251             dirty_reg(&current,CCREG);
9252             alloc_reg(&current,i,rs1[i]);
9253             if(!(current.is32>>rs1[i]&1))
9254             {
9255               alloc_reg64(&current,i,rs1[i]);
9256             }
9257             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9258               // The delay slot overwrites one of our conditions.
9259               // Allocate the branch condition registers instead.
9260               current.isconst=0;
9261               current.wasconst=0;
9262               regs[i].wasconst=0;
9263               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9264               if(!((current.is32>>rs1[i])&1))
9265               {
9266                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9267               }
9268             }
9269             else
9270             {
9271               ooo[i]=1;
9272               delayslot_alloc(&current,i+1);
9273             }
9274           }
9275           else
9276           // Don't alloc the delay slot yet because we might not execute it
9277           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9278           {
9279             current.isconst=0;
9280             current.wasconst=0;
9281             regs[i].wasconst=0;
9282             alloc_cc(&current,i);
9283             dirty_reg(&current,CCREG);
9284             alloc_reg(&current,i,rs1[i]);
9285             alloc_reg(&current,i,rs2[i]);
9286             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9287             {
9288               alloc_reg64(&current,i,rs1[i]);
9289               alloc_reg64(&current,i,rs2[i]);
9290             }
9291           }
9292           else
9293           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9294           {
9295             current.isconst=0;
9296             current.wasconst=0;
9297             regs[i].wasconst=0;
9298             alloc_cc(&current,i);
9299             dirty_reg(&current,CCREG);
9300             alloc_reg(&current,i,rs1[i]);
9301             if(!(current.is32>>rs1[i]&1))
9302             {
9303               alloc_reg64(&current,i,rs1[i]);
9304             }
9305           }
9306           ds=1;
9307           //current.isconst=0;
9308           break;
9309         case SJUMP:
9310           //current.isconst=0;
9311           //current.wasconst=0;
9312           //regs[i].wasconst=0;
9313           clear_const(&current,rs1[i]);
9314           clear_const(&current,rt1[i]);
9315           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9316           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9317           {
9318             alloc_cc(&current,i);
9319             dirty_reg(&current,CCREG);
9320             alloc_reg(&current,i,rs1[i]);
9321             if(!(current.is32>>rs1[i]&1))
9322             {
9323               alloc_reg64(&current,i,rs1[i]);
9324             }
9325             if (rt1[i]==31) { // BLTZAL/BGEZAL
9326               alloc_reg(&current,i,31);
9327               dirty_reg(&current,31);
9328               //#ifdef REG_PREFETCH
9329               //alloc_reg(&current,i,PTEMP);
9330               //#endif
9331               //current.is32|=1LL<<rt1[i];
9332             }
9333             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9334                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9335               // Allocate the branch condition registers instead.
9336               current.isconst=0;
9337               current.wasconst=0;
9338               regs[i].wasconst=0;
9339               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9340               if(!((current.is32>>rs1[i])&1))
9341               {
9342                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9343               }
9344             }
9345             else
9346             {
9347               ooo[i]=1;
9348               delayslot_alloc(&current,i+1);
9349             }
9350           }
9351           else
9352           // Don't alloc the delay slot yet because we might not execute it
9353           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9354           {
9355             current.isconst=0;
9356             current.wasconst=0;
9357             regs[i].wasconst=0;
9358             alloc_cc(&current,i);
9359             dirty_reg(&current,CCREG);
9360             alloc_reg(&current,i,rs1[i]);
9361             if(!(current.is32>>rs1[i]&1))
9362             {
9363               alloc_reg64(&current,i,rs1[i]);
9364             }
9365           }
9366           ds=1;
9367           //current.isconst=0;
9368           break;
9369         case FJUMP:
9370           current.isconst=0;
9371           current.wasconst=0;
9372           regs[i].wasconst=0;
9373           if(likely[i]==0) // BC1F/BC1T
9374           {
9375             // TODO: Theoretically we can run out of registers here on x86.
9376             // The delay slot can allocate up to six, and we need to check
9377             // CSREG before executing the delay slot.  Possibly we can drop
9378             // the cycle count and then reload it after checking that the
9379             // FPU is in a usable state, or don't do out-of-order execution.
9380             alloc_cc(&current,i);
9381             dirty_reg(&current,CCREG);
9382             alloc_reg(&current,i,FSREG);
9383             alloc_reg(&current,i,CSREG);
9384             if(itype[i+1]==FCOMP) {
9385               // The delay slot overwrites the branch condition.
9386               // Allocate the branch condition registers instead.
9387               alloc_cc(&current,i);
9388               dirty_reg(&current,CCREG);
9389               alloc_reg(&current,i,CSREG);
9390               alloc_reg(&current,i,FSREG);
9391             }
9392             else {
9393               ooo[i]=1;
9394               delayslot_alloc(&current,i+1);
9395               alloc_reg(&current,i+1,CSREG);
9396             }
9397           }
9398           else
9399           // Don't alloc the delay slot yet because we might not execute it
9400           if(likely[i]) // BC1FL/BC1TL
9401           {
9402             alloc_cc(&current,i);
9403             dirty_reg(&current,CCREG);
9404             alloc_reg(&current,i,CSREG);
9405             alloc_reg(&current,i,FSREG);
9406           }
9407           ds=1;
9408           current.isconst=0;
9409           break;
9410         case IMM16:
9411           imm16_alloc(&current,i);
9412           break;
9413         case LOAD:
9414         case LOADLR:
9415           load_alloc(&current,i);
9416           break;
9417         case STORE:
9418         case STORELR:
9419           store_alloc(&current,i);
9420           break;
9421         case ALU:
9422           alu_alloc(&current,i);
9423           break;
9424         case SHIFT:
9425           shift_alloc(&current,i);
9426           break;
9427         case MULTDIV:
9428           multdiv_alloc(&current,i);
9429           break;
9430         case SHIFTIMM:
9431           shiftimm_alloc(&current,i);
9432           break;
9433         case MOV:
9434           mov_alloc(&current,i);
9435           break;
9436         case COP0:
9437           cop0_alloc(&current,i);
9438           break;
9439         case COP1:
9440         case COP2:
9441           cop1_alloc(&current,i);
9442           break;
9443         case C1LS:
9444           c1ls_alloc(&current,i);
9445           break;
9446         case C2LS:
9447           c2ls_alloc(&current,i);
9448           break;
9449         case C2OP:
9450           c2op_alloc(&current,i);
9451           break;
9452         case FCONV:
9453           fconv_alloc(&current,i);
9454           break;
9455         case FLOAT:
9456           float_alloc(&current,i);
9457           break;
9458         case FCOMP:
9459           fcomp_alloc(&current,i);
9460           break;
9461         case SYSCALL:
9462         case HLECALL:
9463         case INTCALL:
9464           syscall_alloc(&current,i);
9465           break;
9466         case SPAN:
9467           pagespan_alloc(&current,i);
9468           break;
9469       }
9470       
9471       // Drop the upper half of registers that have become 32-bit
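      // uu tracks which MIPS registers do not need their upper 32 bits kept.
      // Destinations known to hold 32-bit values (is32) are marked unneeded;
      // sources whose upper halves are actually used (us1/us2) are cleared
      // back out, and for a branch the delay slot's registers (i+1) are
      // folded in because the slot executes together with the branch.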
9472       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9473       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9474         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9475         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9476         current.uu|=1;
9477       } else {
9478         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9479         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9480         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9481         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9482         current.uu|=1;
9483       }
9484
9485       // Create entry (branch target) regmap
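      // For each host register, decide what mapping a jump into this
      // instruction may assume (regmap_entry).  If the mapping changed from
      // regmap_pre and the old value can't simply be moved over, or if the
      // MIPS register is unneeded, the entry is -1 (no assumption made).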
9486       for(hr=0;hr<HOST_REGS;hr++)
9487       {
9488         int r,or,er;
9489         r=current.regmap[hr];
9490         if(r>=0) {
9491           if(r!=regmap_pre[i][hr]) {
9492             // TODO: delay slot (?)
9493             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9494             if(or<0||(r&63)>=TEMPREG){
9495               regs[i].regmap_entry[hr]=-1;
9496             }
9497             else
9498             {
9499               // Just move it to a different register
9500               regs[i].regmap_entry[hr]=r;
9501               // If it was dirty before, it's still dirty
9502               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9503             }
9504           }
9505           else
9506           {
9507             // Unneeded
9508             if(r==0){
9509               regs[i].regmap_entry[hr]=0;
9510             }
9511             else
9512             if(r<64){
9513               if((current.u>>r)&1) {
9514                 regs[i].regmap_entry[hr]=-1;
9515                 //regs[i].regmap[hr]=-1;
9516                 current.regmap[hr]=-1;
9517               }else
9518                 regs[i].regmap_entry[hr]=r;
9519             }
9520             else {
9521               if((current.uu>>(r&63))&1) {
9522                 regs[i].regmap_entry[hr]=-1;
9523                 //regs[i].regmap[hr]=-1;
9524                 current.regmap[hr]=-1;
9525               }else
9526                 regs[i].regmap_entry[hr]=r;
9527             }
9528           }
9529         } else {
9530           // Branches expect CCREG to be allocated at the target
9531           if(regmap_pre[i][hr]==CCREG) 
9532             regs[i].regmap_entry[hr]=CCREG;
9533           else
9534             regs[i].regmap_entry[hr]=-1;
9535         }
9536       }
9537       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9538     }
9539
9540     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
9541       current.waswritten|=1<<rs1[i-1];
9542     current.waswritten&=~(1<<rt1[i]);
9543     current.waswritten&=~(1<<rt2[i]);
9544     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
9545       current.waswritten&=~(1<<rs1[i]);
9546
9547     /* Branch post-alloc */
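    // If the previous instruction was a branch, build branch_regs[i-1]: the
    // register state assumed on the taken path, derived from the current
    // state with the branch's unneeded-register masks and its condition /
    // link register allocations applied.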
9548     if(i>0)
9549     {
9550       current.was32=current.is32;
9551       current.wasdirty=current.dirty;
9552       switch(itype[i-1]) {
9553         case UJUMP:
9554           memcpy(&branch_regs[i-1],&current,sizeof(current));
9555           branch_regs[i-1].isconst=0;
9556           branch_regs[i-1].wasconst=0;
9557           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9558           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9559           alloc_cc(&branch_regs[i-1],i-1);
9560           dirty_reg(&branch_regs[i-1],CCREG);
9561           if(rt1[i-1]==31) { // JAL
9562             alloc_reg(&branch_regs[i-1],i-1,31);
9563             dirty_reg(&branch_regs[i-1],31);
9564             branch_regs[i-1].is32|=1LL<<31;
9565           }
9566           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9567           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9568           break;
9569         case RJUMP:
9570           memcpy(&branch_regs[i-1],&current,sizeof(current));
9571           branch_regs[i-1].isconst=0;
9572           branch_regs[i-1].wasconst=0;
9573           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9574           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9575           alloc_cc(&branch_regs[i-1],i-1);
9576           dirty_reg(&branch_regs[i-1],CCREG);
9577           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9578           if(rt1[i-1]!=0) { // JALR
9579             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9580             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9581             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9582           }
9583           #ifdef USE_MINI_HT
9584           if(rs1[i-1]==31) { // JALR
9585             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9586             #ifndef HOST_IMM_ADDR32
9587             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9588             #endif
9589           }
9590           #endif
9591           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9592           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9593           break;
9594         case CJUMP:
9595           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9596           {
9597             alloc_cc(&current,i-1);
9598             dirty_reg(&current,CCREG);
9599             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9600                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9601               // The delay slot overwrote one of our conditions
9602               // Delay slot goes after the test (in order)
9603               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9604               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9605               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9606               current.u|=1;
9607               current.uu|=1;
9608               delayslot_alloc(&current,i);
9609               current.isconst=0;
9610             }
9611             else
9612             {
9613               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9614               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9615               // Alloc the branch condition registers
9616               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9617               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9618               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9619               {
9620                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9621                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9622               }
9623             }
9624             memcpy(&branch_regs[i-1],&current,sizeof(current));
9625             branch_regs[i-1].isconst=0;
9626             branch_regs[i-1].wasconst=0;
9627             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9628             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9629           }
9630           else
9631           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9632           {
9633             alloc_cc(&current,i-1);
9634             dirty_reg(&current,CCREG);
9635             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9636               // The delay slot overwrote the branch condition
9637               // Delay slot goes after the test (in order)
9638               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9639               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9640               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9641               current.u|=1;
9642               current.uu|=1;
9643               delayslot_alloc(&current,i);
9644               current.isconst=0;
9645             }
9646             else
9647             {
9648               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9649               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9650               // Alloc the branch condition register
9651               alloc_reg(&current,i-1,rs1[i-1]);
9652               if(!(current.is32>>rs1[i-1]&1))
9653               {
9654                 alloc_reg64(&current,i-1,rs1[i-1]);
9655               }
9656             }
9657             memcpy(&branch_regs[i-1],&current,sizeof(current));
9658             branch_regs[i-1].isconst=0;
9659             branch_regs[i-1].wasconst=0;
9660             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9661             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9662           }
9663           else
9664           // Alloc the delay slot in case the branch is taken
9665           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9666           {
9667             memcpy(&branch_regs[i-1],&current,sizeof(current));
9668             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9669             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9670             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9671             alloc_cc(&branch_regs[i-1],i);
9672             dirty_reg(&branch_regs[i-1],CCREG);
9673             delayslot_alloc(&branch_regs[i-1],i);
9674             branch_regs[i-1].isconst=0;
9675             alloc_reg(&current,i,CCREG); // Not taken path
9676             dirty_reg(&current,CCREG);
9677             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9678           }
9679           else
9680           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9681           {
9682             memcpy(&branch_regs[i-1],&current,sizeof(current));
9683             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9684             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9685             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9686             alloc_cc(&branch_regs[i-1],i);
9687             dirty_reg(&branch_regs[i-1],CCREG);
9688             delayslot_alloc(&branch_regs[i-1],i);
9689             branch_regs[i-1].isconst=0;
9690             alloc_reg(&current,i,CCREG); // Not taken path
9691             dirty_reg(&current,CCREG);
9692             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9693           }
9694           break;
9695         case SJUMP:
9696           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9697           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9698           {
9699             alloc_cc(&current,i-1);
9700             dirty_reg(&current,CCREG);
9701             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9702               // The delay slot overwrote the branch condition
9703               // Delay slot goes after the test (in order)
9704               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9705               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9706               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9707               current.u|=1;
9708               current.uu|=1;
9709               delayslot_alloc(&current,i);
9710               current.isconst=0;
9711             }
9712             else
9713             {
9714               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9715               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9716               // Alloc the branch condition register
9717               alloc_reg(&current,i-1,rs1[i-1]);
9718               if(!(current.is32>>rs1[i-1]&1))
9719               {
9720                 alloc_reg64(&current,i-1,rs1[i-1]);
9721               }
9722             }
9723             memcpy(&branch_regs[i-1],&current,sizeof(current));
9724             branch_regs[i-1].isconst=0;
9725             branch_regs[i-1].wasconst=0;
9726             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9727             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9728           }
9729           else
9730           // Alloc the delay slot in case the branch is taken
9731           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9732           {
9733             memcpy(&branch_regs[i-1],&current,sizeof(current));
9734             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9735             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9736             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9737             alloc_cc(&branch_regs[i-1],i);
9738             dirty_reg(&branch_regs[i-1],CCREG);
9739             delayslot_alloc(&branch_regs[i-1],i);
9740             branch_regs[i-1].isconst=0;
9741             alloc_reg(&current,i,CCREG); // Not taken path
9742             dirty_reg(&current,CCREG);
9743             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9744           }
9745           // FIXME: BLTZAL/BGEZAL
9746           if(opcode2[i-1]&0x10) { // BxxZAL
9747             alloc_reg(&branch_regs[i-1],i-1,31);
9748             dirty_reg(&branch_regs[i-1],31);
9749             branch_regs[i-1].is32|=1LL<<31;
9750           }
9751           break;
9752         case FJUMP:
9753           if(likely[i-1]==0) // BC1F/BC1T
9754           {
9755             alloc_cc(&current,i-1);
9756             dirty_reg(&current,CCREG);
9757             if(itype[i]==FCOMP) {
9758               // The delay slot overwrote the branch condition
9759               // Delay slot goes after the test (in order)
9760               delayslot_alloc(&current,i);
9761               current.isconst=0;
9762             }
9763             else
9764             {
9765               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9766               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9767               // Alloc the branch condition register
9768               alloc_reg(&current,i-1,FSREG);
9769             }
9770             memcpy(&branch_regs[i-1],&current,sizeof(current));
9771             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9772           }
9773           else // BC1FL/BC1TL
9774           {
9775             // Alloc the delay slot in case the branch is taken
9776             memcpy(&branch_regs[i-1],&current,sizeof(current));
9777             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9778             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9779             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9780             alloc_cc(&branch_regs[i-1],i);
9781             dirty_reg(&branch_regs[i-1],CCREG);
9782             delayslot_alloc(&branch_regs[i-1],i);
9783             branch_regs[i-1].isconst=0;
9784             alloc_reg(&current,i,CCREG); // Not taken path
9785             dirty_reg(&current,CCREG);
9786             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9787           }
9788           break;
9789       }
9790
9791       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9792       {
9793         if(rt1[i-1]==31) // JAL/JALR
9794         {
9795           // Subroutine call will return here, don't alloc any registers
9796           current.is32=1;
9797           current.dirty=0;
9798           clear_all_regs(current.regmap);
9799           alloc_reg(&current,i,CCREG);
9800           dirty_reg(&current,CCREG);
9801         }
9802         else if(i+1<slen)
9803         {
9804           // Internal branch will jump here, match registers to caller
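          // Find an earlier branch whose target is the instruction following
          // this delay slot and adopt its register map; then intersect with
          // every other branch to the same target so only mappings (and
          // is32/dirty bits) common to all predecessors survive.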
9805           current.is32=0x3FFFFFFFFLL;
9806           current.dirty=0;
9807           clear_all_regs(current.regmap);
9808           alloc_reg(&current,i,CCREG);
9809           dirty_reg(&current,CCREG);
9810           for(j=i-1;j>=0;j--)
9811           {
9812             if(ba[j]==start+i*4+4) {
9813               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9814               current.is32=branch_regs[j].is32;
9815               current.dirty=branch_regs[j].dirty;
9816               break;
9817             }
9818           }
9819           while(j>=0) {
9820             if(ba[j]==start+i*4+4) {
9821               for(hr=0;hr<HOST_REGS;hr++) {
9822                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9823                   current.regmap[hr]=-1;
9824                 }
9825                 current.is32&=branch_regs[j].is32;
9826                 current.dirty&=branch_regs[j].dirty;
9827               }
9828             }
9829             j--;
9830           }
9831         }
9832       }
9833     }
9834
9835     // Count cycles in between branches
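    // cc accumulates an estimated cycle count since the last branch/syscall
    // boundary; ccadj[i] records that running total for instruction i, and
    // cc is reset to zero after branches and syscalls below.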
9836     ccadj[i]=cc;
9837     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9838     {
9839       cc=0;
9840     }
9841 #if defined(PCSX) && !defined(DRC_DBG)
9842     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
9843     {
9844       // GTE runs in parallel until accessed, divide by 2 for a rough guess
9845       cc+=gte_cycletab[source[i]&0x3f]/2;
9846     }
9847     else if(/*itype[i]==LOAD||*/itype[i]==STORE||itype[i]==C1LS) // load causes weird timing issues
9848     {
9849       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9850     }
9851     else if(itype[i]==C2LS)
9852     {
9853       cc+=4;
9854     }
9855 #endif
9856     else
9857     {
9858       cc++;
9859     }
9860
9861     flush_dirty_uppers(&current);
9862     if(!is_ds[i]) {
9863       regs[i].is32=current.is32;
9864       regs[i].dirty=current.dirty;
9865       regs[i].isconst=current.isconst;
9866       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
9867     }
9868     for(hr=0;hr<HOST_REGS;hr++) {
9869       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9870         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9871           regs[i].wasconst&=~(1<<hr);
9872         }
9873       }
9874     }
9875     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9876     regs[i].waswritten=current.waswritten;
9877   }
9878   
9879   /* Pass 4 - Cull unused host registers */
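  // Walk the block backwards, computing for each instruction a bitmask nr of
  // host registers whose contents are still needed (live).  Registers that
  // fall out of the mask are removed from regmap / regmap_entry below so
  // later passes can reuse them.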
9880   
9881   uint64_t nr=0;
9882   
9883   for (i=slen-1;i>=0;i--)
9884   {
9885     int hr;
9886     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9887     {
9888       if(ba[i]<start || ba[i]>=(start+slen*4))
9889       {
9890         // Branch out of this block, don't need anything
9891         nr=0;
9892       }
9893       else
9894       {
9895         // Internal branch
9896         // Need whatever matches the target
9897         nr=0;
9898         int t=(ba[i]-start)>>2;
9899         for(hr=0;hr<HOST_REGS;hr++)
9900         {
9901           if(regs[i].regmap_entry[hr]>=0) {
9902             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9903           }
9904         }
9905       }
9906       // Conditional branch may need registers for following instructions
9907       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9908       {
9909         if(i<slen-2) {
9910           nr|=needed_reg[i+2];
9911           for(hr=0;hr<HOST_REGS;hr++)
9912           {
9913             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9914             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9915           }
9916         }
9917       }
9918       // Don't need stuff which is overwritten
9919       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9920       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9921       // Merge in delay slot
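      // A delay-slot destination kills the previous value unless the branch
      // is "likely" (the slot is nullified on the not-taken path, so the old
      // value may still be needed); delay-slot sources are always kept live.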
9922       for(hr=0;hr<HOST_REGS;hr++)
9923       {
9924         if(!likely[i]) {
9925           // These are overwritten unless the branch is "likely"
9926           // and the delay slot is nullified if not taken
9927           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9928           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9929         }
9930         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9931         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9932         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9933         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9934         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9935         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9936         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9937         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9938         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9939           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9940           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9941         }
9942         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9943           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9944           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9945         }
9946         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9947           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9948           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9949         }
9950       }
9951     }
9952     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9953     {
9954       // SYSCALL instruction (software interrupt)
9955       nr=0;
9956     }
9957     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9958     {
9959       // ERET instruction (return from interrupt)
9960       nr=0;
9961     }
9962     else // Non-branch
9963     {
9964       if(i<slen-1) {
9965         for(hr=0;hr<HOST_REGS;hr++) {
9966           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9967           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9968           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9969           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9970         }
9971       }
9972     }
9973     for(hr=0;hr<HOST_REGS;hr++)
9974     {
9975       // Overwritten registers are not needed
9976       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9977       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9978       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9979       // Source registers are needed
9980       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9981       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9982       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9983       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9984       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9985       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9986       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9987       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9988       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9989         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9990         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9991       }
9992       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9993         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9994         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9995       }
9996       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9997         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9998         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9999       }
10000       // Don't store a register immediately after writing it,
10001       // as that may prevent dual-issue.
10002       // But do store it if this is a branch target, otherwise we
10003       // might have to load the register before the branch.
10004       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
10005         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
10006            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
10007           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10008           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10009         }
10010         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
10011            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
10012           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10013           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10014         }
10015       }
10016     }
10017     // Cycle count is needed at branches.  Assume it is needed at the target too.
10018     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
10019       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10020       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10021     }
10022     // Save it
10023     needed_reg[i]=nr;
10024     
10025     // Deallocate unneeded registers
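    // A host register missing from nr can be freed, but only if it is not one
    // of this instruction's operands (or, across a branch, the delay slot's
    // operands) nor one of the special map/temp registers checked below.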
10026     for(hr=0;hr<HOST_REGS;hr++)
10027     {
10028       if(!((nr>>hr)&1)) {
10029         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
10030         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10031            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10032            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
10033         {
10034           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10035           {
10036             if(likely[i]) {
10037               regs[i].regmap[hr]=-1;
10038               regs[i].isconst&=~(1<<hr);
10039               if(i<slen-2) {
10040                 regmap_pre[i+2][hr]=-1;
10041                 regs[i+2].wasconst&=~(1<<hr);
10042               }
10043             }
10044           }
10045         }
10046         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10047         {
10048           int d1=0,d2=0,map=0,temp=0;
10049           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
10050           {
10051             d1=dep1[i+1];
10052             d2=dep2[i+1];
10053           }
10054           if(using_tlb) {
10055             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
10056                itype[i+1]==STORE || itype[i+1]==STORELR ||
10057                itype[i+1]==C1LS || itype[i+1]==C2LS)
10058             map=TLREG;
10059           } else
10060           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
10061              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10062             map=INVCP;
10063           }
10064           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
10065              itype[i+1]==C1LS || itype[i+1]==C2LS)
10066             temp=FTEMP;
10067           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10068              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10069              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
10070              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
10071              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10072              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
10073              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
10074              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
10075              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
10076              regs[i].regmap[hr]!=map )
10077           {
10078             regs[i].regmap[hr]=-1;
10079             regs[i].isconst&=~(1<<hr);
10080             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
10081                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
10082                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
10083                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
10084                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
10085                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
10086                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
10087                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
10088                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
10089                branch_regs[i].regmap[hr]!=map)
10090             {
10091               branch_regs[i].regmap[hr]=-1;
10092               branch_regs[i].regmap_entry[hr]=-1;
10093               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10094               {
10095                 if(!likely[i]&&i<slen-2) {
10096                   regmap_pre[i+2][hr]=-1;
10097                   regs[i+2].wasconst&=~(1<<hr);
10098                 }
10099               }
10100             }
10101           }
10102         }
10103         else
10104         {
10105           // Non-branch
10106           if(i>0)
10107           {
10108             int d1=0,d2=0,map=-1,temp=-1;
10109             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
10110             {
10111               d1=dep1[i];
10112               d2=dep2[i];
10113             }
10114             if(using_tlb) {
10115               if(itype[i]==LOAD || itype[i]==LOADLR ||
10116                  itype[i]==STORE || itype[i]==STORELR ||
10117                  itype[i]==C1LS || itype[i]==C2LS)
10118               map=TLREG;
10119             } else if(itype[i]==STORE || itype[i]==STORELR ||
10120                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10121               map=INVCP;
10122             }
10123             if(itype[i]==LOADLR || itype[i]==STORELR ||
10124                itype[i]==C1LS || itype[i]==C2LS)
10125               temp=FTEMP;
10126             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10127                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10128                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10129                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10130                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10131                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10132             {
10133               if(i<slen-1&&!is_ds[i]) {
10134                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10135                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10136                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10137                 {
10138                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10139                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10140                 }
10141                 regmap_pre[i+1][hr]=-1;
10142                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10143                 regs[i+1].wasconst&=~(1<<hr);
10144               }
10145               regs[i].regmap[hr]=-1;
10146               regs[i].isconst&=~(1<<hr);
10147             }
10148           }
10149         }
10150       }
10151     }
10152   }
10153   
10154   /* Pass 5 - Pre-allocate registers */
10155   
10156   // If a register is allocated during a loop, try to allocate it for the
10157   // entire loop, if possible.  This avoids loading/storing registers
10158   // inside the loop.
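  // f_regmap holds the candidate block-wide mapping.  For a backward branch,
  // t is the loop head; if the mapping at the branch can be traced back to t
  // without conflicts, the intervening instructions are filled in so the
  // value stays resident in that host register across loop iterations.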
10159   
10160   signed char f_regmap[HOST_REGS];
10161   clear_all_regs(f_regmap);
10162   for(i=0;i<slen-1;i++)
10163   {
10164     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10165     {
10166       if(ba[i]>=start && ba[i]<(start+i*4)) 
10167       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10168       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10169       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10170       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10171       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10172       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10173       {
10174         int t=(ba[i]-start)>>2;
10175         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10176         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10177         for(hr=0;hr<HOST_REGS;hr++)
10178         {
10179           if(regs[i].regmap[hr]>64) {
10180             if(!((regs[i].dirty>>hr)&1))
10181               f_regmap[hr]=regs[i].regmap[hr];
10182             else f_regmap[hr]=-1;
10183           }
10184           else if(regs[i].regmap[hr]>=0) {
10185             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10186               // dealloc old register
10187               int n;
10188               for(n=0;n<HOST_REGS;n++)
10189               {
10190                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10191               }
10192               // and alloc new one
10193               f_regmap[hr]=regs[i].regmap[hr];
10194             }
10195           }
10196           if(branch_regs[i].regmap[hr]>64) {
10197             if(!((branch_regs[i].dirty>>hr)&1))
10198               f_regmap[hr]=branch_regs[i].regmap[hr];
10199             else f_regmap[hr]=-1;
10200           }
10201           else if(branch_regs[i].regmap[hr]>=0) {
10202             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10203               // dealloc old register
10204               int n;
10205               for(n=0;n<HOST_REGS;n++)
10206               {
10207                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10208               }
10209               // and alloc new one
10210               f_regmap[hr]=branch_regs[i].regmap[hr];
10211             }
10212           }
10213           if(ooo[i]) {
10214             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10215               f_regmap[hr]=branch_regs[i].regmap[hr];
10216           }else{
10217             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10218               f_regmap[hr]=branch_regs[i].regmap[hr];
10219           }
10220           // Avoid dirty->clean transition
10221           #ifdef DESTRUCTIVE_WRITEBACK
10222           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10223           #endif
10224           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10225           // case above; however, it's always a good idea.  We can't hoist the
10226           // load if the register was already allocated, so there's no point
10227           // wasting time analyzing most of these cases.  It only "succeeds"
10228           // when the mapping was different and the load can be replaced with
10229           // a mov, which is of negligible benefit.  So such cases are
10230           // skipped below.
10231           if(f_regmap[hr]>0) {
10232             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10233               int r=f_regmap[hr];
10234               for(j=t;j<=i;j++)
10235               {
10236                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10237                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10238                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10239                 if(r>63) {
10240                   // NB This can exclude the case where the upper-half
10241                   // register is lower numbered than the lower-half
10242                   // register.  Not sure if it's worth fixing...
10243                   if(get_reg(regs[j].regmap,r&63)<0) break;
10244                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10245                   if(regs[j].is32&(1LL<<(r&63))) break;
10246                 }
10247                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10248                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10249                   int k;
10250                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10251                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10252                     if(r>63) {
10253                       if(get_reg(regs[i].regmap,r&63)<0) break;
10254                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10255                     }
10256                     k=i;
10257                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10258                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10259                         //printf("no free regs for store %x\n",start+(k-1)*4);
10260                         break;
10261                       }
10262                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10263                         //printf("no-match due to different register\n");
10264                         break;
10265                       }
10266                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10267                         //printf("no-match due to branch\n");
10268                         break;
10269                       }
10270                       // call/ret fast path assumes no registers allocated
10271                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10272                         break;
10273                       }
10274                       if(r>63) {
10275                         // NB This can exclude the case where the upper-half
10276                         // register is lower numbered than the lower-half
10277                         // register.  Not sure if it's worth fixing...
10278                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10279                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10280                       }
10281                       k--;
10282                     }
10283                     if(i<slen-1) {
10284                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10285                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10286                         //printf("bad match after branch\n");
10287                         break;
10288                       }
10289                     }
10290                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10291                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10292                       while(k<i) {
10293                         regs[k].regmap_entry[hr]=f_regmap[hr];
10294                         regs[k].regmap[hr]=f_regmap[hr];
10295                         regmap_pre[k+1][hr]=f_regmap[hr];
10296                         regs[k].wasdirty&=~(1<<hr);
10297                         regs[k].dirty&=~(1<<hr);
10298                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10299                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10300                         regs[k].wasconst&=~(1<<hr);
10301                         regs[k].isconst&=~(1<<hr);
10302                         k++;
10303                       }
10304                     }
10305                     else {
10306                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10307                       break;
10308                     }
10309                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10310                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10311                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10312                       regs[i].regmap_entry[hr]=f_regmap[hr];
10313                       regs[i].regmap[hr]=f_regmap[hr];
10314                       regs[i].wasdirty&=~(1<<hr);
10315                       regs[i].dirty&=~(1<<hr);
10316                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10317                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10318                       regs[i].wasconst&=~(1<<hr);
10319                       regs[i].isconst&=~(1<<hr);
10320                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10321                       branch_regs[i].wasdirty&=~(1<<hr);
10322                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10323                       branch_regs[i].regmap[hr]=f_regmap[hr];
10324                       branch_regs[i].dirty&=~(1<<hr);
10325                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10326                       branch_regs[i].wasconst&=~(1<<hr);
10327                       branch_regs[i].isconst&=~(1<<hr);
10328                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10329                         regmap_pre[i+2][hr]=f_regmap[hr];
10330                         regs[i+2].wasdirty&=~(1<<hr);
10331                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10332                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10333                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10334                       }
10335                     }
10336                   }
10337                   for(k=t;k<j;k++) {
10338                     // Alloc register clean at beginning of loop,
10339                     // but may dirty it in pass 6
10340                     regs[k].regmap_entry[hr]=f_regmap[hr];
10341                     regs[k].regmap[hr]=f_regmap[hr];
10342                     regs[k].dirty&=~(1<<hr);
10343                     regs[k].wasconst&=~(1<<hr);
10344                     regs[k].isconst&=~(1<<hr);
10345                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10346                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10347                       branch_regs[k].regmap[hr]=f_regmap[hr];
10348                       branch_regs[k].dirty&=~(1<<hr);
10349                       branch_regs[k].wasconst&=~(1<<hr);
10350                       branch_regs[k].isconst&=~(1<<hr);
10351                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10352                         regmap_pre[k+2][hr]=f_regmap[hr];
10353                         regs[k+2].wasdirty&=~(1<<hr);
10354                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10355                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10356                       }
10357                     }
10358                     else
10359                     {
10360                       regmap_pre[k+1][hr]=f_regmap[hr];
10361                       regs[k+1].wasdirty&=~(1<<hr);
10362                     }
10363                   }
10364                   if(regs[j].regmap[hr]==f_regmap[hr])
10365                     regs[j].regmap_entry[hr]=f_regmap[hr];
10366                   break;
10367                 }
10368                 if(j==i) break;
10369                 if(regs[j].regmap[hr]>=0)
10370                   break;
10371                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10372                   //printf("no-match due to different register\n");
10373                   break;
10374                 }
10375                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10376                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10377                   break;
10378                 }
10379                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10380                 {
10381                   // Stop on unconditional branch
10382                   break;
10383                 }
10384                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10385                 {
10386                   if(ooo[j]) {
10387                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10388                       break;
10389                   }else{
10390                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10391                       break;
10392                   }
10393                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10394                     //printf("no-match due to different register (branch)\n");
10395                     break;
10396                   }
10397                 }
10398                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10399                   //printf("No free regs for store %x\n",start+j*4);
10400                   break;
10401                 }
10402                 if(f_regmap[hr]>=64) {
10403                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10404                     break;
10405                   }
10406                   else
10407                   {
10408                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10409                       break;
10410                     }
10411                   }
10412                 }
10413               }
10414             }
10415           }
10416         }
10417       }
10418     }else{
10419       // Non-branch or undetermined branch target
10420       for(hr=0;hr<HOST_REGS;hr++)
10421       {
10422         if(hr!=EXCLUDE_REG) {
10423           if(regs[i].regmap[hr]>64) {
10424             if(!((regs[i].dirty>>hr)&1))
10425               f_regmap[hr]=regs[i].regmap[hr];
10426           }
10427           else if(regs[i].regmap[hr]>=0) {
10428             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10429               // dealloc old register
10430               int n;
10431               for(n=0;n<HOST_REGS;n++)
10432               {
10433                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10434               }
10435               // and alloc new one
10436               f_regmap[hr]=regs[i].regmap[hr];
10437             }
10438           }
10439         }
10440       }
10441       // Try to restore cycle count at branch targets
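      // At a branch target, look ahead for the next point where HOST_CCREG is
      // in use; if it already holds CCREG there, keep the cycle count in
      // HOST_CCREG over the whole gap (and, below, extend it backwards) so it
      // doesn't have to be spilled and reloaded around the target.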
10442       if(bt[i]) {
10443         for(j=i;j<slen-1;j++) {
10444           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10445           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10446             //printf("no free regs for store %x\n",start+j*4);
10447             break;
10448           }
10449         }
10450         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10451           int k=i;
10452           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10453           while(k<j) {
10454             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10455             regs[k].regmap[HOST_CCREG]=CCREG;
10456             regmap_pre[k+1][HOST_CCREG]=CCREG;
10457             regs[k+1].wasdirty|=1<<HOST_CCREG;
10458             regs[k].dirty|=1<<HOST_CCREG;
10459             regs[k].wasconst&=~(1<<HOST_CCREG);
10460             regs[k].isconst&=~(1<<HOST_CCREG);
10461             k++;
10462           }
10463           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10464         }
10465         // Work backwards from the branch target
10466         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10467         {
10468           //printf("Extend backwards\n");
10469           int k;
10470           k=i;
10471           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10472             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10473               //printf("no free regs for store %x\n",start+(k-1)*4);
10474               break;
10475             }
10476             k--;
10477           }
10478           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10479             //printf("Extend CC, %x ->\n",start+k*4);
10480             while(k<=i) {
10481               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10482               regs[k].regmap[HOST_CCREG]=CCREG;
10483               regmap_pre[k+1][HOST_CCREG]=CCREG;
10484               regs[k+1].wasdirty|=1<<HOST_CCREG;
10485               regs[k].dirty|=1<<HOST_CCREG;
10486               regs[k].wasconst&=~(1<<HOST_CCREG);
10487               regs[k].isconst&=~(1<<HOST_CCREG);
10488               k++;
10489             }
10490           }
10491           else {
10492             //printf("Fail Extend CC, %x ->\n",start+k*4);
10493           }
10494         }
10495       }
10496       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10497          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10498          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10499          itype[i]!=FCONV&&itype[i]!=FCOMP)
10500       {
10501         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10502       }
10503     }
10504   }
10505   
10506   // Cache memory offset or tlb map pointer if a register is available
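  // Greedy heuristic: for every host register, score how many nearby memory
  // accesses (and loop back-edges) could reuse it for the RAM offset (ROREG)
  // or TLB map pointer (MMREG); the best-scoring register is then
  // pre-allocated over the scored range.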
10507   #ifndef HOST_IMM_ADDR32
10508   #ifndef RAM_OFFSET
10509   if(using_tlb)
10510   #endif
10511   {
10512     int earliest_available[HOST_REGS];
10513     int loop_start[HOST_REGS];
10514     int score[HOST_REGS];
10515     int end[HOST_REGS];
10516     int reg=using_tlb?MMREG:ROREG;
10517
10518     // Init
10519     for(hr=0;hr<HOST_REGS;hr++) {
10520       score[hr]=0;earliest_available[hr]=0;
10521       loop_start[hr]=MAXBLOCK;
10522     }
10523     for(i=0;i<slen-1;i++)
10524     {
10525       // Can't do anything if no registers are available
10526       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10527         for(hr=0;hr<HOST_REGS;hr++) {
10528           score[hr]=0;earliest_available[hr]=i+1;
10529           loop_start[hr]=MAXBLOCK;
10530         }
10531       }
10532       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10533         if(!ooo[i]) {
10534           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10535             for(hr=0;hr<HOST_REGS;hr++) {
10536               score[hr]=0;earliest_available[hr]=i+1;
10537               loop_start[hr]=MAXBLOCK;
10538             }
10539           }
10540         }else{
10541           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10542             for(hr=0;hr<HOST_REGS;hr++) {
10543               score[hr]=0;earliest_available[hr]=i+1;
10544               loop_start[hr]=MAXBLOCK;
10545             }
10546           }
10547         }
10548       }
10549       // Mark unavailable registers
10550       for(hr=0;hr<HOST_REGS;hr++) {
10551         if(regs[i].regmap[hr]>=0) {
10552           score[hr]=0;earliest_available[hr]=i+1;
10553           loop_start[hr]=MAXBLOCK;
10554         }
10555         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10556           if(branch_regs[i].regmap[hr]>=0) {
10557             score[hr]=0;earliest_available[hr]=i+2;
10558             loop_start[hr]=MAXBLOCK;
10559           }
10560         }
10561       }
10562       // No register allocations after unconditional jumps
10563       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10564       {
10565         for(hr=0;hr<HOST_REGS;hr++) {
10566           score[hr]=0;earliest_available[hr]=i+2;
10567           loop_start[hr]=MAXBLOCK;
10568         }
10569         i++; // Skip delay slot too
10570         //printf("skip delay slot: %x\n",start+i*4);
10571       }
10572       else
10573       // Possible match
10574       if(itype[i]==LOAD||itype[i]==LOADLR||
10575          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10576         for(hr=0;hr<HOST_REGS;hr++) {
10577           if(hr!=EXCLUDE_REG) {
10578             end[hr]=i-1;
10579             for(j=i;j<slen-1;j++) {
10580               if(regs[j].regmap[hr]>=0) break;
10581               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10582                 if(branch_regs[j].regmap[hr]>=0) break;
10583                 if(ooo[j]) {
10584                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10585                 }else{
10586                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10587                 }
10588               }
10589               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10590               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10591                 int t=(ba[j]-start)>>2;
10592                 if(t<j&&t>=earliest_available[hr]) {
10593                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10594                     // Score a point for hoisting loop invariant
10595                     if(t<loop_start[hr]) loop_start[hr]=t;
10596                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10597                     score[hr]++;
10598                     end[hr]=j;
10599                   }
10600                 }
10601                 else if(t<j) {
10602                   if(regs[t].regmap[hr]==reg) {
10603                     // Score a point if the branch target matches this register
10604                     score[hr]++;
10605                     end[hr]=j;
10606                   }
10607                 }
10608                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10609                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10610                   score[hr]++;
10611                   end[hr]=j;
10612                 }
10613               }
10614               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10615               {
10616                 // Stop on unconditional branch
10617                 break;
10618               }
10619               else
10620               if(itype[j]==LOAD||itype[j]==LOADLR||
10621                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10622                 score[hr]++;
10623                 end[hr]=j;
10624               }
10625             }
10626           }
10627         }
10628         // Find highest score and allocate that register
10629         int maxscore=0;
10630         for(hr=0;hr<HOST_REGS;hr++) {
10631           if(hr!=EXCLUDE_REG) {
10632             if(score[hr]>score[maxscore]) {
10633               maxscore=hr;
10634               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10635             }
10636           }
10637         }
10638         if(score[maxscore]>1)
10639         {
10640           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10641           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10642             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10643             assert(regs[j].regmap[maxscore]<0);
10644             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10645             regs[j].regmap[maxscore]=reg;
10646             regs[j].dirty&=~(1<<maxscore);
10647             regs[j].wasconst&=~(1<<maxscore);
10648             regs[j].isconst&=~(1<<maxscore);
10649             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10650               branch_regs[j].regmap[maxscore]=reg;
10651               branch_regs[j].wasdirty&=~(1<<maxscore);
10652               branch_regs[j].dirty&=~(1<<maxscore);
10653               branch_regs[j].wasconst&=~(1<<maxscore);
10654               branch_regs[j].isconst&=~(1<<maxscore);
10655               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10656                 regmap_pre[j+2][maxscore]=reg;
10657                 regs[j+2].wasdirty&=~(1<<maxscore);
10658               }
10659               // loop optimization (loop_preload)
10660               int t=(ba[j]-start)>>2;
10661               if(t==loop_start[maxscore]) {
10662                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10663                   regs[t].regmap_entry[maxscore]=reg;
10664               }
10665             }
10666             else
10667             {
10668               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10669                 regmap_pre[j+1][maxscore]=reg;
10670                 regs[j+1].wasdirty&=~(1<<maxscore);
10671               }
10672             }
10673           }
10674           i=j-1;
10675           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10676           for(hr=0;hr<HOST_REGS;hr++) {
10677             score[hr]=0;earliest_available[hr]=i+1;
10678             loop_start[hr]=MAXBLOCK;
10679           }
10680         }
10681       }
10682     }
10683   }
10684   #endif
10685   
10686   // This allocates registers (if possible) one instruction prior
10687   // to use, which can avoid a load-use penalty on certain CPUs.
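  // This is done by copying the next instruction's register mapping into the
  // current instruction's map (and into regmap_entry/regmap_pre), so the
  // value is already loaded by the time the next instruction needs it.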
10688   for(i=0;i<slen-1;i++)
10689   {
10690     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10691     {
10692       if(!bt[i+1])
10693       {
10694         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10695            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10696         {
10697           if(rs1[i+1]) {
10698             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10699             {
10700               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10701               {
10702                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10703                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10704                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10705                 regs[i].isconst&=~(1<<hr);
10706                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10707                 constmap[i][hr]=constmap[i+1][hr];
10708                 regs[i+1].wasdirty&=~(1<<hr);
10709                 regs[i].dirty&=~(1<<hr);
10710               }
10711             }
10712           }
10713           if(rs2[i+1]) {
10714             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10715             {
10716               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10717               {
10718                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10719                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10720                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10721                 regs[i].isconst&=~(1<<hr);
10722                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10723                 constmap[i][hr]=constmap[i+1][hr];
10724                 regs[i+1].wasdirty&=~(1<<hr);
10725                 regs[i].dirty&=~(1<<hr);
10726               }
10727             }
10728           }
10729           // Preload target address for load instruction (non-constant)
10730           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10731             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10732             {
10733               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10734               {
10735                 regs[i].regmap[hr]=rs1[i+1];
10736                 regmap_pre[i+1][hr]=rs1[i+1];
10737                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10738                 regs[i].isconst&=~(1<<hr);
10739                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10740                 constmap[i][hr]=constmap[i+1][hr];
10741                 regs[i+1].wasdirty&=~(1<<hr);
10742                 regs[i].dirty&=~(1<<hr);
10743               }
10744             }
10745           }
10746           // Load source into target register 
10747           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10748             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10749             {
10750               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10751               {
10752                 regs[i].regmap[hr]=rs1[i+1];
10753                 regmap_pre[i+1][hr]=rs1[i+1];
10754                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10755                 regs[i].isconst&=~(1<<hr);
10756                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10757                 constmap[i][hr]=constmap[i+1][hr];
10758                 regs[i+1].wasdirty&=~(1<<hr);
10759                 regs[i].dirty&=~(1<<hr);
10760               }
10761             }
10762           }
10763           // Preload map address
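          // (TLREG holds the memory map / TLB lookup value for the next
          // memory access; when its address register is already known to be
          // constant, generate the map value one instruction early under an
          // MGEN scratch mapping.)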
10764           #ifndef HOST_IMM_ADDR32
10765           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10766             hr=get_reg(regs[i+1].regmap,TLREG);
10767             if(hr>=0) {
10768               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10769               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10770                 int nr;
10771                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10772                 {
10773                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10774                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10775                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10776                   regs[i].isconst&=~(1<<hr);
10777                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10778                   constmap[i][hr]=constmap[i+1][hr];
10779                   regs[i+1].wasdirty&=~(1<<hr);
10780                   regs[i].dirty&=~(1<<hr);
10781                 }
10782                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10783                 {
10784                   // move it to another register
10785                   regs[i+1].regmap[hr]=-1;
10786                   regmap_pre[i+2][hr]=-1;
10787                   regs[i+1].regmap[nr]=TLREG;
10788                   regmap_pre[i+2][nr]=TLREG;
10789                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10790                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10791                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10792                   regs[i].isconst&=~(1<<nr);
10793                   regs[i+1].isconst&=~(1<<nr);
10794                   regs[i].dirty&=~(1<<nr);
10795                   regs[i+1].wasdirty&=~(1<<nr);
10796                   regs[i+1].dirty&=~(1<<nr);
10797                   regs[i+2].wasdirty&=~(1<<nr);
10798                 }
10799               }
10800             }
10801           }
10802           #endif
10803           // Address for store instruction (non-constant)
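          // (Preload the store's base register one instruction early into the
          // host register that will be used for address generation; AGEN1 and
          // AGEN2 are scratch "address generation" mappings.)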
10804           if(itype[i+1]==STORE||itype[i+1]==STORELR
10805              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10806             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10807               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10808               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10809               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10810               assert(hr>=0);
10811               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10812               {
10813                 regs[i].regmap[hr]=rs1[i+1];
10814                 regmap_pre[i+1][hr]=rs1[i+1];
10815                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10816                 regs[i].isconst&=~(1<<hr);
10817                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10818                 constmap[i][hr]=constmap[i+1][hr];
10819                 regs[i+1].wasdirty&=~(1<<hr);
10820                 regs[i].dirty&=~(1<<hr);
10821               }
10822             }
10823           }
10824           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10825             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10826               int nr;
10827               hr=get_reg(regs[i+1].regmap,FTEMP);
10828               assert(hr>=0);
10829               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10830               {
10831                 regs[i].regmap[hr]=rs1[i+1];
10832                 regmap_pre[i+1][hr]=rs1[i+1];
10833                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10834                 regs[i].isconst&=~(1<<hr);
10835                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10836                 constmap[i][hr]=constmap[i+1][hr];
10837                 regs[i+1].wasdirty&=~(1<<hr);
10838                 regs[i].dirty&=~(1<<hr);
10839               }
10840               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10841               {
10842                 // move it to another register
10843                 regs[i+1].regmap[hr]=-1;
10844                 regmap_pre[i+2][hr]=-1;
10845                 regs[i+1].regmap[nr]=FTEMP;
10846                 regmap_pre[i+2][nr]=FTEMP;
10847                 regs[i].regmap[nr]=rs1[i+1];
10848                 regmap_pre[i+1][nr]=rs1[i+1];
10849                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10850                 regs[i].isconst&=~(1<<nr);
10851                 regs[i+1].isconst&=~(1<<nr);
10852                 regs[i].dirty&=~(1<<nr);
10853                 regs[i+1].wasdirty&=~(1<<nr);
10854                 regs[i+1].dirty&=~(1<<nr);
10855                 regs[i+2].wasdirty&=~(1<<nr);
10856               }
10857             }
10858           }
10859           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10860             if(itype[i+1]==LOAD) 
10861               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10862             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10863               hr=get_reg(regs[i+1].regmap,FTEMP);
10864             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10865               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10866               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10867             }
10868             if(hr>=0&&regs[i].regmap[hr]<0) {
10869               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10870               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10871                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10872                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10873                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10874                 regs[i].isconst&=~(1<<hr);
10875                 regs[i+1].wasdirty&=~(1<<hr);
10876                 regs[i].dirty&=~(1<<hr);
10877               }
10878             }
10879           }
10880         }
10881       }
10882     }
10883   }
10884   
10885   /* Pass 6 - Optimize clean/dirty state */
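  // Work out where dirty (modified but not yet written back) host registers
  // can remain dirty and where they must be flushed, so that every branch
  // target is entered with a consistent clean/dirty state.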
10886   clean_registers(0,slen-1,1);
10887   
10888   /* Pass 7 - Identify 32-bit registers */
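  // For 64-bit (non-FORCE32) builds, walk the block backwards and record in
  // requires_32bit[] which registers must be known to hold valid 32-bit
  // (sign-extended) values at each entry point; such entry points get
  // restricted jump_in entries later on.  Under FORCE32 (PSX) everything is
  // 32-bit, so only branch targets after conditional branches are marked.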
10889 #ifndef FORCE32
10890   provisional_r32();
10891
10892   uint64_t r32=0;
10893   
10894   for (i=slen-1;i>=0;i--)
10895   {
10896     int hr;
10897     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10898     {
10899       if(ba[i]<start || ba[i]>=(start+slen*4))
10900       {
10901         // Branch out of this block, don't need anything
10902         r32=0;
10903       }
10904       else
10905       {
10906         // Internal branch
10907         // Need whatever matches the target
10908         // (and doesn't get overwritten by the delay slot instruction)
10909         r32=0;
10910         int t=(ba[i]-start)>>2;
10911         if(ba[i]>start+i*4) {
10912           // Forward branch
10913           if(!(requires_32bit[t]&~regs[i].was32))
10914             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10915         }else{
10916           // Backward branch
10917           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10918           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10919           if(!(pr32[t]&~regs[i].was32))
10920             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10921         }
10922       }
10923       // Conditional branch may need registers for following instructions
10924       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10925       {
10926         if(i<slen-2) {
10927           r32|=requires_32bit[i+2];
10928           r32&=regs[i].was32;
10929           // Mark this address as a branch target since it may be called
10930           // upon return from interrupt
10931           bt[i+2]=1;
10932         }
10933       }
10934       // Merge in delay slot
10935       if(!likely[i]) {
10936         // These are overwritten unless the branch is "likely"
10937         // and the delay slot is nullified if not taken
10938         r32&=~(1LL<<rt1[i+1]);
10939         r32&=~(1LL<<rt2[i+1]);
10940       }
10941       // Assume these are needed (delay slot)
10942       if(us1[i+1]>0)
10943       {
10944         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10945       }
10946       if(us2[i+1]>0)
10947       {
10948         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10949       }
10950       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10951       {
10952         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10953       }
10954       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10955       {
10956         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10957       }
10958     }
10959     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10960     {
10961       // SYSCALL instruction (software interrupt)
10962       r32=0;
10963     }
10964     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10965     {
10966       // ERET instruction (return from interrupt)
10967       r32=0;
10968     }
10969     // Check 32 bits
10970     r32&=~(1LL<<rt1[i]);
10971     r32&=~(1LL<<rt2[i]);
10972     if(us1[i]>0)
10973     {
10974       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10975     }
10976     if(us2[i]>0)
10977     {
10978       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10979     }
10980     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10981     {
10982       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10983     }
10984     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10985     {
10986       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10987     }
10988     requires_32bit[i]=r32;
10989     
10990     // Dirty registers which are 32-bit require 32-bit input,
10991     // as they will be written back as 32-bit values
10992     for(hr=0;hr<HOST_REGS;hr++)
10993     {
10994       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10995         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10996           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10997           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10998         }
10999       }
11000     }
11001     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
11002   }
11003 #else
11004   for (i=slen-1;i>=0;i--)
11005   {
11006     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11007     {
11008       // Conditional branch
11009       if((source[i]>>16)!=0x1000&&i<slen-2) {
11010         // Mark this address as a branch target since it may be called
11011         // upon return from interrupt
11012         bt[i+2]=1;
11013       }
11014     }
11015   }
11016 #endif
11017
11018   if(itype[slen-1]==SPAN) {
11019     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
11020   }
11021
11022 #ifdef DISASM
11023   /* Debug/disassembly */
11024   for(i=0;i<slen;i++)
11025   {
11026     printf("U:");
11027     int r;
11028     for(r=1;r<=CCREG;r++) {
11029       if((unneeded_reg[i]>>r)&1) {
11030         if(r==HIREG) printf(" HI");
11031         else if(r==LOREG) printf(" LO");
11032         else printf(" r%d",r);
11033       }
11034     }
11035 #ifndef FORCE32
11036     printf(" UU:");
11037     for(r=1;r<=CCREG;r++) {
11038       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
11039         if(r==HIREG) printf(" HI");
11040         else if(r==LOREG) printf(" LO");
11041         else printf(" r%d",r);
11042       }
11043     }
11044     printf(" 32:");
11045     for(r=0;r<=CCREG;r++) {
11046       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11047       if((regs[i].was32>>r)&1) {
11048         if(r==CCREG) printf(" CC");
11049         else if(r==HIREG) printf(" HI");
11050         else if(r==LOREG) printf(" LO");
11051         else printf(" r%d",r);
11052       }
11053     }
11054 #endif
11055     printf("\n");
11056     #if defined(__i386__) || defined(__x86_64__)
11057     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
11058     #endif
11059     #ifdef __arm__
11060     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
11061     #endif
11062     printf("needs: ");
11063     if(needed_reg[i]&1) printf("eax ");
11064     if((needed_reg[i]>>1)&1) printf("ecx ");
11065     if((needed_reg[i]>>2)&1) printf("edx ");
11066     if((needed_reg[i]>>3)&1) printf("ebx ");
11067     if((needed_reg[i]>>5)&1) printf("ebp ");
11068     if((needed_reg[i]>>6)&1) printf("esi ");
11069     if((needed_reg[i]>>7)&1) printf("edi ");
11070     printf("r:");
11071     for(r=0;r<=CCREG;r++) {
11072       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11073       if((requires_32bit[i]>>r)&1) {
11074         if(r==CCREG) printf(" CC");
11075         else if(r==HIREG) printf(" HI");
11076         else if(r==LOREG) printf(" LO");
11077         else printf(" r%d",r);
11078       }
11079     }
11080     printf("\n");
11081     /*printf("pr:");
11082     for(r=0;r<=CCREG;r++) {
11083       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11084       if((pr32[i]>>r)&1) {
11085         if(r==CCREG) printf(" CC");
11086         else if(r==HIREG) printf(" HI");
11087         else if(r==LOREG) printf(" LO");
11088         else printf(" r%d",r);
11089       }
11090     }
11091     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
11092     printf("\n");*/
11093     #if defined(__i386__) || defined(__x86_64__)
11094     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
11095     printf("dirty: ");
11096     if(regs[i].wasdirty&1) printf("eax ");
11097     if((regs[i].wasdirty>>1)&1) printf("ecx ");
11098     if((regs[i].wasdirty>>2)&1) printf("edx ");
11099     if((regs[i].wasdirty>>3)&1) printf("ebx ");
11100     if((regs[i].wasdirty>>5)&1) printf("ebp ");
11101     if((regs[i].wasdirty>>6)&1) printf("esi ");
11102     if((regs[i].wasdirty>>7)&1) printf("edi ");
11103     #endif
11104     #ifdef __arm__
11105     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
11106     printf("dirty: ");
11107     if(regs[i].wasdirty&1) printf("r0 ");
11108     if((regs[i].wasdirty>>1)&1) printf("r1 ");
11109     if((regs[i].wasdirty>>2)&1) printf("r2 ");
11110     if((regs[i].wasdirty>>3)&1) printf("r3 ");
11111     if((regs[i].wasdirty>>4)&1) printf("r4 ");
11112     if((regs[i].wasdirty>>5)&1) printf("r5 ");
11113     if((regs[i].wasdirty>>6)&1) printf("r6 ");
11114     if((regs[i].wasdirty>>7)&1) printf("r7 ");
11115     if((regs[i].wasdirty>>8)&1) printf("r8 ");
11116     if((regs[i].wasdirty>>9)&1) printf("r9 ");
11117     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11118     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11119     #endif
11120     printf("\n");
11121     disassemble_inst(i);
11122     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11123     #if defined(__i386__) || defined(__x86_64__)
11124     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11125     if(regs[i].dirty&1) printf("eax ");
11126     if((regs[i].dirty>>1)&1) printf("ecx ");
11127     if((regs[i].dirty>>2)&1) printf("edx ");
11128     if((regs[i].dirty>>3)&1) printf("ebx ");
11129     if((regs[i].dirty>>5)&1) printf("ebp ");
11130     if((regs[i].dirty>>6)&1) printf("esi ");
11131     if((regs[i].dirty>>7)&1) printf("edi ");
11132     #endif
11133     #ifdef __arm__
11134     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11135     if(regs[i].dirty&1) printf("r0 ");
11136     if((regs[i].dirty>>1)&1) printf("r1 ");
11137     if((regs[i].dirty>>2)&1) printf("r2 ");
11138     if((regs[i].dirty>>3)&1) printf("r3 ");
11139     if((regs[i].dirty>>4)&1) printf("r4 ");
11140     if((regs[i].dirty>>5)&1) printf("r5 ");
11141     if((regs[i].dirty>>6)&1) printf("r6 ");
11142     if((regs[i].dirty>>7)&1) printf("r7 ");
11143     if((regs[i].dirty>>8)&1) printf("r8 ");
11144     if((regs[i].dirty>>9)&1) printf("r9 ");
11145     if((regs[i].dirty>>10)&1) printf("r10 ");
11146     if((regs[i].dirty>>12)&1) printf("r12 ");
11147     #endif
11148     printf("\n");
11149     if(regs[i].isconst) {
11150       printf("constants: ");
11151       #if defined(__i386__) || defined(__x86_64__)
11152       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11153       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11154       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11155       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11156       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11157       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11158       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11159       #endif
11160       #ifdef __arm__
11161       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11162       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11163       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11164       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11165       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11166       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11167       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11168       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11169       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11170       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11171       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11172       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11173       #endif
11174       printf("\n");
11175     }
11176 #ifndef FORCE32
11177     printf(" 32:");
11178     for(r=0;r<=CCREG;r++) {
11179       if((regs[i].is32>>r)&1) {
11180         if(r==CCREG) printf(" CC");
11181         else if(r==HIREG) printf(" HI");
11182         else if(r==LOREG) printf(" LO");
11183         else printf(" r%d",r);
11184       }
11185     }
11186     printf("\n");
11187 #endif
11188     /*printf(" p32:");
11189     for(r=0;r<=CCREG;r++) {
11190       if((p32[i]>>r)&1) {
11191         if(r==CCREG) printf(" CC");
11192         else if(r==HIREG) printf(" HI");
11193         else if(r==LOREG) printf(" LO");
11194         else printf(" r%d",r);
11195       }
11196     }
11197     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11198     else printf("\n");*/
11199     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11200       #if defined(__i386__) || defined(__x86_64__)
11201       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11202       if(branch_regs[i].dirty&1) printf("eax ");
11203       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11204       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11205       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11206       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11207       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11208       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11209       #endif
11210       #ifdef __arm__
11211       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11212       if(branch_regs[i].dirty&1) printf("r0 ");
11213       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11214       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11215       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11216       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11217       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11218       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11219       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11220       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11221       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11222       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11223       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11224       #endif
11225 #ifndef FORCE32
11226       printf(" 32:");
11227       for(r=0;r<=CCREG;r++) {
11228         if((branch_regs[i].is32>>r)&1) {
11229           if(r==CCREG) printf(" CC");
11230           else if(r==HIREG) printf(" HI");
11231           else if(r==LOREG) printf(" LO");
11232           else printf(" r%d",r);
11233         }
11234       }
11235       printf("\n");
11236 #endif
11237     }
11238   }
11239 #endif // DISASM
11240
11241   /* Pass 8 - Assembly */
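  // Walk the block once more, this time emitting host code: write back or
  // reload registers as dictated by the allocation above, call the
  // per-instruction-type assembler, and record stubs and linker entries as
  // we go.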
11242   linkcount=0;stubcount=0;
11243   ds=0;is_delayslot=0;
11244   cop1_usable=0;
11245   uint64_t is32_pre=0;
11246   u_int dirty_pre=0;
11247   u_int beginning=(u_int)out;
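  // An odd 'addr' requests compilation of just the delay slot of a branch
  // that spans a page boundary (see the SPAN/pagespan handling).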
11248   if((u_int)addr&1) {
11249     ds=1;
11250     pagespan_ds();
11251   }
11252   u_int instr_addr0_override=0;
11253
11254 #ifdef PCSX
11255   if (start == 0x80030000) {
11256     // nasty hack for fastbios thing
11257     // override block entry to this code
11258     instr_addr0_override=(u_int)out;
11259     emit_movimm(start,0);
11260     // abuse io address var as a flag that we
11261     // have already returned here once
11262     emit_readword((int)&address,1);
11263     emit_writeword(0,(int)&pcaddr);
11264     emit_writeword(0,(int)&address);
11265     emit_cmp(0,1);
11266     emit_jne((int)new_dyna_leave);
11267   }
11268 #endif
11269   for(i=0;i<slen;i++)
11270   {
11271     //if(ds) printf("ds: ");
11272     disassemble_inst(i);
11273     if(ds) {
11274       ds=0; // Skip delay slot
11275       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11276       instr_addr[i]=0;
11277     } else {
11278       speculate_register_values(i);
11279       #ifndef DESTRUCTIVE_WRITEBACK
11280       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11281       {
11282         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11283               unneeded_reg[i],unneeded_reg_upper[i]);
11284         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11285               unneeded_reg[i],unneeded_reg_upper[i]);
11286       }
11287       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11288         is32_pre=branch_regs[i].is32;
11289         dirty_pre=branch_regs[i].dirty;
11290       }else{
11291         is32_pre=regs[i].is32;
11292         dirty_pre=regs[i].dirty;
11293       }
11294       #endif
11295       // write back
11296       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11297       {
11298         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11299                       unneeded_reg[i],unneeded_reg_upper[i]);
11300         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11301       }
11302       // branch target entry point
11303       instr_addr[i]=(u_int)out;
11304       assem_debug("<->\n");
11305       // load regs
11306       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11307         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11308       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11309       address_generation(i,&regs[i],regs[i].regmap_entry);
11310       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11311       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11312       {
11313         // Load the delay slot registers if necessary
11314         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11315           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11316         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11317           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11318         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11319           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11320       }
11321       else if(i+1<slen)
11322       {
11323         // Preload registers for following instruction
11324         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11325           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11326             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11327         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11328           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11329             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11330       }
11331       // TODO: if(is_ooo(i)) address_generation(i+1);
11332       if(itype[i]==CJUMP||itype[i]==FJUMP)
11333         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11334       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11335         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11336       if(bt[i]) cop1_usable=0;
11337       // assemble
11338       switch(itype[i]) {
11339         case ALU:
11340           alu_assemble(i,&regs[i]);break;
11341         case IMM16:
11342           imm16_assemble(i,&regs[i]);break;
11343         case SHIFT:
11344           shift_assemble(i,&regs[i]);break;
11345         case SHIFTIMM:
11346           shiftimm_assemble(i,&regs[i]);break;
11347         case LOAD:
11348           load_assemble(i,&regs[i]);break;
11349         case LOADLR:
11350           loadlr_assemble(i,&regs[i]);break;
11351         case STORE:
11352           store_assemble(i,&regs[i]);break;
11353         case STORELR:
11354           storelr_assemble(i,&regs[i]);break;
11355         case COP0:
11356           cop0_assemble(i,&regs[i]);break;
11357         case COP1:
11358           cop1_assemble(i,&regs[i]);break;
11359         case C1LS:
11360           c1ls_assemble(i,&regs[i]);break;
11361         case COP2:
11362           cop2_assemble(i,&regs[i]);break;
11363         case C2LS:
11364           c2ls_assemble(i,&regs[i]);break;
11365         case C2OP:
11366           c2op_assemble(i,&regs[i]);break;
11367         case FCONV:
11368           fconv_assemble(i,&regs[i]);break;
11369         case FLOAT:
11370           float_assemble(i,&regs[i]);break;
11371         case FCOMP:
11372           fcomp_assemble(i,&regs[i]);break;
11373         case MULTDIV:
11374           multdiv_assemble(i,&regs[i]);break;
11375         case MOV:
11376           mov_assemble(i,&regs[i]);break;
11377         case SYSCALL:
11378           syscall_assemble(i,&regs[i]);break;
11379         case HLECALL:
11380           hlecall_assemble(i,&regs[i]);break;
11381         case INTCALL:
11382           intcall_assemble(i,&regs[i]);break;
11383         case UJUMP:
11384           ujump_assemble(i,&regs[i]);ds=1;break;
11385         case RJUMP:
11386           rjump_assemble(i,&regs[i]);ds=1;break;
11387         case CJUMP:
11388           cjump_assemble(i,&regs[i]);ds=1;break;
11389         case SJUMP:
11390           sjump_assemble(i,&regs[i]);ds=1;break;
11391         case FJUMP:
11392           fjump_assemble(i,&regs[i]);ds=1;break;
11393         case SPAN:
11394           pagespan_assemble(i,&regs[i]);break;
11395       }
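      // Flush the ARM literal pool if it is getting full: after an
      // unconditional jump it can be dumped in-line, otherwise a branch over
      // the pool is emitted so execution can continue past it.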
11396       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11397         literal_pool(1024);
11398       else
11399         literal_pool_jumpover(256);
11400     }
11401   }
11402   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11403   // If the block did not end with an unconditional branch,
11404   // add a jump to the next instruction.
11405   if(i>1) {
11406     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11407       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11408       assert(i==slen);
11409       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11410         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11411         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11412           emit_loadreg(CCREG,HOST_CCREG);
11413         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11414       }
11415       else if(!likely[i-2])
11416       {
11417         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11418         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11419       }
11420       else
11421       {
11422         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11423         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11424       }
11425       add_to_linker((int)out,start+i*4,0);
11426       emit_jmp(0);
11427     }
11428   }
11429   else
11430   {
11431     assert(i>0);
11432     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11433     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11434     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11435       emit_loadreg(CCREG,HOST_CCREG);
11436     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11437     add_to_linker((int)out,start+i*4,0);
11438     emit_jmp(0);
11439   }
11440
11441   // TODO: delay slot stubs?
11442   // Stubs
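  // Emit the out-of-line slow paths recorded while assembling the block:
  // memory access handlers, cycle count / interrupt checks (CC_STUB),
  // invalidation of self-modifying code (INVCODE_STUB), coprocessor-unusable
  // exceptions (FP_STUB) and unaligned store handling (STORELR_STUB).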
11443   for(i=0;i<stubcount;i++)
11444   {
11445     switch(stubs[i][0])
11446     {
11447       case LOADB_STUB:
11448       case LOADH_STUB:
11449       case LOADW_STUB:
11450       case LOADD_STUB:
11451       case LOADBU_STUB:
11452       case LOADHU_STUB:
11453         do_readstub(i);break;
11454       case STOREB_STUB:
11455       case STOREH_STUB:
11456       case STOREW_STUB:
11457       case STORED_STUB:
11458         do_writestub(i);break;
11459       case CC_STUB:
11460         do_ccstub(i);break;
11461       case INVCODE_STUB:
11462         do_invstub(i);break;
11463       case FP_STUB:
11464         do_cop1stub(i);break;
11465       case STORELR_STUB:
11466         do_unalignedwritestub(i);break;
11467     }
11468   }
11469
11470   if (instr_addr0_override)
11471     instr_addr[0] = instr_addr0_override;
11472
11473   /* Pass 9 - Linker */
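  // Resolve the branches recorded by add_to_linker(): internal branches are
  // patched to the already-emitted target instruction; external branches
  // jump straight to a previously compiled block if check_addr() finds one,
  // and otherwise go through the extjump stub emitted here, which looks the
  // target up at run time.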
11474   for(i=0;i<linkcount;i++)
11475   {
11476     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11477     literal_pool(64);
11478     if(!link_addr[i][2])
11479     {
11480       void *stub=out;
11481       void *addr=check_addr(link_addr[i][1]);
11482       emit_extjump(link_addr[i][0],link_addr[i][1]);
11483       if(addr) {
11484         set_jump_target(link_addr[i][0],(int)addr);
11485         add_link(link_addr[i][1],stub);
11486       }
11487       else set_jump_target(link_addr[i][0],(int)stub);
11488     }
11489     else
11490     {
11491       // Internal branch
11492       int target=(link_addr[i][1]-start)>>2;
11493       assert(target>=0&&target<slen);
11494       assert(instr_addr[target]);
11495       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11496       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11497       //#else
11498       set_jump_target(link_addr[i][0],instr_addr[target]);
11499       //#endif
11500     }
11501   }
11502   // External Branch Targets (jump_in)
11503   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11504   for(i=0;i<slen;i++)
11505   {
11506     if(bt[i]||i==0)
11507     {
11508       if(instr_addr[i]) // TODO - delay slots (=null)
11509       {
11510         u_int vaddr=start+i*4;
11511         u_int page=get_page(vaddr);
11512         u_int vpage=get_vpage(vaddr);
11513         literal_pool(256);
11514         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11515 #ifndef FORCE32
11516         if(!requires_32bit[i])
11517 #else
11518         if(1)
11519 #endif
11520         {
11521           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11522           assem_debug("jump_in: %x\n",start+i*4);
11523           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11524           int entry_point=do_dirty_stub(i);
11525           ll_add(jump_in+page,vaddr,(void *)entry_point);
11526           // If there was an existing entry in the hash table,
11527           // replace it with the new address.
11528           // Don't add new entries.  We'll insert the
11529           // ones that actually get used in check_addr().
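          // The hash table is two-way: ht_bin[0]/ht_bin[1] hold the most
          // recent vaddr/entry-point pair, ht_bin[2]/ht_bin[3] the older one
          // (see the expiry code in pass 10 below).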
11530           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11531           if(ht_bin[0]==vaddr) {
11532             ht_bin[1]=entry_point;
11533           }
11534           if(ht_bin[2]==vaddr) {
11535             ht_bin[3]=entry_point;
11536           }
11537         }
11538         else
11539         {
11540           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11541           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11542           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11543           //int entry_point=(int)out;
11544           ////assem_debug("entry_point: %x\n",entry_point);
11545           //load_regs_entry(i);
11546           //if(entry_point==(int)out)
11547           //  entry_point=instr_addr[i];
11548           //else
11549           //  emit_jmp(instr_addr[i]);
11550           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11551           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11552           int entry_point=do_dirty_stub(i);
11553           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11554         }
11555       }
11556     }
11557   }
11558   // Write out the literal pool if necessary
11559   literal_pool(0);
11560   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11561   // Align code
11562   if(((u_int)out)&7) emit_addnop(13);
11563   #endif
11564   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11565   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
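  // Keep a copy of the original MIPS code in the shadow buffer so the dirty
  // stubs can later compare it with RAM and detect whether the block has
  // been modified.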
11566   memcpy(copy,source,slen*4);
11567   copy+=slen*4;
11568   
11569   #ifdef __arm__
11570   __clear_cache((void *)beginning,out);
11571   #endif
11572   
11573   // If we're within 256K of the end of the buffer,
11574   // start over from the beginning. (Is 256K enough?)
11575   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11576   
11577   // Trap writes to any of the pages we compiled
11578   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11579     invalid_code[i]=0;
11580 #ifndef DISABLE_TLB
11581     memory_map[i]|=0x40000000;
11582     if((signed int)start>=(signed int)0xC0000000) {
11583       assert(using_tlb);
11584       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11585       invalid_code[j]=0;
11586       memory_map[j]|=0x40000000;
11587       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11588     }
11589 #endif
11590   }
11591   inv_code_start=inv_code_end=~0;
11592 #ifdef PCSX
11593   // for PCSX we need to mark all mirrors too
11594   if(get_page(start)<(RAM_SIZE>>12))
11595     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11596       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11597       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11598       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11599 #endif
11600   
11601   /* Pass 10 - Free memory by expiring oldest blocks */
11602   
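  // Sweep 'expirep' forward until it is roughly a quarter of the translation
  // cache ahead of the output pointer, unlinking anything that lives in the
  // region about to be reused: jump_in/jump_dirty entries, pointers into it
  // from other blocks (jump_out), and stale hash table entries.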
11603   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11604   while(expirep!=end)
11605   {
11606     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11607     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11608     inv_debug("EXP: Phase %d\n",expirep);
11609     switch((expirep>>11)&3)
11610     {
11611       case 0:
11612         // Clear jump_in and jump_dirty
11613         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11614         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11615         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11616         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11617         break;
11618       case 1:
11619         // Clear pointers
11620         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11621         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11622         break;
11623       case 2:
11624         // Clear hash table
11625         for(i=0;i<32;i++) {
11626           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11627           if((ht_bin[3]>>shift)==(base>>shift) ||
11628              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11629             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11630             ht_bin[2]=ht_bin[3]=-1;
11631           }
11632           if((ht_bin[1]>>shift)==(base>>shift) ||
11633              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11634             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11635             ht_bin[0]=ht_bin[2];
11636             ht_bin[1]=ht_bin[3];
11637             ht_bin[2]=ht_bin[3]=-1;
11638           }
11639         }
11640         break;
11641       case 3:
11642         // Clear jump_out
11643         #ifdef __arm__
11644         if((expirep&2047)==0) 
11645           do_clear_cache();
11646         #endif
11647         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11648         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11649         break;
11650     }
11651     expirep=(expirep+1)&65535;
11652   }
11653   return 0;
11654 }
11655
11656 // vim:shiftwidth=2:expandtab