drc: remove yet more n64 stuff
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
#include <stdlib.h>
#include <stdint.h> //include for uint64_t
#include <string.h> // strerror
#include <assert.h>
#include <errno.h>
#include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t wasdirty;
87   uint64_t dirty;
88   uint64_t u;
89   u_int wasconst;
90   u_int isconst;
91   u_int loadedconst;             // host regs that have constants loaded
92   u_int waswritten;              // MIPS regs that were used as store base before
93 };
94
95 // note: asm depends on this layout
96 struct ll_entry
97 {
98   u_int vaddr;
99   u_int reg_sv_flags;
100   void *addr;
101   struct ll_entry *next;
102 };
103
104 struct ht_entry
105 {
106   u_int vaddr[2];
107   void *tcaddr[2];
108 };
109
110 struct code_stub
111 {
112   enum stub_type type;
113   void *addr;
114   void *retaddr;
115   u_int a;
116   uintptr_t b;
117   uintptr_t c;
118   u_int d;
119   u_int e;
120 };
121
122 struct link_entry
123 {
124   void *addr;
125   u_int target;
126   u_int ext;
127 };
128
129   // used by asm:
130   u_char *out;
131   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
132   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
133   struct ll_entry *jump_dirty[4096];
134
135   static struct ll_entry *jump_out[4096];
136   static u_int start;
137   static u_int *source;
138   static char insn[MAXBLOCK][10];
139   static u_char itype[MAXBLOCK];
140   static u_char opcode[MAXBLOCK];
141   static u_char opcode2[MAXBLOCK];
142   static u_char bt[MAXBLOCK];
143   static u_char rs1[MAXBLOCK];
144   static u_char rs2[MAXBLOCK];
145   static u_char rt1[MAXBLOCK];
146   static u_char rt2[MAXBLOCK];
147   static u_char us1[MAXBLOCK];
148   static u_char us2[MAXBLOCK];
149   static u_char dep1[MAXBLOCK];
150   static u_char dep2[MAXBLOCK];
151   static u_char lt1[MAXBLOCK];
152   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
153   static uint64_t gte_rt[MAXBLOCK];
154   static uint64_t gte_unneeded[MAXBLOCK];
155   static u_int smrv[32]; // speculated MIPS register values
156   static u_int smrv_strong; // mask or regs that are likely to have correct values
157   static u_int smrv_weak; // same, but somewhat less likely
158   static u_int smrv_strong_next; // same, but after current insn executes
159   static u_int smrv_weak_next;
160   static int imm[MAXBLOCK];
161   static u_int ba[MAXBLOCK];
162   static char likely[MAXBLOCK];
163   static char is_ds[MAXBLOCK];
164   static char ooo[MAXBLOCK];
165   static uint64_t unneeded_reg[MAXBLOCK];
166   static uint64_t branch_unneeded_reg[MAXBLOCK];
167   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
168   static uint64_t current_constmap[HOST_REGS];
169   static uint64_t constmap[MAXBLOCK][HOST_REGS];
170   static struct regstat regs[MAXBLOCK];
171   static struct regstat branch_regs[MAXBLOCK];
172   static signed char minimum_free_regs[MAXBLOCK];
173   static u_int needed_reg[MAXBLOCK];
174   static u_int wont_dirty[MAXBLOCK];
175   static u_int will_dirty[MAXBLOCK];
176   static int ccadj[MAXBLOCK];
177   static int slen;
178   static void *instr_addr[MAXBLOCK];
179   static struct link_entry link_addr[MAXBLOCK];
180   static int linkcount;
181   static struct code_stub stubs[MAXBLOCK*3];
182   static int stubcount;
183   static u_int literals[1024][2];
184   static int literalcount;
185   static int is_delayslot;
186   static char shadow[1048576]  __attribute__((aligned(16)));
187   static void *copy;
188   static int expirep;
189   static u_int stop_after_jal;
190 #ifndef RAM_FIXED
191   static uintptr_t ram_offset;
192 #else
193   static const uintptr_t ram_offset=0;
194 #endif
195
196   int new_dynarec_hacks;
197   int new_dynarec_did_compile;
198   extern u_char restore_candidate[512];
199   extern int cycle_count;
200
201   /* registers that may be allocated */
202   /* 1-31 gpr */
203 #define HIREG 32 // hi
204 #define LOREG 33 // lo
205 //#define FSREG 34 // FPU status (FCSR)
206 #define CSREG 35 // Coprocessor status
207 #define CCREG 36 // Cycle count
208 #define INVCP 37 // Pointer to invalid_code
209 //#define MMREG 38 // Pointer to memory_map
210 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
211 #define TEMPREG 40
212 #define FTEMP 40 // FPU temporary register
213 #define PTEMP 41 // Prefetch temporary register
214 //#define TLREG 42 // TLB mapping offset
215 #define RHASH 43 // Return address hash
216 #define RHTBL 44 // Return address hash table address
217 #define RTEMP 45 // JR/JALR address register
218 #define MAXREG 45
219 #define AGEN1 46 // Address generation temporary register
220 //#define AGEN2 47 // Address generation temporary register
221 //#define MGEN1 48 // Maptable address generation temporary register
222 //#define MGEN2 49 // Maptable address generation temporary register
223 #define BTREG 50 // Branch target temporary register
224
225   /* instruction types */
226 #define NOP 0     // No operation
227 #define LOAD 1    // Load
228 #define STORE 2   // Store
229 #define LOADLR 3  // Unaligned load
230 #define STORELR 4 // Unaligned store
231 #define MOV 5     // Move
232 #define ALU 6     // Arithmetic/logic
233 #define MULTDIV 7 // Multiply/divide
234 #define SHIFT 8   // Shift by register
235 #define SHIFTIMM 9// Shift by immediate
236 #define IMM16 10  // 16-bit immediate
237 #define RJUMP 11  // Unconditional jump to register
238 #define UJUMP 12  // Unconditional jump
239 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
240 #define SJUMP 14  // Conditional branch (regimm format)
241 #define COP0 15   // Coprocessor 0
242 #define COP1 16   // Coprocessor 1
243 #define C1LS 17   // Coprocessor 1 load/store
244 //#define FJUMP 18  // Conditional branch (floating point)
245 //#define FLOAT 19  // Floating point unit
246 //#define FCONV 20  // Convert integer to float
247 //#define FCOMP 21  // Floating point compare (sets FSREG)
248 #define SYSCALL 22// SYSCALL
249 #define OTHER 23  // Other
250 #define SPAN 24   // Branch/delay slot spans 2 pages
251 #define NI 25     // Not implemented
252 #define HLECALL 26// PCSX fake opcodes for HLE
253 #define COP2 27   // Coprocessor 2 move
254 #define C2LS 28   // Coprocessor 2 load/store
255 #define C2OP 29   // Coprocessor 2 operation
256 #define INTCALL 30// Call interpreter to handle rare corner cases
257
258   /* branch codes */
259 #define TAKEN 1
260 #define NOTTAKEN 2
261 #define NULLDS 3
262
263 // asm linkage
264 int new_recompile_block(int addr);
265 void *get_addr_ht(u_int vaddr);
266 void invalidate_block(u_int block);
267 void invalidate_addr(u_int addr);
268 void remove_hash(int vaddr);
269 void dyna_linker();
270 void dyna_linker_ds();
271 void verify_code();
272 void verify_code_vm();
273 void verify_code_ds();
274 void cc_interrupt();
275 void fp_exception();
276 void fp_exception_ds();
277 void jump_syscall_hle();
278 void jump_hlecall();
279 void jump_intcall();
280 void new_dyna_leave();
281
282 // Needed by assembler
283 static void wb_register(signed char r,signed char regmap[],uint64_t dirty);
284 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty);
285 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr);
286 static void load_all_regs(signed char i_regmap[]);
287 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
288 static void load_regs_entry(int t);
289 static void load_all_consts(signed char regmap[],u_int dirty,int i);
290
291 static int verify_dirty(u_int *ptr);
292 static int get_final_value(int hr, int i, int *value);
293 static void add_stub(enum stub_type type, void *addr, void *retaddr,
294   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
295 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
296   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
297 static void add_to_linker(void *addr, u_int target, int ext);
298
// Toggle the translation cache between writable and executable on
// platforms that enforce W^X (NO_WRITE_EXEC).  is_x nonzero makes
// [start,end) executable, zero makes it writable.
// Fix: the file used strerror() without including <string.h>
// (implicit declaration); the include is added at the top of the file.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // mprotect() requires a page-aligned start address
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
318
// Open the translation-cache range [start,end) for writing
// (counterpart of end_tcache_write()).
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0 /* writable, not executable */);
}
323
// Finish writing translated code to [start,end): flush/invalidate the
// instruction cache (platform-specific) and make the range executable
// again via mprotect_w_x().
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  __clear_cache(start, end);
  #endif
  (void)len; // not used on every platform branch
#endif

  mprotect_w_x(start, end, 1);
}
344
345 static void *start_block(void)
346 {
347   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
348   if (end > translation_cache + (1<<TARGET_SIZE_2))
349     end = translation_cache + (1<<TARGET_SIZE_2);
350   start_tcache_write(out, end);
351   return out;
352 }
353
354 static void end_block(void *start)
355 {
356   end_tcache_write(start, out);
357 }
358
359 //#define DEBUG_CYCLE_COUNT 1
360
361 #define NO_CYCLE_PENALTY_THR 12
362
int cycle_multiplier; // 100 for 1.0

// Scale a cycle count by cycle_multiplier (a percentage), rounding
// the magnitude to nearest: +50 bias for positive x, -50 for negative.
static int CLOCK_ADJUST(int x)
{
  int sign = (x < 0) ? -1 : 1;
  return (x * cycle_multiplier + sign * 50) / 100;
}
370
371 static u_int get_page(u_int vaddr)
372 {
373   u_int page=vaddr&~0xe0000000;
374   if (page < 0x1000000)
375     page &= ~0x0e00000; // RAM mirrors
376   page>>=12;
377   if(page>2048) page=2048+(page&2047);
378   return page;
379 }
380
381 // no virtual mem in PCSX
382 static u_int get_vpage(u_int vaddr)
383 {
384   return get_page(vaddr);
385 }
386
387 static struct ht_entry *hash_table_get(u_int vaddr)
388 {
389   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
390 }
391
392 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
393 {
394   ht_bin->vaddr[1] = ht_bin->vaddr[0];
395   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
396   ht_bin->vaddr[0] = vaddr;
397   ht_bin->tcaddr[0] = tcaddr;
398 }
399
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Returns nonzero when tcaddr is far enough ahead of the output
// pointer (distance scaled into the top bits so cache wrap-around is
// handled by the unsigned overflow) that it won't be expired soon.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
406
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Lookup order: jump_in (clean blocks), then jump_dirty (blocks whose
// source pages were written); a dirty block is only reused after
// verify_dirty() confirms its source is unmodified.  If nothing is
// found the block is compiled on the spot.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Cache the translation in the hash table for next time
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        // Flag the page so the block can be promoted back to jump_in
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
461 // Look up address in hash table first
462 void *get_addr_ht(u_int vaddr)
463 {
464   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
465   const struct ht_entry *ht_bin = hash_table_get(vaddr);
466   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
467   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
468   return get_addr(vaddr);
469 }
470
471 void clear_all_regs(signed char regmap[])
472 {
473   int hr;
474   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
475 }
476
477 signed char get_reg(signed char regmap[],int r)
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
481   return -1;
482 }
483
484 // Find a register that is available for two consecutive cycles
485 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
486 {
487   int hr;
488   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
489   return -1;
490 }
491
492 int count_free_regs(signed char regmap[])
493 {
494   int count=0;
495   int hr;
496   for(hr=0;hr<HOST_REGS;hr++)
497   {
498     if(hr!=EXCLUDE_REG) {
499       if(regmap[hr]<0) count++;
500     }
501   }
502   return count;
503 }
504
505 void dirty_reg(struct regstat *cur,signed char reg)
506 {
507   int hr;
508   if(!reg) return;
509   for (hr=0;hr<HOST_REGS;hr++) {
510     if((cur->regmap[hr]&63)==reg) {
511       cur->dirty|=1<<hr;
512     }
513   }
514 }
515
516 void set_const(struct regstat *cur,signed char reg,uint64_t value)
517 {
518   int hr;
519   if(!reg) return;
520   for (hr=0;hr<HOST_REGS;hr++) {
521     if(cur->regmap[hr]==reg) {
522       cur->isconst|=1<<hr;
523       current_constmap[hr]=value;
524     }
525     else if((cur->regmap[hr]^64)==reg) {
526       cur->isconst|=1<<hr;
527       current_constmap[hr]=value>>32;
528     }
529   }
530 }
531
532 void clear_const(struct regstat *cur,signed char reg)
533 {
534   int hr;
535   if(!reg) return;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if((cur->regmap[hr]&63)==reg) {
538       cur->isconst&=~(1<<hr);
539     }
540   }
541 }
542
543 int is_const(struct regstat *cur,signed char reg)
544 {
545   int hr;
546   if(reg<0) return 0;
547   if(!reg) return 1;
548   for (hr=0;hr<HOST_REGS;hr++) {
549     if((cur->regmap[hr]&63)==reg) {
550       return (cur->isconst>>hr)&1;
551     }
552   }
553   return 0;
554 }
555 uint64_t get_const(struct regstat *cur,signed char reg)
556 {
557   int hr;
558   if(!reg) return 0;
559   for (hr=0;hr<HOST_REGS;hr++) {
560     if(cur->regmap[hr]==reg) {
561       return current_constmap[hr];
562     }
563   }
564   SysPrintf("Unknown constant in r%d\n",reg);
565   exit(1);
566 }
567
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return hsn[reg] holds the distance (in instructions) to the next
// use of that guest register; smaller means needed sooner.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Determine how far ahead we may scan without leaving the block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // Scan backwards so nearer uses overwrite farther ones
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      hsn[CCREG]=j;
      b=j; // remember the nearest branch in range
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Uses at the branch target count as farther away (+b+2)
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
656
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if guest register r is read within roughly the next 9
// instructions of this block, 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to the next read; 10 = "not seen"

  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Limit the scan: stop at block end, unconditional jumps, and
  // syscall-like instructions
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Scan backwards; the nearest read wins, but a point where r is
  // unneeded resets the distance
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
719
// Try to match register allocations at the end of a loop with those
// at the beginning
// If a backward branch within range targets earlier in this block and
// guest register r is allocated at the target, return that host
// register; otherwise return hr unchanged.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Determine how far ahead we may scan within this block
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  k=0;
  if(i>0){
    // Also consider the branch just before i (delay-slot case)
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    assert(r < 64);
    if((unneeded_reg[i+k]>>r)&1) return hr; // r is dead here; keep hr
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
    {
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        // Backward branch: adopt the allocation at its target
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
761
762
763 // Allocate every register, preserving source/target regs
764 void alloc_all(struct regstat *cur,int i)
765 {
766   int hr;
767
768   for(hr=0;hr<HOST_REGS;hr++) {
769     if(hr!=EXCLUDE_REG) {
770       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
771          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
772       {
773         cur->regmap[hr]=-1;
774         cur->dirty&=~(1<<hr);
775       }
776       // Don't need zeros
777       if((cur->regmap[hr]&63)==0)
778       {
779         cur->regmap[hr]=-1;
780         cur->dirty&=~(1<<hr);
781       }
782     }
783   }
784 }
785
786 #ifdef __i386__
787 #include "assem_x86.c"
788 #endif
789 #ifdef __x86_64__
790 #include "assem_x64.c"
791 #endif
792 #ifdef __arm__
793 #include "assem_arm.c"
794 #endif
795
796 // Add virtual address mapping to linked list
797 void ll_add(struct ll_entry **head,int vaddr,void *addr)
798 {
799   struct ll_entry *new_entry;
800   new_entry=malloc(sizeof(struct ll_entry));
801   assert(new_entry!=NULL);
802   new_entry->vaddr=vaddr;
803   new_entry->reg_sv_flags=0;
804   new_entry->addr=addr;
805   new_entry->next=*head;
806   *head=new_entry;
807 }
808
809 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
810 {
811   ll_add(head,vaddr,addr);
812   (*head)->reg_sv_flags=reg_sv_flags;
813 }
814
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  // Fast path: hash table hit, provided the block is not near expiry
  // and its code is still clean
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  // Slow path: search the jump_in list for this page
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
860
// Drop vaddr from its hash table bin.  Slot 1 is cleared first so
// that, if slot 0 also matches, the (possibly cleared) slot 1 can be
// promoted into slot 0, keeping slot 0 filled first.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    // promote slot 1 into slot 0, then clear slot 1
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
876
877 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
878 {
879   struct ll_entry *next;
880   while(*head) {
881     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
882        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
883     {
884       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
885       remove_hash((*head)->vaddr);
886       next=(*head)->next;
887       free(*head);
888       *head=next;
889     }
890     else
891     {
892       head=&((*head)->next);
893     }
894   }
895 }
896
897 // Remove all entries from linked list
898 void ll_clear(struct ll_entry **head)
899 {
900   struct ll_entry *cur;
901   struct ll_entry *next;
902   if((cur=*head)) {
903     *head=0;
904     while(cur) {
905       next=cur->next;
906       free(cur);
907       cur=next;
908     }
909   }
910 }
911
// Dereference the pointers and remove if it matches
// For each entry, read where its external jump currently points
// (get_pointer); if that target lies in the region being expired,
// re-patch the jump via set_jump_target so it no longer references
// the expiring code.
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      #ifdef __arm__
        // the patched instruction's cache line must be flushed later
        mark_clear_cache(host_addr);
      #endif
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
931
// This is called when we write to a compiled block (see do_invstub)
// Frees every block compiled from this page (jump_in) and re-patches
// every jump that was linked into this page (jump_out).
void invalidate_page(u_int page)
{
  struct ll_entry *head;
  struct ll_entry *next;
  // Drop all blocks compiled from this page
  head=jump_in[page];
  jump_in[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: %x\n",head->vaddr);
    remove_hash(head->vaddr);
    next=head->next;
    free(head);
    head=next;
  }
  // Re-patch external jumps that targeted this page
  head=jump_out[page];
  jump_out[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
    void *host_addr=find_extjump_insn(head->addr);
    #ifdef __arm__
      mark_clear_cache(host_addr);
    #endif
    set_jump_target(host_addr, head->addr);
    next=head->next;
    free(head);
    head=next;
  }
}
960
// Invalidate the lookup tables for the page containing `block` plus
// the adjacent pages [first, last] that overlapping blocks were
// compiled from (all arguments are page numbers).
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #ifdef __arm__
    // flush caches for all the jumps re-patched above
    do_clear_cache();
  #endif

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
987
// Invalidate the page containing `block` (a vaddr>>12 page number).
// Scans jump_dirty for blocks overlapping this page and widens the
// invalidated range [first, last] to cover every RAM page those
// blocks were compiled from.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      // source-code bounds of this compiled block
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        // widen [first,last] to all RAM pages the block spans
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1015
// Invalidate translated code after a write to `addr`.  For RAM
// addresses, scans the dirty blocks on this page (and the previous
// one, since blocks can span pages): if any block covers the written
// address its whole range is invalidated; otherwise only the
// inv_code_start/inv_code_end "known safe" window is updated so
// future writes in it can be ignored cheaply.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        // convert host-side block bounds back to guest addresses
        get_bounds(head->addr, &start_h, &end_h);
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // block covers the written address: remember its full extent
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // shrink the safe window so it stays free of compiled code
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // non-RAM: just invalidate the single page
  invalidate_block(addr>>12);
}
1072
1073 // This is called when loading a save state.
1074 // Anything could have changed, so invalidate everything.
1075 void invalidate_all_pages()
1076 {
1077   u_int page;
1078   for(page=0;page<4096;page++)
1079     invalidate_page(page);
1080   for(page=0;page<1048576;page++)
1081     if(!invalid_code[page]) {
1082       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1083       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1084     }
1085   #ifdef USE_MINI_HT
1086   memset(mini_ht,-1,sizeof(mini_ht));
1087   #endif
1088 }
1089
// Add an entry to jump_out after making a link
// (records the patched branch at 'src' so it can be unlinked when the
// target page 'vaddr' is invalidated)
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
  // NOTE(review): this looks like a sanity check for an ARM
  // "ldr reg,[pc,#imm]" encoding at src+4 -- confirm against the ARM
  // backend's link-stub layout.
  int *ptr=(int *)(src+4);
  assert((*ptr&0x0fff0000)==0x059f0000);
  (void)ptr;
  ll_add(jump_out+page,vaddr,src);
  //void *ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %p\n",ptr);
}
1102
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        if(verify_dirty(head->addr)) {
          // Compiled output still matches the source bytes
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            // Reject if any source page the block spans was invalidated
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Source outside RAM mirror range: never restore
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              // Patch the hash table so lookups hit the clean copy directly
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1151
1152 static void mov_alloc(struct regstat *current,int i)
1153 {
1154   // Note: Don't need to actually alloc the source registers
1155   //alloc_reg(current,i,rs1[i]);
1156   alloc_reg(current,i,rt1[i]);
1157
1158   clear_const(current,rs1[i]);
1159   clear_const(current,rt1[i]);
1160   dirty_reg(current,rt1[i]);
1161 }
1162
1163 static void shiftimm_alloc(struct regstat *current,int i)
1164 {
1165   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1166   {
1167     if(rt1[i]) {
1168       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1169       else lt1[i]=rs1[i];
1170       alloc_reg(current,i,rt1[i]);
1171       dirty_reg(current,rt1[i]);
1172       if(is_const(current,rs1[i])) {
1173         int v=get_const(current,rs1[i]);
1174         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1175         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1176         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1177       }
1178       else clear_const(current,rt1[i]);
1179     }
1180   }
1181   else
1182   {
1183     clear_const(current,rs1[i]);
1184     clear_const(current,rt1[i]);
1185   }
1186
1187   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1188   {
1189     assert(0);
1190   }
1191   if(opcode2[i]==0x3c) // DSLL32
1192   {
1193     assert(0);
1194   }
1195   if(opcode2[i]==0x3e) // DSRL32
1196   {
1197     assert(0);
1198   }
1199   if(opcode2[i]==0x3f) // DSRA32
1200   {
1201     assert(0);
1202   }
1203 }
1204
1205 static void shift_alloc(struct regstat *current,int i)
1206 {
1207   if(rt1[i]) {
1208     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1209     {
1210       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1211       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1212       alloc_reg(current,i,rt1[i]);
1213       if(rt1[i]==rs2[i]) {
1214         alloc_reg_temp(current,i,-1);
1215         minimum_free_regs[i]=1;
1216       }
1217     } else { // DSLLV/DSRLV/DSRAV
1218       assert(0);
1219     }
1220     clear_const(current,rs1[i]);
1221     clear_const(current,rs2[i]);
1222     clear_const(current,rt1[i]);
1223     dirty_reg(current,rt1[i]);
1224   }
1225 }
1226
1227 static void alu_alloc(struct regstat *current,int i)
1228 {
1229   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1230     if(rt1[i]) {
1231       if(rs1[i]&&rs2[i]) {
1232         alloc_reg(current,i,rs1[i]);
1233         alloc_reg(current,i,rs2[i]);
1234       }
1235       else {
1236         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1237         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1238       }
1239       alloc_reg(current,i,rt1[i]);
1240     }
1241   }
1242   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1243     if(rt1[i]) {
1244       alloc_reg(current,i,rs1[i]);
1245       alloc_reg(current,i,rs2[i]);
1246       alloc_reg(current,i,rt1[i]);
1247     }
1248   }
1249   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1250     if(rt1[i]) {
1251       if(rs1[i]&&rs2[i]) {
1252         alloc_reg(current,i,rs1[i]);
1253         alloc_reg(current,i,rs2[i]);
1254       }
1255       else
1256       {
1257         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1258         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1259       }
1260       alloc_reg(current,i,rt1[i]);
1261     }
1262   }
1263   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1264     assert(0);
1265   }
1266   clear_const(current,rs1[i]);
1267   clear_const(current,rs2[i]);
1268   clear_const(current,rt1[i]);
1269   dirty_reg(current,rt1[i]);
1270 }
1271
1272 static void imm16_alloc(struct regstat *current,int i)
1273 {
1274   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1275   else lt1[i]=rs1[i];
1276   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1277   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1278     assert(0);
1279   }
1280   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rt1[i]);
1283   }
1284   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1285     if(is_const(current,rs1[i])) {
1286       int v=get_const(current,rs1[i]);
1287       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1288       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1289       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1290     }
1291     else clear_const(current,rt1[i]);
1292   }
1293   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1294     if(is_const(current,rs1[i])) {
1295       int v=get_const(current,rs1[i]);
1296       set_const(current,rt1[i],v+imm[i]);
1297     }
1298     else clear_const(current,rt1[i]);
1299   }
1300   else {
1301     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1302   }
1303   dirty_reg(current,rt1[i]);
1304 }
1305
// Register allocation for load instructions (LB/LH/LW/LBU/LHU/LWL/LWR).
static void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    // Destination is live: allocate it
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      // 64-bit loads do not exist on the PSX CPU
      assert(0);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      assert(0);
    }
  }
}
1348
// Register allocation for store instructions (SB/SH/SW/SWL/SWR).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    assert(0); // not present on the PSX CPU
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1369
// Register allocation for COP1 loads/stores (LWC1/SWC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    assert(0);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike c2ls_alloc this does not set minimum_free_regs[i];
  // confirm that is intentional.
}
1388
// Register allocation for GTE (COP2) loads/stores (LWC2/SWC2).
void c2ls_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,FTEMP);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1403
1404 #ifndef multdiv_alloc
1405 void multdiv_alloc(struct regstat *current,int i)
1406 {
1407   //  case 0x18: MULT
1408   //  case 0x19: MULTU
1409   //  case 0x1A: DIV
1410   //  case 0x1B: DIVU
1411   //  case 0x1C: DMULT
1412   //  case 0x1D: DMULTU
1413   //  case 0x1E: DDIV
1414   //  case 0x1F: DDIVU
1415   clear_const(current,rs1[i]);
1416   clear_const(current,rs2[i]);
1417   if(rs1[i]&&rs2[i])
1418   {
1419     if((opcode2[i]&4)==0) // 32-bit
1420     {
1421       current->u&=~(1LL<<HIREG);
1422       current->u&=~(1LL<<LOREG);
1423       alloc_reg(current,i,HIREG);
1424       alloc_reg(current,i,LOREG);
1425       alloc_reg(current,i,rs1[i]);
1426       alloc_reg(current,i,rs2[i]);
1427       dirty_reg(current,HIREG);
1428       dirty_reg(current,LOREG);
1429     }
1430     else // 64-bit
1431     {
1432       assert(0);
1433     }
1434   }
1435   else
1436   {
1437     // Multiply by zero is zero.
1438     // MIPS does not have a divide by zero exception.
1439     // The result is undefined, we return zero.
1440     alloc_reg(current,i,HIREG);
1441     alloc_reg(current,i,LOREG);
1442     dirty_reg(current,HIREG);
1443     dirty_reg(current,LOREG);
1444   }
1445 }
1446 #endif
1447
// Register allocation for COP0 instructions (MFC0/MTC0 and the
// privileged group).  These can have wide side effects, so everything
// is spilled via alloc_all().
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      // MTC0 with r0 as source: materialize the zero register
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  // Nothing may be cached across a COP0 op
  minimum_free_regs[i]=HOST_REGS;
}
1480
// Register allocation for COP1/COP2 register moves (MFCz/CFCz/MTCz/CTCz).
static void cop12_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
    }
    else {
      // Source is r0: allow and allocate the zero register
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
    alloc_reg_temp(current,i,-1);
  }
  minimum_free_regs[i]=1;
}
1507
// Register allocation for GTE (COP2) arithmetic operations:
// only a single scratch host register is required.
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1512
// Register allocation for SYSCALL/BREAK: control leaves the block,
// so the cycle counter is materialized and all registers are spilled.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  minimum_free_regs[i]=HOST_REGS;
  current->isconst=0;
}
1521
// Dispatch register allocation for the instruction in a branch delay slot
// based on its decoded type.  A jump in the delay slot is invalid; when one
// is seen, speculative precompilation is disabled rather than crashing.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop12_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
1580
// Special case where a branch and delay slot span two pages in virtual memory
// (everything is spilled and re-allocated per the branch type, since the
// continuation cannot be compiled into the same block)
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // JAL writes the return address into r31
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      // JALR's link register
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
  }
  //else ...
}
1616
1617 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1618   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1619 {
1620   assert(a < ARRAY_SIZE(stubs));
1621   stubs[stubcount].type = type;
1622   stubs[stubcount].addr = addr;
1623   stubs[stubcount].retaddr = retaddr;
1624   stubs[stubcount].a = a;
1625   stubs[stubcount].b = b;
1626   stubs[stubcount].c = c;
1627   stubs[stubcount].d = d;
1628   stubs[stubcount].e = e;
1629   stubcount++;
1630 }
1631
1632 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1633   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1634 {
1635   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1636 }
1637
1638 // Write out a single register
1639 static void wb_register(signed char r,signed char regmap[],uint64_t dirty)
1640 {
1641   int hr;
1642   for(hr=0;hr<HOST_REGS;hr++) {
1643     if(hr!=EXCLUDE_REG) {
1644       if((regmap[hr]&63)==r) {
1645         if((dirty>>hr)&1) {
1646           assert(regmap[hr]<64);
1647           emit_storereg(r,hr);
1648         }
1649       }
1650     }
1651   }
1652 }
1653
1654 void rlist()
1655 {
1656   int i;
1657   printf("TRACE: ");
1658   for(i=0;i<32;i++)
1659     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1660   printf("\n");
1661 }
1662
// Emit host code for three-register ALU operations (ADD/SUB group,
// SLT/SLTU, and the logical ops), with special cases when either
// source is r0.  Emission order matters; do not reorder emit_* calls.
void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // rs2 is r0: result is rs1 (SUB rs1,r0 == rs1)
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else if(rs2[i]) {
          // rs1 is r0: result is rs2, negated for SUB
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t);
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    assert(0);
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s2l,t;
      {
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t); // sign bit == (rx < 0)
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            // General two-register compare
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s2l,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      {
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            // At least one source is r0
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl); // NOR r0,r0 == ~0
            }
          }
        }
      }
    }
  }
}
1801
// Emit host code for 16-bit-immediate instructions (LUI/ADDI(U)/
// SLTI(U)/ANDI/ORI/XORI).  Skips emission when the destination is
// already known-constant; folds immediates when the source was
// constant at compile time.  Emission order matters; do not reorder.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // Source not in a host register: reload into t first
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                emit_movimm(constmap[i][s]+imm[i],t); // fold constant
            }
          }
        }
      } else {
        // ADDI with r0 source is a load-immediate
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          emit_movimm(imm[i],tl);
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                emit_movimm(constmap[i][sl]&imm[i],tl); // fold constant
            }
          }
          else
            emit_zeroreg(tl); // ANDI with r0 is zero
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(th>=0) {
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl); // fold constant
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl); // fold constant
              }
            }
          }
          else {
            // ORI/XORI with r0 is a load-immediate
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
1976
// Emit host code for shift-by-immediate instructions (SLL/SRL/SRA).
// The 64-bit variants cannot occur on the PSX CPU.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          // Shift of r0 is always zero
          emit_zeroreg(t);
        }
        else
        {
          // Reload the source into t if it's not in a host register
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
2033
#ifndef shift_assemble
// Fallback for architectures that do not provide shift_assemble.
// Report the missing port on stderr (not stdout) and abort, since no
// correct code can be generated.
void shift_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2041
// Assemble load instructions (LB/LH/LW/LBU/LHU; LWU/LD assert since
// 64-bit loads do not occur in PSX code).  Emits the fast RAM path
// inline and attaches a stub (add_stub_r) for the slow/I/O path, or an
// inline read stub when the address is a known non-RAM constant.
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastload_reg_override=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64);
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  // Build the list of live host registers (stubs must preserve them)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    // c: the base address is a known constant at compile time
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      // memtarget: constant address falls inside RAM (fast path is valid)
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  // Pick the register holding the effective address
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(th>=0) reglist&=~(1<<th);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // Runtime range check; returns the stub branch to patch
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // Constant RAM address: apply the RAM base offset once up front
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastload_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(th>=0);
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else {
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    emit_zeroreg(th);
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
}
2192
#ifndef loadlr_assemble
// Fallback for architectures that do not provide loadlr_assemble
// (LWL/LWR).  Report the missing port on stderr (not stdout) and abort.
void loadlr_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
2200
2201 void store_assemble(int i,struct regstat *i_regs)
2202 {
2203   int s,tl;
2204   int addr,temp;
2205   int offset;
2206   void *jaddr=0;
2207   enum stub_type type;
2208   int memtarget=0,c=0;
2209   int agr=AGEN1+(i&1);
2210   int faststore_reg_override=0;
2211   u_int hr,reglist=0;
2212   tl=get_reg(i_regs->regmap,rs2[i]);
2213   s=get_reg(i_regs->regmap,rs1[i]);
2214   temp=get_reg(i_regs->regmap,agr);
2215   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2216   offset=imm[i];
2217   if(s>=0) {
2218     c=(i_regs->wasconst>>s)&1;
2219     if(c) {
2220       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2221     }
2222   }
2223   assert(tl>=0);
2224   assert(temp>=0);
2225   for(hr=0;hr<HOST_REGS;hr++) {
2226     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2227   }
2228   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2229   if(offset||s<0||c) addr=temp;
2230   else addr=s;
2231   if(!c) {
2232     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2233   }
2234   else if(ram_offset&&memtarget) {
2235     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2236     faststore_reg_override=HOST_TEMPREG;
2237   }
2238
2239   if (opcode[i]==0x28) { // SB
2240     if(!c||memtarget) {
2241       int x=0,a=temp;
2242       if(!c) a=addr;
2243       if(faststore_reg_override) a=faststore_reg_override;
2244       emit_writebyte_indexed(tl,x,a);
2245     }
2246     type=STOREB_STUB;
2247   }
2248   if (opcode[i]==0x29) { // SH
2249     if(!c||memtarget) {
2250       int x=0,a=temp;
2251       if(!c) a=addr;
2252       if(faststore_reg_override) a=faststore_reg_override;
2253       emit_writehword_indexed(tl,x,a);
2254     }
2255     type=STOREH_STUB;
2256   }
2257   if (opcode[i]==0x2B) { // SW
2258     if(!c||memtarget) {
2259       int a=addr;
2260       if(faststore_reg_override) a=faststore_reg_override;
2261       emit_writeword_indexed(tl,0,a);
2262     }
2263     type=STOREW_STUB;
2264   }
2265   if (opcode[i]==0x3F) { // SD
2266     assert(0);
2267     type=STORED_STUB;
2268   }
2269   if(jaddr) {
2270     // PCSX store handlers don't check invcode again
2271     reglist|=1<<addr;
2272     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2273     jaddr=0;
2274   }
2275   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2276     if(!c||memtarget) {
2277       #ifdef DESTRUCTIVE_SHIFT
2278       // The x86 shift operation is 'destructive'; it overwrites the
2279       // source register, so we need to make a copy first and use that.
2280       addr=temp;
2281       #endif
2282       #if defined(HOST_IMM8)
2283       int ir=get_reg(i_regs->regmap,INVCP);
2284       assert(ir>=0);
2285       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2286       #else
2287       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2288       #endif
2289       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2290       emit_callne(invalidate_addr_reg[addr]);
2291       #else
2292       void *jaddr2 = out;
2293       emit_jne(0);
2294       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2295       #endif
2296     }
2297   }
2298   u_int addr_val=constmap[i][s]+offset;
2299   if(jaddr) {
2300     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2301   } else if(c&&!memtarget) {
2302     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2303   }
2304   // basic current block modification detection..
2305   // not looking back as that should be in mips cache already
2306   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2307     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2308     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2309     if(i_regs->regmap==regs[i].regmap) {
2310       load_all_consts(regs[i].regmap_entry,regs[i].wasdirty,i);
2311       wb_dirtys(regs[i].regmap_entry,regs[i].wasdirty);
2312       emit_movimm(start+i*4+4,0);
2313       emit_writeword(0,&pcaddr);
2314       emit_jmp(do_interrupt);
2315     }
2316   }
2317 }
2318
// Assemble unaligned store instructions SWL/SWR (SDL/SDR assert: no
// 64-bit stores in PSX code).  Branches on the low two address bits to
// one of four cases, each writing the appropriate partial word, then
// performs the self-modifying-code check.
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  // Live host registers the stub must preserve
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Runtime RAM range check; out-of-range goes to the STORELR stub
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    if(!memtarget||!rs1[i]) {
      // Known non-RAM target: always take the stub path
      jaddr=out;
      emit_jmp(0);
    }
  }
  emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // Dispatch on the (byte-swapped) low two address bits
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  if(!c||!memtarget)
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  // Self-modifying-code check, as in store_assemble
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    // Undo the ram_offset adjustment applied above
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
2482
// COP1 load/store: the PSX has no FPU, so these raise the
// coprocessor-unusable exception.
void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
2487
// Assemble GTE (COP2) load/store: LWC2 reads memory into a GTE data
// register, SWC2 writes one out.  Uses the same fast-RAM-path/stub
// scheme as load_assemble/store_assemble, with the value staged in the
// FTEMP host register.
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=0;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // GTE data register number
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  // Live host registers the stub must preserve
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // Fetch the GTE register value into FTEMP before storing
    cop2_get_dreg(copr,tl,HOST_TEMPREG);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Known non-RAM target: always take the stub path
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for the store case only
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // Write the loaded value into the GTE register file
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
  }
}
2581
#ifndef multdiv_assemble
// Fallback for architectures that do not provide multdiv_assemble
// (MULT/MULTU/DIV/DIVU).  Report the missing port on stderr and abort.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
2589
2590 void mov_assemble(int i,struct regstat *i_regs)
2591 {
2592   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
2593   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
2594   if(rt1[i]) {
2595     signed char sh,sl,th,tl;
2596     th=get_reg(i_regs->regmap,rt1[i]|64);
2597     tl=get_reg(i_regs->regmap,rt1[i]);
2598     //assert(tl>=0);
2599     if(tl>=0) {
2600       sh=get_reg(i_regs->regmap,rs1[i]|64);
2601       sl=get_reg(i_regs->regmap,rs1[i]);
2602       if(sl>=0) emit_mov(sl,tl);
2603       else emit_loadreg(rs1[i],tl);
2604       if(th>=0) {
2605         if(sh>=0) emit_mov(sh,th);
2606         else emit_loadreg(rs1[i]|64,th);
2607       }
2608     }
2609   }
2610 }
2611
// Assemble SYSCALL: store the current PC, charge the accumulated cycle
// count, and jump to the HLE syscall handler.
void syscall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_jmp(jump_syscall_hle); // XXX
}
2622
// Assemble an HLE BIOS call: pass the return PC and the HLE handler
// (looked up in psxHLEt by the instruction's low 26 bits, falling back
// to psxNULL when out of range), then jump to the HLE dispatcher.
void hlecall_assemble(int i,struct regstat *i_regs)
{
  extern void psxNULL();
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4+4,0); // Get PC
  uint32_t hleCode = source[i] & 0x03ffffff;
  if (hleCode >= ARRAY_SIZE(psxHLEt))
    emit_movimm((uintptr_t)psxNULL,1);
  else
    emit_movimm((uintptr_t)psxHLEt[hleCode],1);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_jmp(jump_hlecall);
}
2639
// Assemble a call out to the interpreter: store the current PC, charge
// the accumulated cycle count, and jump to the interpreter entry.
void intcall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4,0); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
  emit_jmp(jump_intcall);
}
2650
// Assemble the instruction in a branch delay slot.  Sets is_delayslot
// around the dispatch so the per-type assemblers can special-case it.
// Branches in the delay slot are not assembled (warning only).
void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
2700
2701 // Is the branch target a valid internal jump?
2702 static int internal_branch(int addr)
2703 {
2704   if(addr&1) return 0; // Indirect (register) jump
2705   if(addr>=start && addr<start+slen*4-4)
2706   {
2707     return 1;
2708   }
2709   return 0;
2710 }
2711
// Transition host register state from mapping pre[] to entry[]:
// first write back dirty registers whose values would otherwise be
// lost (unless they are unneeded per the u bitmap), then move values
// that stay live but change host register.
static void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t u)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            // Not present anywhere in the new mapping: must spill
            if(get_reg(entry,pre[hr])<0) {
              assert(pre[hr]<64);
              if(!((u>>pre[hr])&1))
                emit_storereg(pre[hr],hr);
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
2744
2745 // Load the specified registers
2746 // This only loads the registers given as arguments because
2747 // we don't want to load things that will be overwritten
2748 static void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2)
2749 {
2750   int hr;
2751   // Load 32-bit regs
2752   for(hr=0;hr<HOST_REGS;hr++) {
2753     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
2754       if(entry[hr]!=regmap[hr]) {
2755         if(regmap[hr]==rs1||regmap[hr]==rs2)
2756         {
2757           if(regmap[hr]==0) {
2758             emit_zeroreg(hr);
2759           }
2760           else
2761           {
2762             emit_loadreg(regmap[hr],hr);
2763           }
2764         }
2765       }
2766     }
2767   }
2768 }
2769
2770 // Load registers prior to the start of a loop
2771 // so that they are not loaded within the loop
// Load registers prior to the start of a loop
// so that they are not loaded within the loop
static void loop_preload(signed char pre[],signed char entry[])
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(entry[hr]>=0) {
          // Only preload if the value is not already held somewhere
          // in the pre-loop mapping
          if(get_reg(pre,entry[hr])<0) {
            assem_debug("loop preload:\n");
            //printf("loop preload: %d\n",hr);
            if(entry[hr]==0) {
              emit_zeroreg(hr);
            }
            else if(entry[hr]<TEMPREG)
            {
              emit_loadreg(entry[hr],hr);
            }
            else if(entry[hr]-64<TEMPREG)
            {
              emit_loadreg(entry[hr],hr);
            }
          }
        }
      }
    }
  }
}
2799
2800 // Generate address for load/store instruction
2801 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;
    int agr=AGEN1+(i&1);
    // Select the host register that will hold the generated address
    if(itype[i]==LOAD) {
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          // LWL/LWR and LDL/LDR need the address aligned down
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not in a host register: reload it
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Constant base: materialize the final address directly
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with an offset: add it now
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
2905
// Look ahead from instruction i to find the last constant value that
// host register hr will hold while its mapping stays unchanged, so a
// single materialization can cover the whole run.  Returns 1 when the
// value should be loaded, 0 when the register is unneeded afterwards.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance while the mapping, constness, and block structure allow
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;  // stop at branch targets
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  assert(reg < 64);
  // Skip the load entirely if the register is unneeded afterwards
  return !((unneeded_reg[i+1]>>reg)&1);
}
2945
2946 // Load registers with known constants
// Load registers with known constants
static void load_consts(signed char pre[],signed char regmap[],int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0;
  else {
    // Carry over the loadedconst bit when the mapping and constness
    // are unchanged from the previous instruction
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        assert(regmap[hr]<64);
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              // Derive the constant from the similar register (cheaper
              // than a full immediate load on some hosts)
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
}
3000
3001 void load_all_consts(signed char regmap[], u_int dirty, int i)
3002 {
3003   int hr;
3004   // Load 32-bit regs
3005   for(hr=0;hr<HOST_REGS;hr++) {
3006     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3007       assert(regmap[hr] < 64);
3008       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>0) {
3009         int value=constmap[i][hr];
3010         if(value==0) {
3011           emit_zeroreg(hr);
3012         }
3013         else {
3014           emit_movimm(value,hr);
3015         }
3016       }
3017     }
3018   }
3019 }
3020
3021 // Write out all dirty registers (except cycle count)
3022 static void wb_dirtys(signed char i_regmap[],uint64_t i_dirty)
3023 {
3024   int hr;
3025   for(hr=0;hr<HOST_REGS;hr++) {
3026     if(hr!=EXCLUDE_REG) {
3027       if(i_regmap[hr]>0) {
3028         if(i_regmap[hr]!=CCREG) {
3029           if((i_dirty>>hr)&1) {
3030             assert(i_regmap[hr]<64);
3031             emit_storereg(i_regmap[hr],hr);
3032           }
3033         }
3034       }
3035     }
3036   }
3037 }
3038
3039 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3040 // This writes the registers not written by store_regs_bt
3041 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_dirty,int addr)
3042 {
3043   int hr;
3044   int t=(addr-start)>>2;
3045   for(hr=0;hr<HOST_REGS;hr++) {
3046     if(hr!=EXCLUDE_REG) {
3047       if(i_regmap[hr]>0) {
3048         if(i_regmap[hr]!=CCREG) {
3049           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) {
3050             if((i_dirty>>hr)&1) {
3051               assert(i_regmap[hr]<64);
3052               emit_storereg(i_regmap[hr],hr);
3053             }
3054           }
3055         }
3056       }
3057     }
3058   }
3059 }
3060
3061 // Load all registers (except cycle count)
3062 void load_all_regs(signed char i_regmap[])
3063 {
3064   int hr;
3065   for(hr=0;hr<HOST_REGS;hr++) {
3066     if(hr!=EXCLUDE_REG) {
3067       if(i_regmap[hr]==0) {
3068         emit_zeroreg(hr);
3069       }
3070       else
3071       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3072       {
3073         emit_loadreg(i_regmap[hr],hr);
3074       }
3075     }
3076   }
3077 }
3078
3079 // Load all current registers also needed by next instruction
3080 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3081 {
3082   int hr;
3083   for(hr=0;hr<HOST_REGS;hr++) {
3084     if(hr!=EXCLUDE_REG) {
3085       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3086         if(i_regmap[hr]==0) {
3087           emit_zeroreg(hr);
3088         }
3089         else
3090         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3091         {
3092           emit_loadreg(i_regmap[hr],hr);
3093         }
3094       }
3095     }
3096   }
3097 }
3098
// Load all regs, storing cycle count if necessary
// Emits the register setup sequence for entering the block at insn index t.
void load_regs_entry(int t)
{
  int hr;
  // Adjust the cycle counter for where in the block we are entering
  if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG); // delay-slot entry costs one extra cycle
  else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
  // Spill CC to memory first if the entry point does not keep it in HOST_CCREG
  if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
    emit_storereg(CCREG,HOST_CCREG);
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
      if(regs[t].regmap_entry[hr]==0) {
        emit_zeroreg(hr); // guest $zero
      }
      else if(regs[t].regmap_entry[hr]!=CCREG)
      {
        emit_loadreg(regs[t].regmap_entry[hr],hr);
      }
    }
  }
}
3121
3122 // Store dirty registers prior to branch
3123 void store_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
3124 {
3125   if(internal_branch(addr))
3126   {
3127     int t=(addr-start)>>2;
3128     int hr;
3129     for(hr=0;hr<HOST_REGS;hr++) {
3130       if(hr!=EXCLUDE_REG) {
3131         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
3132           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1)) {
3133             if((i_dirty>>hr)&1) {
3134               assert(i_regmap[hr]<64);
3135               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
3136                 emit_storereg(i_regmap[hr],hr);
3137             }
3138           }
3139         }
3140       }
3141     }
3142   }
3143   else
3144   {
3145     // Branch out of this block, write out all dirty regs
3146     wb_dirtys(i_regmap,i_dirty);
3147   }
3148 }
3149
// Load all needed registers for branch target
// Only does anything for block-internal targets; external targets get their
// registers loaded by the target block's own entry code.
static void load_regs_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  //if(addr>=start && addr<(start+slen*4))
  if(internal_branch(addr))
  {
    int t=(addr-start)>>2;
    int hr;
    // Store the cycle count before loading something else
    if(i_regmap[HOST_CCREG]!=CCREG) {
      assert(i_regmap[HOST_CCREG]==-1);
    }
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
      emit_storereg(CCREG,HOST_CCREG);
    }
    // Load 32-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
        // only load when the reg is not already in the right place
        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
          if(regs[t].regmap_entry[hr]==0) {
            emit_zeroreg(hr); // guest $zero
          }
          else if(regs[t].regmap_entry[hr]!=CCREG)
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
      }
    }
  }
}
3181
// Test whether the current register state (i_regmap / i_dirty) is already
// compatible with the state expected at branch target addr, i.e. whether a
// direct jump is possible without any register shuffling or writeback.
// Returns 1 on match, 0 when fixup code would be required.
static int match_bt(signed char i_regmap[],uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4) // block-internal target
  {
    int t=(addr-start)>>2;
    int hr;
    // target must expect the cycle count in HOST_CCREG
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          // target wants a different guest reg in this host reg
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // our value is dirty but the target doesn't track it;
            // a writeback would be needed unless the reg is unneeded there
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              assert(0);
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            // target expects it clean but we have it dirty
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
        }
      }
    }
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External (or last-insn) target: everything must be written back,
    // so any dirty reg (other than CC in HOST_CCREG) prevents a match.
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
3257
#ifdef DRC_DBG
// Emit a call to do_insn_cmp before instruction i so the recompiled code can
// be cross-checked against a reference run (debug builds only).
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  extern int cycle;
  u_int hr,reglist=0;

  // save every allocated host register around the C call
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  emit_movimm(start+i*4,0);   // PC of the instruction being checked
  emit_writeword(0,&pcaddr);
  emit_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  restore_regs(reglist);
}
#else
#define drc_dbg_emit_do_cmp(x)
#endif
3279
// Used when a branch jumps into the delay slot of another branch
// Assembles the delay-slot instruction at ba[i] as a standalone entry point,
// then branches to the instruction after it (ba[i]+4, which must be
// block-internal — see the assert below).
void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2; // index of the delay-slot instruction
  if (!instr_addr[t])
    instr_addr[t] = out;
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // spill CC if the entry expects it in HOST_CCREG but the insn doesn't keep it there
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
  load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // stores also need INVCP for invalid-code-page checks
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,INVCP,INVCP);
  is_delayslot=0;
  // dispatch to the per-type assembler for the delay-slot instruction
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // fall through to the instruction after the delay slot
  store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+4);
  if(internal_branch(ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(ba[i]+4));
  emit_jmp(0);
}
3349
// Emit the cycle-count check for branch instruction i and register the
// corresponding CC_STUB (slow path that calls cc_interrupt).
//   adj:    out-param, cycle adjustment already applied at the target
//   addr:   target address recorded in the stub
//   taken:  TAKEN/NOTTAKEN disposition recorded in the stub
//   invert: nonzero when the branch sense is inverted by the caller
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  // Branch to self with a NOP delay slot = busy-wait idle loop:
  // burn the remaining cycles instead of looping in generated code.
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    // short backwards branches: charge fewer cycles to speed up tight loops
    if (t&&*adj) {
      int rel=t-i;
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // target already applies *adj cycles: just compare, don't modify CC here
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
3401
3402 static void do_ccstub(int n)
3403 {
3404   literal_pool(256);
3405   assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
3406   set_jump_target(stubs[n].addr, out);
3407   int i=stubs[n].b;
3408   if(stubs[n].d==NULLDS) {
3409     // Delay slot instruction is nullified ("likely" branch)
3410     wb_dirtys(regs[i].regmap,regs[i].dirty);
3411   }
3412   else if(stubs[n].d!=TAKEN) {
3413     wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
3414   }
3415   else {
3416     if(internal_branch(ba[i]))
3417       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3418   }
3419   if(stubs[n].c!=-1)
3420   {
3421     // Save PC as return address
3422     emit_movimm(stubs[n].c,EAX);
3423     emit_writeword(EAX,&pcaddr);
3424   }
3425   else
3426   {
3427     // Return address depends on which way the branch goes
3428     if(itype[i]==CJUMP||itype[i]==SJUMP)
3429     {
3430       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
3431       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
3432       if(rs1[i]==0)
3433       {
3434         s1l=s2l;
3435         s2l=-1;
3436       }
3437       else if(rs2[i]==0)
3438       {
3439         s2l=-1;
3440       }
3441       assert(s1l>=0);
3442       #ifdef DESTRUCTIVE_WRITEBACK
3443       if(rs1[i]) {
3444         if((branch_regs[i].dirty>>s1l)&&1)
3445           emit_loadreg(rs1[i],s1l);
3446       }
3447       else {
3448         if((branch_regs[i].dirty>>s1l)&1)
3449           emit_loadreg(rs2[i],s1l);
3450       }
3451       if(s2l>=0)
3452         if((branch_regs[i].dirty>>s2l)&1)
3453           emit_loadreg(rs2[i],s2l);
3454       #endif
3455       int hr=0;
3456       int addr=-1,alt=-1,ntaddr=-1;
3457       while(hr<HOST_REGS)
3458       {
3459         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
3460            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
3461            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
3462         {
3463           addr=hr++;break;
3464         }
3465         hr++;
3466       }
3467       while(hr<HOST_REGS)
3468       {
3469         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
3470            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
3471            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
3472         {
3473           alt=hr++;break;
3474         }
3475         hr++;
3476       }
3477       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
3478       {
3479         while(hr<HOST_REGS)
3480         {
3481           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
3482              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
3483              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
3484           {
3485             ntaddr=hr;break;
3486           }
3487           hr++;
3488         }
3489         assert(hr<HOST_REGS);
3490       }
3491       if((opcode[i]&0x2f)==4) // BEQ
3492       {
3493         #ifdef HAVE_CMOV_IMM
3494         if(s2l>=0) emit_cmp(s1l,s2l);
3495         else emit_test(s1l,s1l);
3496         emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
3497         #else
3498         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
3499         if(s2l>=0) emit_cmp(s1l,s2l);
3500         else emit_test(s1l,s1l);
3501         emit_cmovne_reg(alt,addr);
3502         #endif
3503       }
3504       if((opcode[i]&0x2f)==5) // BNE
3505       {
3506         #ifdef HAVE_CMOV_IMM
3507         if(s2l>=0) emit_cmp(s1l,s2l);
3508         else emit_test(s1l,s1l);
3509         emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
3510         #else
3511         emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
3512         if(s2l>=0) emit_cmp(s1l,s2l);
3513         else emit_test(s1l,s1l);
3514         emit_cmovne_reg(alt,addr);
3515         #endif
3516       }
3517       if((opcode[i]&0x2f)==6) // BLEZ
3518       {
3519         //emit_movimm(ba[i],alt);
3520         //emit_movimm(start+i*4+8,addr);
3521         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
3522         emit_cmpimm(s1l,1);
3523         emit_cmovl_reg(alt,addr);
3524       }
3525       if((opcode[i]&0x2f)==7) // BGTZ
3526       {
3527         //emit_movimm(ba[i],addr);
3528         //emit_movimm(start+i*4+8,ntaddr);
3529         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
3530         emit_cmpimm(s1l,1);
3531         emit_cmovl_reg(ntaddr,addr);
3532       }
3533       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
3534       {
3535         //emit_movimm(ba[i],alt);
3536         //emit_movimm(start+i*4+8,addr);
3537         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
3538         emit_test(s1l,s1l);
3539         emit_cmovs_reg(alt,addr);
3540       }
3541       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
3542       {
3543         //emit_movimm(ba[i],addr);
3544         //emit_movimm(start+i*4+8,alt);
3545         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
3546         emit_test(s1l,s1l);
3547         emit_cmovs_reg(alt,addr);
3548       }
3549       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
3550         if(source[i]&0x10000) // BC1T
3551         {
3552           //emit_movimm(ba[i],alt);
3553           //emit_movimm(start+i*4+8,addr);
3554           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
3555           emit_testimm(s1l,0x800000);
3556           emit_cmovne_reg(alt,addr);
3557         }
3558         else // BC1F
3559         {
3560           //emit_movimm(ba[i],addr);
3561           //emit_movimm(start+i*4+8,alt);
3562           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
3563           emit_testimm(s1l,0x800000);
3564           emit_cmovne_reg(alt,addr);
3565         }
3566       }
3567       emit_writeword(addr,&pcaddr);
3568     }
3569     else
3570     if(itype[i]==RJUMP)
3571     {
3572       int r=get_reg(branch_regs[i].regmap,rs1[i]);
3573       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
3574         r=get_reg(branch_regs[i].regmap,RTEMP);
3575       }
3576       emit_writeword(r,&pcaddr);
3577     }
3578     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
3579   }
3580   // Update cycle count
3581   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
3582   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
3583   emit_call(cc_interrupt);
3584   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
3585   if(stubs[n].d==TAKEN) {
3586     if(internal_branch(ba[i]))
3587       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
3588     else if(itype[i]==RJUMP) {
3589       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
3590         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
3591       else
3592         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
3593     }
3594   }else if(stubs[n].d==NOTTAKEN) {
3595     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
3596     else load_all_regs(branch_regs[i].regmap);
3597   }else if(stubs[n].d==NULLDS) {
3598     // Delay slot instruction is nullified ("likely" branch)
3599     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
3600     else load_all_regs(regs[i].regmap);
3601   }else{
3602     load_all_regs(branch_regs[i].regmap);
3603   }
3604   emit_jmp(stubs[n].retaddr);
3605 }
3606
3607 static void add_to_linker(void *addr, u_int target, int ext)
3608 {
3609   assert(linkcount < ARRAY_SIZE(link_addr));
3610   link_addr[linkcount].addr = addr;
3611   link_addr[linkcount].target = target;
3612   link_addr[linkcount].ext = ext;
3613   linkcount++;
3614 }
3615
// Write the return address (PC+8) into the link register ($ra, reg 31)
// for a JAL-style unconditional jump at instruction i.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8; // insn after the delay slot
  if(rt>=0) {
    #ifdef USE_MINI_HT
    if(internal_branch(return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      // NOTE(review): 'temp' and 'i_regmap' are not declared in this scope;
      // this path looks like it would not compile with REG_PREFETCH defined
      // (without USE_MINI_HT) — confirm before enabling that option.
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
3650
// Assemble an unconditional jump (J/JAL) at instruction i: assemble the
// delay slot, write $ra if linking, update the cycle count, and emit the
// jump (inline for internal delay-slot targets, linker-patched otherwise).
void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $ra, write it back before assembling the slot
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  if(internal_branch(ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // Jumps into another branch's delay slot need a special entry stub
  if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(ba[i]));
    emit_jmp(0);
  }
}
3699
// Write the return address (PC+8) into the link register for a JALR-style
// register jump at instruction i.
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  // the delay slot must not clobber the link register
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8; // insn after the delay slot
  #ifdef REG_PREFETCH
  // NOTE(review): 'temp' and 'i_regmap' are not declared in this function;
  // this path looks like it would not compile with REG_PREFETCH defined —
  // confirm before enabling that option.
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
3720
// Assemble a register-indirect jump (JR/JALR) at instruction i: copy the
// jump register if the delay slot clobbers it, assemble the slot, write
// $rd if linking, then dispatch through jump_vaddr (or the mini hash table).
void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link register, write it before the slot
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  uint64_t bc_unneeded=branch_regs[i].u;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]); // the jump register is still needed
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
  load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1); // target unknown: write everything back
  #ifdef DESTRUCTIVE_WRITEBACK
  if((branch_regs[i].dirty>>rs)&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    emit_jmp(jump_vaddr_reg[rs]);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
3817
3818 void cjump_assemble(int i,struct regstat *i_regs)
3819 {
3820   signed char *i_regmap=i_regs->regmap;
3821   int cc;
3822   int match;
3823   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3824   assem_debug("match=%d\n",match);
3825   int s1l,s2l;
3826   int unconditional=0,nop=0;
3827   int invert=0;
3828   int internal=internal_branch(ba[i]);
3829   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
3830   if(!match) invert=1;
3831   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3832   if(i>(ba[i]-start)>>2) invert=1;
3833   #endif
3834
3835   if(ooo[i]) {
3836     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
3837     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
3838   }
3839   else {
3840     s1l=get_reg(i_regmap,rs1[i]);
3841     s2l=get_reg(i_regmap,rs2[i]);
3842   }
3843   if(rs1[i]==0&&rs2[i]==0)
3844   {
3845     if(opcode[i]&1) nop=1;
3846     else unconditional=1;
3847     //assert(opcode[i]!=5);
3848     //assert(opcode[i]!=7);
3849     //assert(opcode[i]!=0x15);
3850     //assert(opcode[i]!=0x17);
3851   }
3852   else if(rs1[i]==0)
3853   {
3854     s1l=s2l;
3855     s2l=-1;
3856   }
3857   else if(rs2[i]==0)
3858   {
3859     s2l=-1;
3860   }
3861
3862   if(ooo[i]) {
3863     // Out of order execution (delay slot first)
3864     //printf("OOOE\n");
3865     address_generation(i+1,i_regs,regs[i].regmap_entry);
3866     ds_assemble(i+1,i_regs);
3867     int adj;
3868     uint64_t bc_unneeded=branch_regs[i].u;
3869     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
3870     bc_unneeded|=1;
3871     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
3872     load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs2[i]);
3873     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
3874     cc=get_reg(branch_regs[i].regmap,CCREG);
3875     assert(cc==HOST_CCREG);
3876     if(unconditional)
3877       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3878     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
3879     //assem_debug("cycle count (adj)\n");
3880     if(unconditional) {
3881       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
3882       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
3883         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
3884         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3885         if(internal)
3886           assem_debug("branch: internal\n");
3887         else
3888           assem_debug("branch: external\n");
3889         if(internal&&is_ds[(ba[i]-start)>>2]) {
3890           ds_assemble_entry(i);
3891         }
3892         else {
3893           add_to_linker(out,ba[i],internal);
3894           emit_jmp(0);
3895         }
3896         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3897         if(((u_int)out)&7) emit_addnop(0);
3898         #endif
3899       }
3900     }
3901     else if(nop) {
3902       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
3903       void *jaddr=out;
3904       emit_jns(0);
3905       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
3906     }
3907     else {
3908       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
3909       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
3910       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
3911
3912       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
3913       assert(s1l>=0);
3914       if(opcode[i]==4) // BEQ
3915       {
3916         if(s2l>=0) emit_cmp(s1l,s2l);
3917         else emit_test(s1l,s1l);
3918         if(invert){
3919           nottaken=out;
3920           emit_jne((void *)1l);
3921         }else{
3922           add_to_linker(out,ba[i],internal);
3923           emit_jeq(0);
3924         }
3925       }
3926       if(opcode[i]==5) // BNE
3927       {
3928         if(s2l>=0) emit_cmp(s1l,s2l);
3929         else emit_test(s1l,s1l);
3930         if(invert){
3931           nottaken=out;
3932           emit_jeq(1);
3933         }else{
3934           add_to_linker(out,ba[i],internal);
3935           emit_jne(0);
3936         }
3937       }
3938       if(opcode[i]==6) // BLEZ
3939       {
3940         emit_cmpimm(s1l,1);
3941         if(invert){
3942           nottaken=out;
3943           emit_jge(1);
3944         }else{
3945           add_to_linker(out,ba[i],internal);
3946           emit_jl(0);
3947         }
3948       }
3949       if(opcode[i]==7) // BGTZ
3950       {
3951         emit_cmpimm(s1l,1);
3952         if(invert){
3953           nottaken=out;
3954           emit_jl(1);
3955         }else{
3956           add_to_linker(out,ba[i],internal);
3957           emit_jge(0);
3958         }
3959       }
3960       if(invert) {
3961         if(taken) set_jump_target(taken, out);
3962         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3963         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
3964           if(adj) {
3965             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
3966             add_to_linker(out,ba[i],internal);
3967           }else{
3968             emit_addnop(13);
3969             add_to_linker(out,ba[i],internal*2);
3970           }
3971           emit_jmp(0);
3972         }else
3973         #endif
3974         {
3975           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
3976           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3977           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3978           if(internal)
3979             assem_debug("branch: internal\n");
3980           else
3981             assem_debug("branch: external\n");
3982           if(internal&&is_ds[(ba[i]-start)>>2]) {
3983             ds_assemble_entry(i);
3984           }
3985           else {
3986             add_to_linker(out,ba[i],internal);
3987             emit_jmp(0);
3988           }
3989         }
3990         set_jump_target(nottaken, out);
3991       }
3992
3993       if(nottaken1) set_jump_target(nottaken1, out);
3994       if(adj) {
3995         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
3996       }
3997     } // (!unconditional)
3998   } // if(ooo)
3999   else
4000   {
4001     // In-order execution (branch first)
4002     //if(likely[i]) printf("IOL\n");
4003     //else
4004     //printf("IOE\n");
4005     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4006     if(!unconditional&&!nop) {
4007       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4008       assert(s1l>=0);
4009       if((opcode[i]&0x2f)==4) // BEQ
4010       {
4011         if(s2l>=0) emit_cmp(s1l,s2l);
4012         else emit_test(s1l,s1l);
4013         nottaken=out;
4014         emit_jne((void *)2l);
4015       }
4016       if((opcode[i]&0x2f)==5) // BNE
4017       {
4018         if(s2l>=0) emit_cmp(s1l,s2l);
4019         else emit_test(s1l,s1l);
4020         nottaken=out;
4021         emit_jeq(2);
4022       }
4023       if((opcode[i]&0x2f)==6) // BLEZ
4024       {
4025         emit_cmpimm(s1l,1);
4026         nottaken=out;
4027         emit_jge(2);
4028       }
4029       if((opcode[i]&0x2f)==7) // BGTZ
4030       {
4031         emit_cmpimm(s1l,1);
4032         nottaken=out;
4033         emit_jl(2);
4034       }
4035     } // if(!unconditional)
4036     int adj;
4037     uint64_t ds_unneeded=branch_regs[i].u;
4038     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
4039     ds_unneeded|=1;
4040     // branch taken
4041     if(!nop) {
4042       if(taken) set_jump_target(taken, out);
4043       assem_debug("1:\n");
4044       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
4045       // load regs
4046       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
4047       address_generation(i+1,&branch_regs[i],0);
4048       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
4049       ds_assemble(i+1,&branch_regs[i]);
4050       cc=get_reg(branch_regs[i].regmap,CCREG);
4051       if(cc==-1) {
4052         emit_loadreg(CCREG,cc=HOST_CCREG);
4053         // CHECK: Is the following instruction (fall thru) allocated ok?
4054       }
4055       assert(cc==HOST_CCREG);
4056       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4057       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
4058       assem_debug("cycle count (adj)\n");
4059       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4060       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4061       if(internal)
4062         assem_debug("branch: internal\n");
4063       else
4064         assem_debug("branch: external\n");
4065       if(internal&&is_ds[(ba[i]-start)>>2]) {
4066         ds_assemble_entry(i);
4067       }
4068       else {
4069         add_to_linker(out,ba[i],internal);
4070         emit_jmp(0);
4071       }
4072     }
4073     // branch not taken
4074     if(!unconditional) {
4075       if(nottaken1) set_jump_target(nottaken1, out);
4076       set_jump_target(nottaken, out);
4077       assem_debug("2:\n");
4078       if(!likely[i]) {
4079         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
4080         load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
4081         address_generation(i+1,&branch_regs[i],0);
4082         load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
4083         ds_assemble(i+1,&branch_regs[i]);
4084       }
4085       cc=get_reg(branch_regs[i].regmap,CCREG);
4086       if(cc==-1&&!likely[i]) {
4087         // Cycle count isn't in a register, temporarily load it then write it out
4088         emit_loadreg(CCREG,HOST_CCREG);
4089         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4090         void *jaddr=out;
4091         emit_jns(0);
4092         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4093         emit_storereg(CCREG,HOST_CCREG);
4094       }
4095       else{
4096         cc=get_reg(i_regmap,CCREG);
4097         assert(cc==HOST_CCREG);
4098         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4099         void *jaddr=out;
4100         emit_jns(0);
4101         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
4102       }
4103     }
4104   }
4105 }
4106
/*
 * Assemble a MIPS REGIMM-type branch: BLTZ/BGEZ and the linking
 * BLTZAL/BGEZAL variants (opcode2[i] selects the condition; rt1[i]==31
 * marks a linking variant).  Mirrors the conditional-branch assembler
 * above: handles both out-of-order layout (delay slot emitted before the
 * test, ooo[i] set) and in-order layout (branch first), charges the
 * cycle count, and wires the emitted branch to its target — directly via
 * ds_assemble_entry for in-block targets, otherwise through the linker.
 *
 *   i      - instruction index within the current block
 *   i_regs - register mapping in effect at this instruction
 */
void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Nonzero when the target block expects the register mapping we will
  // have at the branch (i.e. no writeback needed on the taken path).
  match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1l;                           // host reg holding rs1, or <0
  int unconditional=0,nevertaken=0;  // outcome known at compile time
  int invert=0;                      // emit inverted test, jump around taken path
  int internal=internal_branch(ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  // Mismatched mapping forces writeback code on the taken path, so
  // invert the condition and branch around that code instead.
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  if(ooo[i]) {
    // Delay slot runs first: look rs1 up in the post-delay-slot mapping.
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
  }
  if(rs1[i]==0)
  {
    // r0 is hardwired zero: the >=0 variants (odd opcode2) always take,
    // the <0 variants never do.
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    // Registers the branch itself no longer needs (excluding its own
    // sources) can be invalidated instead of written back.
    uint64_t bc_unneeded=branch_regs[i].u;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,bc_unneeded);
    load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
    if(rt1[i]==31) {
      // Linking variant (BxxZAL): write the return address into $ra.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Self-branch with an all-zero (nop) delay slot needs no jump at
      // all: the idle-loop case detected above.
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Never taken: only charge cycles and test for an interrupt.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      {
        assert(s1l>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          // Sign test: taken when rs1 < 0.
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
        {
          // Sign test: taken when rs1 >= 0.
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      }

      if(invert) {
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          // Taken path: write back / reload registers to match the
          // target block's expected mapping, then jump.
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        // Fall-through (not-taken) path resumes here.
        set_jump_target(nottaken, out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      // Linking variant: set $ra before anything else.
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(1);
        }
    } // if(!unconditional)
    int adj;
    // Delay-slot results the branch continuation won't read.
    uint64_t ds_unneeded=branch_regs[i].u;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      if(!likely[i]) {
        // Non-likely branch: the delay slot executes on this path too.
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,ds_unneeded);
        load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
4370
/*
 * Assemble a branch whose delay slot falls outside the current block
 * (the "page span" case).  Instead of emitting a direct jump to the
 * branch target, this computes the eventual target address into a host
 * register, stashes it in HOST_BTREG, and jumps to the separately
 * compiled delay-slot entry at start+i*4+5 — the +4 is the delay slot's
 * address and the set low bit marks a delay-slot entry, matching
 * pagespan_ds() below which registers its stub at vaddr start+1.
 * For "likely" branches the not-taken path skips the delay slot, so an
 * actual conditional jump (nottaken) is emitted instead of a cmov.
 */
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  // Fold a $zero operand so the compare degenerates to a self-test.
  if(rs1[i]==0)
  {
    s1l=s2l;
    s2l=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=-1;
  }
  int hr=0;
  // Scavenge scratch host registers (addr/alt/ntaddr) for the computed
  // target addresses, avoiding CCREG, BTREG and the branch sources.
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
  }
  // Charge the cycle count up front.
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    // Register-indirect target: just copy it into the address register.
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      // Same register compared with itself: always taken.
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(1) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      // No cmov-with-immediate: materialize both targets, then select.
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    #else
    assert(s1l>=0);
    emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    emit_cmovne_reg(alt,addr);
    #endif
  }
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    // Likely branch: not-taken path must skip the delay slot, so emit a
    // real conditional jump to be patched below.
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(alt,addr);
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    emit_cmovl_reg(ntaddr,addr);
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // COP1 condition-code branches dispatch on bit 23 of the status reg.
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].dirty);
  // Publish the selected target in BTREG for the delay-slot stub:
  // statically known for likely/unconditional branches, otherwise the
  // register selected by the cmov sequences above.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  // Jump to the delay-slot entry (low bit set flags a ds entry); link it
  // directly if already compiled, else through an external-jump stub.
  void *branch_addr=out;
  emit_jmp(0);
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    // Likely branch fell through: skip the delay slot and continue at
    // the next instruction (start+i*4+8), again via stub or direct link.
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
4591
// Assemble the delay slot for the above (pagespan_assemble).
//
// Compiled as a stand-alone block entry registered at vaddr start+1 —
// the set low bit distinguishes this delay-slot entry from the normal
// entry at start.  It assembles instruction 0 as if it were a delay
// slot, then compares the branch target stashed in BTREG against the
// fall-through address start+4: on mismatch it dispatches through
// jump_vaddr, otherwise it falls into the block proper.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1;  // low bit set = delay-slot entry
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this stub so it can be found and invalidated like any block.
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty);
  // If BTREG gets repurposed by instruction 0, spill the branch target
  // to memory first so it can be reloaded below.
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  load_regs(regs[0].regmap_entry,regs[0].regmap,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,INVCP,INVCP);
  is_delayslot=0;
  // Emit instruction 0 according to its decoded type.
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Recover the branch target, reloading it from memory if it was
  // spilled (or never allocated to a register).
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
#ifdef HOST_IMM8
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
#else
  emit_cmpimm(btaddr,start+4);
#endif
  // Target != fall-through: flush registers and dispatch indirectly.
  void *branch = out;
  emit_jeq(0);
  store_regs_bt(regs[0].regmap,regs[0].dirty,-1);
  emit_jmp(jump_vaddr_reg[btaddr]);
  set_jump_target(branch, out);
  // Otherwise continue into the block body at start+4.
  store_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].dirty,start+4);
}
4676
// Basic liveness analysis for MIPS registers
//
// Walks instructions [istart,iend] backwards and fills unneeded_reg[i]
// and gte_unneeded[i] with bitmasks of guest registers (bit n = reg n;
// bit 0 = r0, always set) whose values will never be read again at
// instruction i.  Branches merge liveness from their targets and delay
// slots; backward branches are followed recursively, with 'r' tracking
// the recursion depth (capped so nested loops don't blow up analysis
// time — past the cap, targets are pessimistically marked all-unneeded).
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,gte_u,b,gte_b;
  uint64_t temp_u,temp_gte_u=0;
  // With the GTE hack enabled, all GTE regs start out unneeded.
  uint64_t gte_u_unknown=0;
  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
    gte_u_unknown=~0ll;
  if(iend==slen-1) {
    u=1;
    gte_u=gte_u_unknown;
  }else{
    //u=unneeded_reg[iend+1];
    u=1;
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        gte_u=gte_u_unknown;
        branch_unneeded_reg[i]=u;
        // Merge in delay slot
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
        u|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_u=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            temp_gte_u&=gte_unneeded[i+2];
          }
          // Merge in delay slot
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Account for the branch instruction itself before recursing
          // into the loop body.
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_u|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            unneeded_reg[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            // Merge in delay slot
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            u|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch
            b=unneeded_reg[(ba[i]-start)>>2];
            gte_b=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            // Branch delay slot
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            b|=1;
            gte_b|=gte_rt[i+1];
            gte_b&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              gte_u=gte_b;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                gte_u&=gte_unneeded[i+2];
              }
            } else {
              // A register is unneeded only if it is dead on BOTH the
              // taken and fall-through paths.
              u&=b;
              gte_u&=gte_b;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
            } else {
              branch_unneeded_reg[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
    }
    //u=1; // DEBUG
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    // R0 is always unneeded
    u|=1;
    // Save it
    unneeded_reg[i]=u;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");
    */
  }
}
4867
4868 // Write back dirty registers as soon as we will no longer modify them,
4869 // so that we don't end up with lots of writes at the branches.
void clean_registers(int istart,int iend,int wr)
{
  // Backward dataflow pass over instructions [istart..iend]:
  //   will_dirty[i] - bitmask of host regs certain to be dirtied again
  //                   before the block exits (no point writing them back yet)
  //   wont_dirty[i] - bitmask of host regs that will not be dirtied again
  //                   (safe to write back now)
  // wr!=0: actually update regs[].dirty / regs[].wasdirty (write pass).
  // wr==0: analysis-only pass, used when recursing into loop bodies.
  int i;
  int r;
  u_int will_dirty_i,will_dirty_next,temp_will_dirty;
  u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  // Seed from the instruction following the range, or empty state at block end.
  if(iend==slen-1) {
    will_dirty_i=will_dirty_next=0;
    wont_dirty_i=wont_dirty_next=0;
  }else{
    will_dirty_i=will_dirty_next=will_dirty[iend+1];
    wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
  }
  // Walk backwards so information flows from later instructions to earlier ones.
  for (i=iend;i>=istart;i--)
  {
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
        {
          // Unconditional branch
          will_dirty_i=0;
          wont_dirty_i=0;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
              if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
            }
          }
        }
        else
        {
          // Conditional branch
          will_dirty_i=0;
          wont_dirty_i=wont_dirty_next;
          // Merge in delay slot (will dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if(!likely[i]) {
                // Might not dirty if likely branch is not taken
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          }
        }
        // Merge in delay slot (wont dirty)
        for(r=0;r<HOST_REGS;r++) {
          if(r!=EXCLUDE_REG) {
            if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
            if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
            if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
          }
        }
        if(wr) {
          #ifndef DESTRUCTIVE_WRITEBACK
          branch_regs[i].dirty&=wont_dirty_i;
          #endif
          branch_regs[i].dirty|=will_dirty_i;
        }
      }
      else
      {
        // Internal branch
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_will_dirty=0;
            temp_wont_dirty=0;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
              }
            }
          } else {
            // Conditional branch (not taken case)
            temp_will_dirty=will_dirty_next;
            temp_wont_dirty=wont_dirty_next;
            // Merge in delay slot (will dirty)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Will not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
                  if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (wont dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
            }
          }
          // Deal with changed mappings
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]!=regmap_pre[i][r]) {
                  temp_will_dirty&=~(1<<r);
                  temp_wont_dirty&=~(1<<r);
                  if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
                    temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                    temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
                  } else {
                    temp_will_dirty|=1<<r;
                    temp_wont_dirty|=1<<r;
                  }
                }
              }
            }
          }
          if(wr) {
            will_dirty[i]=temp_will_dirty;
            wont_dirty[i]=temp_wont_dirty;
            // Re-run an analysis-only pass over the loop body so the
            // just-computed state at the branch propagates backwards.
            clean_registers((ba[i]-start)>>2,i-1,0);
          }else{
            // Limit recursion.  It can take an excessive amount
            // of time if there are a lot of nested loops.
            will_dirty[(ba[i]-start)>>2]=0;
            wont_dirty[(ba[i]-start)>>2]=-1;
          }
        }
        // Note: intentionally also executed after the backward-branch
        // handling above, not an exclusive else.
        /*else*/ if(1)
        {
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            will_dirty_i=0;
            wont_dirty_i=0;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                if(branch_regs[i].regmap[r]>=0) {
                  will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
                }
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              }
            }
          } else {
            // Conditional branch
            will_dirty_i=will_dirty_next;
            wont_dirty_i=wont_dirty_next;
          //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                signed char target_reg=branch_regs[i].regmap[r];
                if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else if(target_reg>=0) {
                  will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                  wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
                }
                // Treat delay slot as part of branch too
                /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
                  will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
                  wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
                }
                else
                {
                  will_dirty[i+1]&=~(1<<r);
                }*/
              }
            }
          //}
            // Merge in delay slot
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(!likely[i]) {
                  // Might not dirty if likely branch is not taken
                  if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
                  //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
                  if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
                  if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
                }
              }
            }
          }
          // Merge in delay slot (won't dirty)
          for(r=0;r<HOST_REGS;r++) {
            if(r!=EXCLUDE_REG) {
              if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
              if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
              if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
            }
          }
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            branch_regs[i].dirty&=wont_dirty_i;
            #endif
            branch_regs[i].dirty|=will_dirty_i;
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      will_dirty_i=0;
      wont_dirty_i=0;
    }
    // Snapshot the successor state before merging in this instruction's
    // own writes; the next (earlier) iteration uses it for branches.
    will_dirty_next=will_dirty_i;
    wont_dirty_next=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
        if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
        if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
        if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
        if(i>istart) {
          if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
          {
            // Don't store a register immediately after writing it,
            // may prevent dual-issue.
            if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
            if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
          }
        }
      }
    }
    // Save it
    will_dirty[i]=will_dirty_i;
    wont_dirty[i]=wont_dirty_i;
    // Mark registers that won't be dirtied as not dirty
    if(wr) {
      /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
      for(r=0;r<HOST_REGS;r++) {
        if((will_dirty_i>>r)&1) {
          printf(" r%d",r);
        }
      }
      printf("\n");*/

      //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP)) {
        regs[i].dirty|=will_dirty_i;
        #ifndef DESTRUCTIVE_WRITEBACK
        regs[i].dirty&=wont_dirty_i;
        if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
        {
          // Propagate to the instruction after the delay slot (i+2).
          if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
                  regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        else
        {
          // Non-branch: propagate to the immediately following instruction.
          if(i<iend) {
            for(r=0;r<HOST_REGS;r++) {
              if(r!=EXCLUDE_REG) {
                if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
                  regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
                }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
              }
            }
          }
        }
        #endif
      //}
    }
    // Deal with changed mappings
    temp_will_dirty=will_dirty_i;
    temp_wont_dirty=wont_dirty_i;
    for(r=0;r<HOST_REGS;r++) {
      if(r!=EXCLUDE_REG) {
        int nr;
        if(regs[i].regmap[r]==regmap_pre[i][r]) {
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
          // Register moved to a different register
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
          wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
          if(wr) {
            #ifndef DESTRUCTIVE_WRITEBACK
            regs[i].wasdirty&=wont_dirty_i|~(1<<r);
            #endif
            regs[i].wasdirty|=will_dirty_i&(1<<r);
          }
        }
        else {
          will_dirty_i&=~(1<<r);
          wont_dirty_i&=~(1<<r);
          if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
            will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
            wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          } else {
            wont_dirty_i|=1<<r;
            /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
          }
        }
      }
    }
  }
}
5301
#ifdef DISASM
/* disassembly: print one decoded instruction slot in readable form.
 * A leading '*' marks instructions that are branch targets. */
void disassemble_inst(int i)
{
    unsigned int pc=start+i*4; // emulated address of instruction slot i
    putchar(bt[i]?'*':' ');
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",pc,insn[i],ba[i]);
        break;
      case CJUMP:
        printf (" %x: %s r%d,r%d,%8x\n",pc,insn[i],rs1[i],rs2[i],
                i?pc+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);
        break;
      case SJUMP:
        printf (" %x: %s r%d,%8x\n",pc,insn[i],rs1[i],
                pc+4+((signed int)((unsigned int)source[i]<<16)>>14));
        break;
      case RJUMP:
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",pc,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",pc,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",pc,insn[i],rs1[i],rs2[i],ba[i]);
        break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",pc,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",pc,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",pc,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",pc,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",pc,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",pc,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",pc,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",pc,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",pc,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",pc,insn[i]);
        break;
      case COP0:
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",pc,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",pc,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",pc,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",pc,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",pc,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",pc,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",pc,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",pc,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",pc,insn[i]);
        break;
      case C1LS:
        printf (" %x: %s cpr1[%d],r%d+%x\n",pc,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",pc,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",pc,insn[i]);
        break;
      default:
        printf (" %x: %s\n",pc,insn[i]);
    }
}
#else
static void disassemble_inst(int i) {}
#endif // DISASM
5392
5393 #define DRC_TEST_VAL 0x74657374
5394
5395 static int new_dynarec_test(void)
5396 {
5397   int (*testfunc)(void) = (void *)out;
5398   void *beginning;
5399   int ret;
5400
5401   beginning = start_block();
5402   emit_movimm(DRC_TEST_VAL,0); // test
5403   emit_jmpreg(14);
5404   literal_pool(0);
5405   end_block(beginning);
5406   SysPrintf("testing if we can run recompiled code..\n");
5407   ret = testfunc();
5408   if (ret == DRC_TEST_VAL)
5409     SysPrintf("test passed.\n");
5410   else
5411     SysPrintf("test failed: %08x\n", ret);
5412   out = translation_cache;
5413   return ret == DRC_TEST_VAL;
5414 }
5415
5416 // clear the state completely, instead of just marking
5417 // things invalid like invalidate_all_pages() does
5418 void new_dynarec_clear_full()
5419 {
5420   int n;
5421   out = translation_cache;
5422   memset(invalid_code,1,sizeof(invalid_code));
5423   memset(hash_table,0xff,sizeof(hash_table));
5424   memset(mini_ht,-1,sizeof(mini_ht));
5425   memset(restore_candidate,0,sizeof(restore_candidate));
5426   memset(shadow,0,sizeof(shadow));
5427   copy=shadow;
5428   expirep=16384; // Expiry pointer, +2 blocks
5429   pending_exception=0;
5430   literalcount=0;
5431   stop_after_jal=0;
5432   inv_code_start=inv_code_end=~0;
5433   // TLB
5434   for(n=0;n<4096;n++) ll_clear(jump_in+n);
5435   for(n=0;n<4096;n++) ll_clear(jump_out+n);
5436   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
5437 }
5438
// One-time dynarec setup: obtain an executable translation cache
// (strategy depends on build flags/platform), reset all recompiler
// state, and smoke-test that emitted code can actually run.
void new_dynarec_init()
{
  SysPrintf("Init new dynarec\n");

  // allocate/prepare a buffer for translation cache
  // see assem_arm.h for some explanation
#if   defined(BASE_ADDR_FIXED)
  // Map anonymous RWX memory at the compile-time fixed address.
  // NOTE(review): no MAP_FIXED here, so the kernel may place the mapping
  // elsewhere; that case is detected by the != check and treated as fatal.
  if (mmap(translation_cache, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0) != translation_cache) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
    abort();
  }
#elif defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  // Vita: allocate a kernel VM block for code instead of mmap.
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  // Let the kernel choose where to place the RWX cache.
  translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (translation_cache == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(translation_cache, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = translation_cache;
  // NOTE(review): presumably hundredths, i.e. 200 == 2.00 emulated cycles
  // per instruction — confirm against the consumers of cycle_multiplier.
  cycle_multiplier=200;
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // Offset from emulated address 0x80000000 to host rdram.
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
5493
// Tear down what new_dynarec_init() set up: release the translation
// cache mapping (where one was created) and free all block lists.
void new_dynarec_cleanup()
{
  int n;
#if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
  #ifdef VITA
  sceKernelFreeMemBlock(sceBlock);
  sceBlock = -1;
  #else
  if (munmap(translation_cache, 1<<TARGET_SIZE_2) < 0)
    SysPrintf("munmap() failed\n");
  #endif
#endif
  // Free the per-page linked lists of compiled/outgoing/dirty blocks.
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
  #endif
}
5513
5514 static u_int *get_source_start(u_int addr, u_int *limit)
5515 {
5516   if (addr < 0x00200000 ||
5517     (0xa0000000 <= addr && addr < 0xa0200000)) {
5518     // used for BIOS calls mostly?
5519     *limit = (addr&0xa0000000)|0x00200000;
5520     return (u_int *)(rdram + (addr&0x1fffff));
5521   }
5522   else if (!Config.HLE && (
5523     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
5524     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
5525     // BIOS
5526     *limit = (addr & 0xfff00000) | 0x80000;
5527     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
5528   }
5529   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
5530     *limit = (addr & 0x80600000) + 0x00200000;
5531     return (u_int *)(rdram + (addr&0x1fffff));
5532   }
5533   return NULL;
5534 }
5535
5536 static u_int scan_for_ret(u_int addr)
5537 {
5538   u_int limit = 0;
5539   u_int *mem;
5540
5541   mem = get_source_start(addr, &limit);
5542   if (mem == NULL)
5543     return addr;
5544
5545   if (limit > addr + 0x1000)
5546     limit = addr + 0x1000;
5547   for (; addr < limit; addr += 4, mem++) {
5548     if (*mem == 0x03e00008) // jr $ra
5549       return addr + 8;
5550   }
5551   return addr;
5552 }
5553
// One savestate record per compiled block: its emulated start address
// and the speculative register flags captured for it.
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

// qsort comparator ordering savestate blocks by ascending address.
// A plain `p1->addr - p2->addr` is wrong here: the unsigned difference
// truncated to int flips sign once the addresses are more than INT_MAX
// apart (e.g. a 0xbfc00000 BIOS block vs a low-RAM mirror block), so
// use an explicit three-way comparison instead.
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
5564
5565 int new_dynarec_save_blocks(void *save, int size)
5566 {
5567   struct savestate_block *blocks = save;
5568   int maxcount = size / sizeof(blocks[0]);
5569   struct savestate_block tmp_blocks[1024];
5570   struct ll_entry *head;
5571   int p, s, d, o, bcnt;
5572   u_int addr;
5573
5574   o = 0;
5575   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
5576     bcnt = 0;
5577     for (head = jump_in[p]; head != NULL; head = head->next) {
5578       tmp_blocks[bcnt].addr = head->vaddr;
5579       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
5580       bcnt++;
5581     }
5582     if (bcnt < 1)
5583       continue;
5584     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
5585
5586     addr = tmp_blocks[0].addr;
5587     for (s = d = 0; s < bcnt; s++) {
5588       if (tmp_blocks[s].addr < addr)
5589         continue;
5590       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
5591         tmp_blocks[d++] = tmp_blocks[s];
5592       addr = scan_for_ret(tmp_blocks[s].addr);
5593     }
5594
5595     if (o + d > maxcount)
5596       d = maxcount - o;
5597     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
5598     o += d;
5599   }
5600
5601   return o * sizeof(blocks[0]);
5602 }
5603
5604 void new_dynarec_load_blocks(const void *save, int size)
5605 {
5606   const struct savestate_block *blocks = save;
5607   int count = size / sizeof(blocks[0]);
5608   u_int regs_save[32];
5609   uint32_t f;
5610   int i, b;
5611
5612   get_addr(psxRegs.pc);
5613
5614   // change GPRs for speculation to at least partially work..
5615   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
5616   for (i = 1; i < 32; i++)
5617     psxRegs.GPR.r[i] = 0x80000000;
5618
5619   for (b = 0; b < count; b++) {
5620     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
5621       if (f & 1)
5622         psxRegs.GPR.r[i] = 0x1f800000;
5623     }
5624
5625     get_addr(blocks[b].addr);
5626
5627     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
5628       if (f & 1)
5629         psxRegs.GPR.r[i] = 0x80000000;
5630     }
5631   }
5632
5633   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
5634 }
5635
5636 int new_recompile_block(int addr)
5637 {
5638   u_int pagelimit = 0;
5639   u_int state_rflags = 0;
5640   int i;
5641
5642   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
5643   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
5644   //if(debug)
5645   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
5646
5647   // this is just for speculation
5648   for (i = 1; i < 32; i++) {
5649     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
5650       state_rflags |= 1 << i;
5651   }
5652
5653   start = (u_int)addr&~3;
5654   //assert(((u_int)addr&1)==0);
5655   new_dynarec_did_compile=1;
5656   if (Config.HLE && start == 0x80001000) // hlecall
5657   {
5658     // XXX: is this enough? Maybe check hleSoftCall?
5659     void *beginning=start_block();
5660     u_int page=get_page(start);
5661
5662     invalid_code[start>>12]=0;
5663     emit_movimm(start,0);
5664     emit_writeword(0,&pcaddr);
5665     emit_jmp(new_dyna_leave);
5666     literal_pool(0);
5667     end_block(beginning);
5668     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
5669     return 0;
5670   }
5671
5672   source = get_source_start(start, &pagelimit);
5673   if (source == NULL) {
5674     SysPrintf("Compile at bogus memory address: %08x\n", addr);
5675     exit(1);
5676   }
5677
5678   /* Pass 1: disassemble */
5679   /* Pass 2: register dependencies, branch targets */
5680   /* Pass 3: register allocation */
5681   /* Pass 4: branch dependencies */
5682   /* Pass 5: pre-alloc */
5683   /* Pass 6: optimize clean/dirty state */
5684   /* Pass 7: flag 32-bit registers */
5685   /* Pass 8: assembly */
5686   /* Pass 9: linker */
5687   /* Pass 10: garbage collection / free memory */
5688
5689   int j;
5690   int done=0;
5691   unsigned int type,op,op2;
5692
5693   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
5694
5695   /* Pass 1 disassembly */
5696
5697   for(i=0;!done;i++) {
5698     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
5699     minimum_free_regs[i]=0;
5700     opcode[i]=op=source[i]>>26;
5701     switch(op)
5702     {
5703       case 0x00: strcpy(insn[i],"special"); type=NI;
5704         op2=source[i]&0x3f;
5705         switch(op2)
5706         {
5707           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
5708           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
5709           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
5710           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
5711           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
5712           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
5713           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
5714           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
5715           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
5716           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
5717           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
5718           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
5719           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
5720           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
5721           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
5722           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
5723           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
5724           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
5725           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
5726           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
5727           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
5728           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
5729           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
5730           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
5731           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
5732           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
5733           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
5734           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
5735           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
5736           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
5737           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
5738           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
5739           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
5740           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
5741           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
5742 #if 0
5743           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
5744           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
5745           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
5746           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
5747           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
5748           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
5749           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
5750           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
5751           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
5752           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
5753           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
5754           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
5755           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
5756           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
5757           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
5758           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
5759           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
5760 #endif
5761         }
5762         break;
5763       case 0x01: strcpy(insn[i],"regimm"); type=NI;
5764         op2=(source[i]>>16)&0x1f;
5765         switch(op2)
5766         {
5767           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
5768           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
5769           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
5770           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
5771           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
5772           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
5773           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
5774           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
5775           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
5776           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
5777           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
5778           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
5779           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
5780           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
5781         }
5782         break;
5783       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
5784       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
5785       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
5786       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
5787       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
5788       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
5789       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
5790       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
5791       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
5792       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
5793       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
5794       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
5795       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
5796       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
5797       case 0x10: strcpy(insn[i],"cop0"); type=NI;
5798         op2=(source[i]>>21)&0x1f;
5799         switch(op2)
5800         {
5801           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
5802           case 0x02: strcpy(insn[i],"CFC0"); type=COP0; break;
5803           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
5804           case 0x06: strcpy(insn[i],"CTC0"); type=COP0; break;
5805           case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
5806         }
5807         break;
5808       case 0x11: strcpy(insn[i],"cop1"); type=COP1;
5809         op2=(source[i]>>21)&0x1f;
5810         break;
5811 #if 0
5812       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
5813       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
5814       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
5815       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
5816       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
5817       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
5818       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
5819       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
5820 #endif
5821       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
5822       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
5823       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
5824       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
5825       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
5826       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
5827       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
5828 #if 0
5829       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
5830 #endif
5831       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
5832       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
5833       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
5834       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
5835 #if 0
5836       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
5837       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
5838 #endif
5839       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
5840       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
5841       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
5842       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
5843 #if 0
5844       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
5845       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
5846       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
5847 #endif
5848       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
5849       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
5850 #if 0
5851       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
5852       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
5853       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
5854 #endif
5855       case 0x12: strcpy(insn[i],"COP2"); type=NI;
5856         op2=(source[i]>>21)&0x1f;
5857         //if (op2 & 0x10) {
5858         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
5859           if (gte_handlers[source[i]&0x3f]!=NULL) {
5860             if (gte_regnames[source[i]&0x3f]!=NULL)
5861               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
5862             else
5863               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
5864             type=C2OP;
5865           }
5866         }
5867         else switch(op2)
5868         {
5869           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
5870           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
5871           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
5872           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
5873         }
5874         break;
5875       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
5876       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
5877       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
5878       default: strcpy(insn[i],"???"); type=NI;
5879         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
5880         break;
5881     }
5882     itype[i]=type;
5883     opcode2[i]=op2;
5884     /* Get registers/immediates */
5885     lt1[i]=0;
5886     us1[i]=0;
5887     us2[i]=0;
5888     dep1[i]=0;
5889     dep2[i]=0;
5890     gte_rs[i]=gte_rt[i]=0;
5891     switch(type) {
5892       case LOAD:
5893         rs1[i]=(source[i]>>21)&0x1f;
5894         rs2[i]=0;
5895         rt1[i]=(source[i]>>16)&0x1f;
5896         rt2[i]=0;
5897         imm[i]=(short)source[i];
5898         break;
5899       case STORE:
5900       case STORELR:
5901         rs1[i]=(source[i]>>21)&0x1f;
5902         rs2[i]=(source[i]>>16)&0x1f;
5903         rt1[i]=0;
5904         rt2[i]=0;
5905         imm[i]=(short)source[i];
5906         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
5907         break;
5908       case LOADLR:
5909         // LWL/LWR only load part of the register,
5910         // therefore the target register must be treated as a source too
5911         rs1[i]=(source[i]>>21)&0x1f;
5912         rs2[i]=(source[i]>>16)&0x1f;
5913         rt1[i]=(source[i]>>16)&0x1f;
5914         rt2[i]=0;
5915         imm[i]=(short)source[i];
5916         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
5917         if(op==0x26) dep1[i]=rt1[i]; // LWR
5918         break;
5919       case IMM16:
5920         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
5921         else rs1[i]=(source[i]>>21)&0x1f;
5922         rs2[i]=0;
5923         rt1[i]=(source[i]>>16)&0x1f;
5924         rt2[i]=0;
5925         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
5926           imm[i]=(unsigned short)source[i];
5927         }else{
5928           imm[i]=(short)source[i];
5929         }
5930         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
5931         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
5932         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
5933         break;
5934       case UJUMP:
5935         rs1[i]=0;
5936         rs2[i]=0;
5937         rt1[i]=0;
5938         rt2[i]=0;
5939         // The JAL instruction writes to r31.
5940         if (op&1) {
5941           rt1[i]=31;
5942         }
5943         rs2[i]=CCREG;
5944         break;
5945       case RJUMP:
5946         rs1[i]=(source[i]>>21)&0x1f;
5947         rs2[i]=0;
5948         rt1[i]=0;
5949         rt2[i]=0;
5950         // The JALR instruction writes to rd.
5951         if (op2&1) {
5952           rt1[i]=(source[i]>>11)&0x1f;
5953         }
5954         rs2[i]=CCREG;
5955         break;
5956       case CJUMP:
5957         rs1[i]=(source[i]>>21)&0x1f;
5958         rs2[i]=(source[i]>>16)&0x1f;
5959         rt1[i]=0;
5960         rt2[i]=0;
5961         if(op&2) { // BGTZ/BLEZ
5962           rs2[i]=0;
5963         }
5964         us1[i]=rs1[i];
5965         us2[i]=rs2[i];
5966         likely[i]=op>>4;
5967         break;
5968       case SJUMP:
5969         rs1[i]=(source[i]>>21)&0x1f;
5970         rs2[i]=CCREG;
5971         rt1[i]=0;
5972         rt2[i]=0;
5973         us1[i]=rs1[i];
5974         if(op2&0x10) { // BxxAL
5975           rt1[i]=31;
5976           // NOTE: If the branch is not taken, r31 is still overwritten
5977         }
5978         likely[i]=(op2&2)>>1;
5979         break;
5980       case ALU:
5981         rs1[i]=(source[i]>>21)&0x1f; // source
5982         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
5983         rt1[i]=(source[i]>>11)&0x1f; // destination
5984         rt2[i]=0;
5985         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
5986           us1[i]=rs1[i];us2[i]=rs2[i];
5987         }
5988         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
5989           dep1[i]=rs1[i];dep2[i]=rs2[i];
5990         }
5991         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
5992           dep1[i]=rs1[i];dep2[i]=rs2[i];
5993         }
5994         break;
5995       case MULTDIV:
5996         rs1[i]=(source[i]>>21)&0x1f; // source
5997         rs2[i]=(source[i]>>16)&0x1f; // divisor
5998         rt1[i]=HIREG;
5999         rt2[i]=LOREG;
6000         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6001           us1[i]=rs1[i];us2[i]=rs2[i];
6002         }
6003         break;
6004       case MOV:
6005         rs1[i]=0;
6006         rs2[i]=0;
6007         rt1[i]=0;
6008         rt2[i]=0;
6009         if(op2==0x10) rs1[i]=HIREG; // MFHI
6010         if(op2==0x11) rt1[i]=HIREG; // MTHI
6011         if(op2==0x12) rs1[i]=LOREG; // MFLO
6012         if(op2==0x13) rt1[i]=LOREG; // MTLO
6013         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
6014         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
6015         dep1[i]=rs1[i];
6016         break;
6017       case SHIFT:
6018         rs1[i]=(source[i]>>16)&0x1f; // target of shift
6019         rs2[i]=(source[i]>>21)&0x1f; // shift amount
6020         rt1[i]=(source[i]>>11)&0x1f; // destination
6021         rt2[i]=0;
6022         // DSLLV/DSRLV/DSRAV are 64-bit
6023         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
6024         break;
6025       case SHIFTIMM:
6026         rs1[i]=(source[i]>>16)&0x1f;
6027         rs2[i]=0;
6028         rt1[i]=(source[i]>>11)&0x1f;
6029         rt2[i]=0;
6030         imm[i]=(source[i]>>6)&0x1f;
6031         // DSxx32 instructions
6032         if(op2>=0x3c) imm[i]|=0x20;
6033         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
6034         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
6035         break;
6036       case COP0:
6037         rs1[i]=0;
6038         rs2[i]=0;
6039         rt1[i]=0;
6040         rt2[i]=0;
6041         if(op2==0||op2==2) rt1[i]=(source[i]>>16)&0x1F; // MFC0/CFC0
6042         if(op2==4||op2==6) rs1[i]=(source[i]>>16)&0x1F; // MTC0/CTC0
6043         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
6044         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
6045         break;
6046       case COP1:
6047         rs1[i]=0;
6048         rs2[i]=0;
6049         rt1[i]=0;
6050         rt2[i]=0;
6051         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
6052         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
6053         if(op2==5) us1[i]=rs1[i]; // DMTC1
6054         rs2[i]=CSREG;
6055         break;
6056       case COP2:
6057         rs1[i]=0;
6058         rs2[i]=0;
6059         rt1[i]=0;
6060         rt2[i]=0;
6061         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
6062         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
6063         rs2[i]=CSREG;
6064         int gr=(source[i]>>11)&0x1F;
6065         switch(op2)
6066         {
6067           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
6068           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
6069           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
6070           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
6071         }
6072         break;
6073       case C1LS:
6074         rs1[i]=(source[i]>>21)&0x1F;
6075         rs2[i]=CSREG;
6076         rt1[i]=0;
6077         rt2[i]=0;
6078         imm[i]=(short)source[i];
6079         break;
6080       case C2LS:
6081         rs1[i]=(source[i]>>21)&0x1F;
6082         rs2[i]=0;
6083         rt1[i]=0;
6084         rt2[i]=0;
6085         imm[i]=(short)source[i];
6086         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
6087         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
6088         break;
6089       case C2OP:
6090         rs1[i]=0;
6091         rs2[i]=0;
6092         rt1[i]=0;
6093         rt2[i]=0;
6094         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
6095         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
6096         gte_rt[i]|=1ll<<63; // every op changes flags
6097         if((source[i]&0x3f)==GTE_MVMVA) {
6098           int v = (source[i] >> 15) & 3;
6099           gte_rs[i]&=~0xe3fll;
6100           if(v==3) gte_rs[i]|=0xe00ll;
6101           else gte_rs[i]|=3ll<<(v*2);
6102         }
6103         break;
6104       case SYSCALL:
6105       case HLECALL:
6106       case INTCALL:
6107         rs1[i]=CCREG;
6108         rs2[i]=0;
6109         rt1[i]=0;
6110         rt2[i]=0;
6111         break;
6112       default:
6113         rs1[i]=0;
6114         rs2[i]=0;
6115         rt1[i]=0;
6116         rt2[i]=0;
6117     }
6118     /* Calculate branch target addresses */
6119     if(type==UJUMP)
6120       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
6121     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
6122       ba[i]=start+i*4+8; // Ignore never taken branch
6123     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
6124       ba[i]=start+i*4+8; // Ignore never taken branch
6125     else if(type==CJUMP||type==SJUMP)
6126       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
6127     else ba[i]=-1;
6128     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)) {
6129       int do_in_intrp=0;
6130       // branch in delay slot?
6131       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP) {
6132         // don't handle first branch and call interpreter if it's hit
6133         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
6134         do_in_intrp=1;
6135       }
6136       // basic load delay detection
6137       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
6138         int t=(ba[i-1]-start)/4;
6139         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
6140           // jump target wants DS result - potential load delay effect
6141           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
6142           do_in_intrp=1;
6143           bt[t+1]=1; // expected return from interpreter
6144         }
6145         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
6146               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
6147           // v0 overwrite like this is a sign of trouble, bail out
6148           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
6149           do_in_intrp=1;
6150         }
6151       }
6152       if(do_in_intrp) {
6153         rs1[i-1]=CCREG;
6154         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
6155         ba[i-1]=-1;
6156         itype[i-1]=INTCALL;
6157         done=2;
6158         i--; // don't compile the DS
6159       }
6160     }
6161     /* Is this the end of the block? */
6162     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
6163       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
6164         done=2;
6165       }
6166       else {
6167         if(stop_after_jal) done=1;
6168         // Stop on BREAK
6169         if((source[i+1]&0xfc00003f)==0x0d) done=1;
6170       }
6171       // Don't recompile stuff that's already compiled
6172       if(check_addr(start+i*4+4)) done=1;
6173       // Don't get too close to the limit
6174       if(i>MAXBLOCK/2) done=1;
6175     }
6176     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
6177     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
6178     if(done==2) {
6179       // Does the block continue due to a branch?
6180       for(j=i-1;j>=0;j--)
6181       {
6182         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
6183         if(ba[j]==start+i*4+4) done=j=0;
6184         if(ba[j]==start+i*4+8) done=j=0;
6185       }
6186     }
6187     //assert(i<MAXBLOCK-1);
6188     if(start+i*4==pagelimit-4) done=1;
6189     assert(start+i*4<pagelimit);
6190     if (i==MAXBLOCK-1) done=1;
6191     // Stop if we're compiling junk
6192     if(itype[i]==NI&&opcode[i]==0x11) {
6193       done=stop_after_jal=1;
6194       SysPrintf("Disabled speculative precompilation\n");
6195     }
6196   }
6197   slen=i;
6198   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP) {
6199     if(start+i*4==pagelimit) {
6200       itype[i-1]=SPAN;
6201     }
6202   }
6203   assert(slen>0);
6204
6205   /* Pass 2 - Register dependencies and branch targets */
6206
6207   unneeded_registers(0,slen-1,0);
6208
6209   /* Pass 3 - Register allocation */
6210
6211   struct regstat current; // Current register allocations/status
6212   current.dirty=0;
6213   current.u=unneeded_reg[0];
6214   clear_all_regs(current.regmap);
6215   alloc_reg(&current,0,CCREG);
6216   dirty_reg(&current,CCREG);
6217   current.isconst=0;
6218   current.wasconst=0;
6219   current.waswritten=0;
6220   int ds=0;
6221   int cc=0;
6222   int hr=-1;
6223
6224   if((u_int)addr&1) {
6225     // First instruction is delay slot
6226     cc=-1;
6227     bt[1]=1;
6228     ds=1;
6229     unneeded_reg[0]=1;
6230     current.regmap[HOST_BTREG]=BTREG;
6231   }
6232
6233   for(i=0;i<slen;i++)
6234   {
6235     if(bt[i])
6236     {
6237       int hr;
6238       for(hr=0;hr<HOST_REGS;hr++)
6239       {
6240         // Is this really necessary?
6241         if(current.regmap[hr]==0) current.regmap[hr]=-1;
6242       }
6243       current.isconst=0;
6244       current.waswritten=0;
6245     }
6246     if(i>1)
6247     {
6248       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6249       {
6250         if(rs1[i-2]==0||rs2[i-2]==0)
6251         {
6252           if(rs1[i-2]) {
6253             int hr=get_reg(current.regmap,rs1[i-2]|64);
6254             if(hr>=0) current.regmap[hr]=-1;
6255           }
6256           if(rs2[i-2]) {
6257             int hr=get_reg(current.regmap,rs2[i-2]|64);
6258             if(hr>=0) current.regmap[hr]=-1;
6259           }
6260         }
6261       }
6262     }
6263
6264     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
6265     regs[i].wasconst=current.isconst;
6266     regs[i].wasdirty=current.dirty;
6267     regs[i].loadedconst=0;
6268     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP) {
6269       if(i+1<slen) {
6270         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6271         current.u|=1;
6272       } else {
6273         current.u=1;
6274       }
6275     } else {
6276       if(i+1<slen) {
6277         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6278         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6279         current.u|=1;
6280       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
6281     }
6282     is_ds[i]=ds;
6283     if(ds) {
6284       ds=0; // Skip delay slot, already allocated as part of branch
6285       // ...but we need to alloc it in case something jumps here
6286       if(i+1<slen) {
6287         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
6288       }else{
6289         current.u=branch_unneeded_reg[i-1];
6290       }
6291       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6292       current.u|=1;
6293       struct regstat temp;
6294       memcpy(&temp,&current,sizeof(current));
6295       temp.wasdirty=temp.dirty;
6296       // TODO: Take into account unconditional branches, as below
6297       delayslot_alloc(&temp,i);
6298       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
6299       regs[i].wasdirty=temp.wasdirty;
6300       regs[i].dirty=temp.dirty;
6301       regs[i].isconst=0;
6302       regs[i].wasconst=0;
6303       current.isconst=0;
6304       // Create entry (branch target) regmap
6305       for(hr=0;hr<HOST_REGS;hr++)
6306       {
6307         int r=temp.regmap[hr];
6308         if(r>=0) {
6309           if(r!=regmap_pre[i][hr]) {
6310             regs[i].regmap_entry[hr]=-1;
6311           }
6312           else
6313           {
6314             if(r<64){
6315               if((current.u>>r)&1) {
6316                 regs[i].regmap_entry[hr]=-1;
6317                 regs[i].regmap[hr]=-1;
6318                 //Don't clear regs in the delay slot as the branch might need them
6319                 //current.regmap[hr]=-1;
6320               }else
6321                 regs[i].regmap_entry[hr]=r;
6322             }
6323             else {
6324               assert(0);
6325             }
6326           }
6327         } else {
6328           // First instruction expects CCREG to be allocated
6329           if(i==0&&hr==HOST_CCREG)
6330             regs[i].regmap_entry[hr]=CCREG;
6331           else
6332             regs[i].regmap_entry[hr]=-1;
6333         }
6334       }
6335     }
6336     else { // Not delay slot
6337       switch(itype[i]) {
6338         case UJUMP:
6339           //current.isconst=0; // DEBUG
6340           //current.wasconst=0; // DEBUG
6341           //regs[i].wasconst=0; // DEBUG
6342           clear_const(&current,rt1[i]);
6343           alloc_cc(&current,i);
6344           dirty_reg(&current,CCREG);
6345           if (rt1[i]==31) {
6346             alloc_reg(&current,i,31);
6347             dirty_reg(&current,31);
6348             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
6349             //assert(rt1[i+1]!=rt1[i]);
6350             #ifdef REG_PREFETCH
6351             alloc_reg(&current,i,PTEMP);
6352             #endif
6353           }
6354           ooo[i]=1;
6355           delayslot_alloc(&current,i+1);
6356           //current.isconst=0; // DEBUG
6357           ds=1;
6358           //printf("i=%d, isconst=%x\n",i,current.isconst);
6359           break;
6360         case RJUMP:
6361           //current.isconst=0;
6362           //current.wasconst=0;
6363           //regs[i].wasconst=0;
6364           clear_const(&current,rs1[i]);
6365           clear_const(&current,rt1[i]);
6366           alloc_cc(&current,i);
6367           dirty_reg(&current,CCREG);
6368           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
6369             alloc_reg(&current,i,rs1[i]);
6370             if (rt1[i]!=0) {
6371               alloc_reg(&current,i,rt1[i]);
6372               dirty_reg(&current,rt1[i]);
6373               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
6374               assert(rt1[i+1]!=rt1[i]);
6375               #ifdef REG_PREFETCH
6376               alloc_reg(&current,i,PTEMP);
6377               #endif
6378             }
6379             #ifdef USE_MINI_HT
6380             if(rs1[i]==31) { // JALR
6381               alloc_reg(&current,i,RHASH);
6382               alloc_reg(&current,i,RHTBL);
6383             }
6384             #endif
6385             delayslot_alloc(&current,i+1);
6386           } else {
6387             // The delay slot overwrites our source register,
6388             // allocate a temporary register to hold the old value.
6389             current.isconst=0;
6390             current.wasconst=0;
6391             regs[i].wasconst=0;
6392             delayslot_alloc(&current,i+1);
6393             current.isconst=0;
6394             alloc_reg(&current,i,RTEMP);
6395           }
6396           //current.isconst=0; // DEBUG
6397           ooo[i]=1;
6398           ds=1;
6399           break;
6400         case CJUMP:
6401           //current.isconst=0;
6402           //current.wasconst=0;
6403           //regs[i].wasconst=0;
6404           clear_const(&current,rs1[i]);
6405           clear_const(&current,rs2[i]);
6406           if((opcode[i]&0x3E)==4) // BEQ/BNE
6407           {
6408             alloc_cc(&current,i);
6409             dirty_reg(&current,CCREG);
6410             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6411             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
6412             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
6413                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
6414               // The delay slot overwrites one of our conditions.
6415               // Allocate the branch condition registers instead.
6416               current.isconst=0;
6417               current.wasconst=0;
6418               regs[i].wasconst=0;
6419               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6420               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
6421             }
6422             else
6423             {
6424               ooo[i]=1;
6425               delayslot_alloc(&current,i+1);
6426             }
6427           }
6428           else
6429           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
6430           {
6431             alloc_cc(&current,i);
6432             dirty_reg(&current,CCREG);
6433             alloc_reg(&current,i,rs1[i]);
6434             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
6435               // The delay slot overwrites one of our conditions.
6436               // Allocate the branch condition registers instead.
6437               current.isconst=0;
6438               current.wasconst=0;
6439               regs[i].wasconst=0;
6440               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6441             }
6442             else
6443             {
6444               ooo[i]=1;
6445               delayslot_alloc(&current,i+1);
6446             }
6447           }
6448           else
6449           // Don't alloc the delay slot yet because we might not execute it
6450           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
6451           {
6452             current.isconst=0;
6453             current.wasconst=0;
6454             regs[i].wasconst=0;
6455             alloc_cc(&current,i);
6456             dirty_reg(&current,CCREG);
6457             alloc_reg(&current,i,rs1[i]);
6458             alloc_reg(&current,i,rs2[i]);
6459           }
6460           else
6461           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
6462           {
6463             current.isconst=0;
6464             current.wasconst=0;
6465             regs[i].wasconst=0;
6466             alloc_cc(&current,i);
6467             dirty_reg(&current,CCREG);
6468             alloc_reg(&current,i,rs1[i]);
6469           }
6470           ds=1;
6471           //current.isconst=0;
6472           break;
6473         case SJUMP:
6474           //current.isconst=0;
6475           //current.wasconst=0;
6476           //regs[i].wasconst=0;
6477           clear_const(&current,rs1[i]);
6478           clear_const(&current,rt1[i]);
6479           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
6480           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
6481           {
6482             alloc_cc(&current,i);
6483             dirty_reg(&current,CCREG);
6484             alloc_reg(&current,i,rs1[i]);
6485             if (rt1[i]==31) { // BLTZAL/BGEZAL
6486               alloc_reg(&current,i,31);
6487               dirty_reg(&current,31);
6488               //#ifdef REG_PREFETCH
6489               //alloc_reg(&current,i,PTEMP);
6490               //#endif
6491             }
6492             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
6493                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
6494               // Allocate the branch condition registers instead.
6495               current.isconst=0;
6496               current.wasconst=0;
6497               regs[i].wasconst=0;
6498               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
6499             }
6500             else
6501             {
6502               ooo[i]=1;
6503               delayslot_alloc(&current,i+1);
6504             }
6505           }
6506           else
6507           // Don't alloc the delay slot yet because we might not execute it
6508           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
6509           {
6510             current.isconst=0;
6511             current.wasconst=0;
6512             regs[i].wasconst=0;
6513             alloc_cc(&current,i);
6514             dirty_reg(&current,CCREG);
6515             alloc_reg(&current,i,rs1[i]);
6516           }
6517           ds=1;
6518           //current.isconst=0;
6519           break;
6520         case IMM16:
6521           imm16_alloc(&current,i);
6522           break;
6523         case LOAD:
6524         case LOADLR:
6525           load_alloc(&current,i);
6526           break;
6527         case STORE:
6528         case STORELR:
6529           store_alloc(&current,i);
6530           break;
6531         case ALU:
6532           alu_alloc(&current,i);
6533           break;
6534         case SHIFT:
6535           shift_alloc(&current,i);
6536           break;
6537         case MULTDIV:
6538           multdiv_alloc(&current,i);
6539           break;
6540         case SHIFTIMM:
6541           shiftimm_alloc(&current,i);
6542           break;
6543         case MOV:
6544           mov_alloc(&current,i);
6545           break;
6546         case COP0:
6547           cop0_alloc(&current,i);
6548           break;
6549         case COP1:
6550         case COP2:
6551           cop12_alloc(&current,i);
6552           break;
6553         case C1LS:
6554           c1ls_alloc(&current,i);
6555           break;
6556         case C2LS:
6557           c2ls_alloc(&current,i);
6558           break;
6559         case C2OP:
6560           c2op_alloc(&current,i);
6561           break;
6562         case SYSCALL:
6563         case HLECALL:
6564         case INTCALL:
6565           syscall_alloc(&current,i);
6566           break;
6567         case SPAN:
6568           pagespan_alloc(&current,i);
6569           break;
6570       }
6571
6572       // Create entry (branch target) regmap
      //
      // For every host register, decide which guest register (if any) must
      // already be present in it on *entry* to this instruction.  This is
      // what an incoming branch has to match (regs[i].regmap_entry), as
      // opposed to regs[i].regmap, which is the mapping after allocation.
6573       for(hr=0;hr<HOST_REGS;hr++)
6574       {
6575         int r,or;
6576         r=current.regmap[hr];
6577         if(r>=0) {
6578           if(r!=regmap_pre[i][hr]) {
6579             // TODO: delay slot (?)
6580             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
            // Newly allocated here, or a temporary: nothing is required in
            // this host register on entry.
6581             if(or<0||(r&63)>=TEMPREG){
6582               regs[i].regmap_entry[hr]=-1;
6583             }
6584             else
6585             {
6586               // Just move it to a different register
6587               regs[i].regmap_entry[hr]=r;
6588               // If it was dirty before, it's still dirty
6589               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
6590             }
6591           }
6592           else
6593           {
6594             // Unneeded
6595             if(r==0){
6596               regs[i].regmap_entry[hr]=0;
6597             }
6598             else
6599             if(r<64){
              // If the guest register is unneeded from this point on
              // (current.u bitmap), drop the mapping entirely; otherwise it
              // must already be loaded on entry.
6600               if((current.u>>r)&1) {
6601                 regs[i].regmap_entry[hr]=-1;
6602                 //regs[i].regmap[hr]=-1;
6603                 current.regmap[hr]=-1;
6604               }else
6605                 regs[i].regmap_entry[hr]=r;
6606             }
            // r>=64 should no longer occur here -- presumably dead since the
            // 64-bit (n64) register-half support was removed; the assert
            // guards that assumption.
6607             else {
6608               assert(0);
6609             }
6610           }
6611         } else {
6612           // Branches expect CCREG to be allocated at the target
6613           if(regmap_pre[i][hr]==CCREG)
6614             regs[i].regmap_entry[hr]=CCREG;
6615           else
6616             regs[i].regmap_entry[hr]=-1;
6617         }
6618       }
      // Snapshot the working allocation as this instruction's final regmap.
6619       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
6620     }
6621
    // waswritten tracks guest registers recently used as the base of a store
    // (STORE/STORELR/SWC2) with a small (<0x800) immediate offset.  A bit is
    // cleared when the register is overwritten, or when it is used as a base
    // with a large offset.  NOTE(review): presumably consumed later to skip
    // redundant self-modifying-code invalidation work - confirm at use site.
6622     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
6623       current.waswritten|=1<<rs1[i-1];
6624     current.waswritten&=~(1<<rt1[i]);
6625     current.waswritten&=~(1<<rt2[i]);
6626     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
6627       current.waswritten&=~(1<<rs1[i]);
6628
6629     /* Branch post-alloc */
6630     if(i>0)
6631     {
6632       current.wasdirty=current.dirty;
6633       switch(itype[i-1]) {
6634         case UJUMP:
6635           memcpy(&branch_regs[i-1],&current,sizeof(current));
6636           branch_regs[i-1].isconst=0;
6637           branch_regs[i-1].wasconst=0;
6638           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
6639           alloc_cc(&branch_regs[i-1],i-1);
6640           dirty_reg(&branch_regs[i-1],CCREG);
6641           if(rt1[i-1]==31) { // JAL
6642             alloc_reg(&branch_regs[i-1],i-1,31);
6643             dirty_reg(&branch_regs[i-1],31);
6644           }
6645           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6646           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
6647           break;
6648         case RJUMP:
6649           memcpy(&branch_regs[i-1],&current,sizeof(current));
6650           branch_regs[i-1].isconst=0;
6651           branch_regs[i-1].wasconst=0;
6652           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
6653           alloc_cc(&branch_regs[i-1],i-1);
6654           dirty_reg(&branch_regs[i-1],CCREG);
6655           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
6656           if(rt1[i-1]!=0) { // JALR
6657             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
6658             dirty_reg(&branch_regs[i-1],rt1[i-1]);
6659           }
6660           #ifdef USE_MINI_HT
6661           if(rs1[i-1]==31) { // JALR
6662             alloc_reg(&branch_regs[i-1],i-1,RHASH);
6663             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
6664           }
6665           #endif
6666           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6667           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
6668           break;
6669         case CJUMP:
6670           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
6671           {
6672             alloc_cc(&current,i-1);
6673             dirty_reg(&current,CCREG);
6674             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
6675                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
6676               // The delay slot overwrote one of our conditions
6677               // Delay slot goes after the test (in order)
6678               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6679               current.u|=1;
6680               delayslot_alloc(&current,i);
6681               current.isconst=0;
6682             }
6683             else
6684             {
6685               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
6686               // Alloc the branch condition registers
6687               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
6688               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
6689             }
6690             memcpy(&branch_regs[i-1],&current,sizeof(current));
6691             branch_regs[i-1].isconst=0;
6692             branch_regs[i-1].wasconst=0;
6693             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
6694             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
6695           }
6696           else
6697           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
6698           {
6699             alloc_cc(&current,i-1);
6700             dirty_reg(&current,CCREG);
6701             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
6702               // The delay slot overwrote the branch condition
6703               // Delay slot goes after the test (in order)
6704               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6705               current.u|=1;
6706               delayslot_alloc(&current,i);
6707               current.isconst=0;
6708             }
6709             else
6710             {
6711               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
6712               // Alloc the branch condition register
6713               alloc_reg(&current,i-1,rs1[i-1]);
6714             }
6715             memcpy(&branch_regs[i-1],&current,sizeof(current));
6716             branch_regs[i-1].isconst=0;
6717             branch_regs[i-1].wasconst=0;
6718             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
6719             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
6720           }
6721           else
6722           // Alloc the delay slot in case the branch is taken
6723           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
6724           {
6725             memcpy(&branch_regs[i-1],&current,sizeof(current));
6726             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
6727             alloc_cc(&branch_regs[i-1],i);
6728             dirty_reg(&branch_regs[i-1],CCREG);
6729             delayslot_alloc(&branch_regs[i-1],i);
6730             branch_regs[i-1].isconst=0;
6731             alloc_reg(&current,i,CCREG); // Not taken path
6732             dirty_reg(&current,CCREG);
6733             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6734           }
6735           else
6736           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
6737           {
6738             memcpy(&branch_regs[i-1],&current,sizeof(current));
6739             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
6740             alloc_cc(&branch_regs[i-1],i);
6741             dirty_reg(&branch_regs[i-1],CCREG);
6742             delayslot_alloc(&branch_regs[i-1],i);
6743             branch_regs[i-1].isconst=0;
6744             alloc_reg(&current,i,CCREG); // Not taken path
6745             dirty_reg(&current,CCREG);
6746             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6747           }
6748           break;
6749         case SJUMP:
6750           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
6751           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
6752           {
6753             alloc_cc(&current,i-1);
6754             dirty_reg(&current,CCREG);
6755             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
6756               // The delay slot overwrote the branch condition
6757               // Delay slot goes after the test (in order)
6758               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6759               current.u|=1;
6760               delayslot_alloc(&current,i);
6761               current.isconst=0;
6762             }
6763             else
6764             {
6765               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
6766               // Alloc the branch condition register
6767               alloc_reg(&current,i-1,rs1[i-1]);
6768             }
6769             memcpy(&branch_regs[i-1],&current,sizeof(current));
6770             branch_regs[i-1].isconst=0;
6771             branch_regs[i-1].wasconst=0;
6772             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
6773             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
6774           }
6775           else
6776           // Alloc the delay slot in case the branch is taken
6777           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
6778           {
6779             memcpy(&branch_regs[i-1],&current,sizeof(current));
6780             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
6781             alloc_cc(&branch_regs[i-1],i);
6782             dirty_reg(&branch_regs[i-1],CCREG);
6783             delayslot_alloc(&branch_regs[i-1],i);
6784             branch_regs[i-1].isconst=0;
6785             alloc_reg(&current,i,CCREG); // Not taken path
6786             dirty_reg(&current,CCREG);
6787             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6788           }
6789           // FIXME: BLTZAL/BGEZAL
6790           if(opcode2[i-1]&0x10) { // BxxZAL
6791             alloc_reg(&branch_regs[i-1],i-1,31);
6792             dirty_reg(&branch_regs[i-1],31);
6793           }
6794           break;
6795       }
6796
6797       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6798       {
6799         if(rt1[i-1]==31) // JAL/JALR
6800         {
6801           // Subroutine call will return here, don't alloc any registers
6802           current.dirty=0;
6803           clear_all_regs(current.regmap);
6804           alloc_reg(&current,i,CCREG);
6805           dirty_reg(&current,CCREG);
6806         }
6807         else if(i+1<slen)
6808         {
6809           // Internal branch will jump here, match registers to caller
6810           current.dirty=0;
6811           clear_all_regs(current.regmap);
6812           alloc_reg(&current,i,CCREG);
6813           dirty_reg(&current,CCREG);
6814           for(j=i-1;j>=0;j--)
6815           {
6816             if(ba[j]==start+i*4+4) {
6817               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
6818               current.dirty=branch_regs[j].dirty;
6819               break;
6820             }
6821           }
6822           while(j>=0) {
6823             if(ba[j]==start+i*4+4) {
6824               for(hr=0;hr<HOST_REGS;hr++) {
6825                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
6826                   current.regmap[hr]=-1;
6827                 }
6828                 current.dirty&=branch_regs[j].dirty;
6829               }
6830             }
6831             j--;
6832           }
6833         }
6834       }
6835     }
6836
6837     // Count cycles in between branches
    // ccadj[i] is the number of cycles accumulated since the last point
    // where the cycle counter was committed (a branch or syscall); the
    // counter itself is only updated at those commit points.
6838     ccadj[i]=cc;
6839     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
6840     {
      // Counter was just committed; restart accumulation.
6841       cc=0;
6842     }
6843 #if !defined(DRC_DBG)
6844     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
6845     {
6846       // GTE runs in parallel until accessed, divide by 2 for a rough guess
6847       cc+=gte_cycletab[source[i]&0x3f]/2;
6848     }
6849     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
6850     {
6851       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
6852     }
    // Rough write-buffer stall guess for a run of three or more stores
    // (skipped when this instruction is a branch target).
6853     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
6854     {
6855       cc+=4;
6856     }
6857     else if(itype[i]==C2LS)
6858     {
6859       cc+=4;
6860     }
6861 #endif
6862     else
6863     {
      // Default: one cycle per instruction.
6864       cc++;
6865     }
6866
    // Record the post-allocation state for this instruction.  Delay slots
    // (is_ds) are skipped because their state was recorded together with
    // the owning branch.
6867     if(!is_ds[i]) {
6868       regs[i].dirty=current.dirty;
6869       regs[i].isconst=current.isconst;
6870       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
6871     }
    // A guest register that changed host location across this instruction
    // can no longer be trusted to still hold a known constant.
6872     for(hr=0;hr<HOST_REGS;hr++) {
6873       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
6874         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
6875           regs[i].wasconst&=~(1<<hr);
6876         }
6877       }
6878     }
    // NOTE(review): BTREG appears to be live only across a single branch;
    // drop it here so it does not leak into the next allocation - confirm.
6879     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
6880     regs[i].waswritten=current.waswritten;
6881   }
6882
6883   /* Pass 4 - Cull unused host registers */
6884
6885   uint64_t nr=0;
6886
6887   for (i=slen-1;i>=0;i--)
6888   {
6889     int hr;
6890     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6891     {
6892       if(ba[i]<start || ba[i]>=(start+slen*4))
6893       {
6894         // Branch out of this block, don't need anything
6895         nr=0;
6896       }
6897       else
6898       {
6899         // Internal branch
6900         // Need whatever matches the target
6901         nr=0;
6902         int t=(ba[i]-start)>>2;
6903         for(hr=0;hr<HOST_REGS;hr++)
6904         {
6905           if(regs[i].regmap_entry[hr]>=0) {
6906             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
6907           }
6908         }
6909       }
6910       // Conditional branch may need registers for following instructions
6911       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
6912       {
6913         if(i<slen-2) {
6914           nr|=needed_reg[i+2];
6915           for(hr=0;hr<HOST_REGS;hr++)
6916           {
6917             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
6918             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
6919           }
6920         }
6921       }
6922       // Don't need stuff which is overwritten
6923       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
6924       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
6925       // Merge in delay slot
6926       for(hr=0;hr<HOST_REGS;hr++)
6927       {
6928         if(!likely[i]) {
6929           // These are overwritten unless the branch is "likely"
6930           // and the delay slot is nullified if not taken
6931           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6932           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6933         }
6934         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6935         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6936         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
6937         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
6938         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6939         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6940         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6941         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6942         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
6943           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
6944           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
6945         }
6946       }
6947     }
6948     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6949     {
6950       // SYSCALL instruction (software interrupt)
6951       nr=0;
6952     }
6953     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6954     {
6955       // ERET instruction (return from interrupt)
6956       nr=0;
6957     }
6958     else // Non-branch
6959     {
6960       if(i<slen-1) {
6961         for(hr=0;hr<HOST_REGS;hr++) {
6962           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
6963           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
6964           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
6965           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
6966         }
6967       }
6968     }
6969     for(hr=0;hr<HOST_REGS;hr++)
6970     {
6971       // Overwritten registers are not needed
6972       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6973       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6974       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6975       // Source registers are needed
6976       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6977       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6978       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
6979       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
6980       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6981       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6982       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6983       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6984       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
6985         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
6986         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
6987       }
6988       // Don't store a register immediately after writing it,
6989       // may prevent dual-issue.
6990       // But do so if this is a branch target, otherwise we
6991       // might have to load the register before the branch.
6992       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
6993         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1))) {
6994           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6995           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6996         }
6997         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1))) {
6998           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6999           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
7000         }
7001       }
7002     }
7003     // Cycle count is needed at branches.  Assume it is needed at the target too.
7004     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SPAN) {
      // Keep HOST_CCREG marked as needed wherever the cycle counter is
      // mapped into it on entry to this instruction.
7005       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
7006       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
7007     }
7008     // Save it
      // needed_reg[i] is the set of host registers still needed at/after i,
      // consumed below when deallocating unneeded registers.
7009     needed_reg[i]=nr;
7010
7011     // Deallocate unneeded registers
7012     for(hr=0;hr<HOST_REGS;hr++)
7013     {
7014       if(!((nr>>hr)&1)) {
7015         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
7016         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
7017            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7018            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
7019         {
7020           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7021           {
7022             if(likely[i]) {
7023               regs[i].regmap[hr]=-1;
7024               regs[i].isconst&=~(1<<hr);
7025               if(i<slen-2) {
7026                 regmap_pre[i+2][hr]=-1;
7027                 regs[i+2].wasconst&=~(1<<hr);
7028               }
7029             }
7030           }
7031         }
7032         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
7033         {
7034           int d1=0,d2=0,map=0,temp=0;
7035           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
7036           {
7037             d1=dep1[i+1];
7038             d2=dep2[i+1];
7039           }
7040           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
7041              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
7042             map=INVCP;
7043           }
7044           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
7045              itype[i+1]==C1LS || itype[i+1]==C2LS)
7046             temp=FTEMP;
7047           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
7048              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7049              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
7050              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
7051              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
7052              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
7053              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
7054              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
7055              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
7056              regs[i].regmap[hr]!=map )
7057           {
7058             regs[i].regmap[hr]=-1;
7059             regs[i].isconst&=~(1<<hr);
7060             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
7061                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
7062                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
7063                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
7064                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
7065                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
7066                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
7067                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
7068                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
7069                branch_regs[i].regmap[hr]!=map)
7070             {
7071               branch_regs[i].regmap[hr]=-1;
7072               branch_regs[i].regmap_entry[hr]=-1;
7073               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7074               {
7075                 if(!likely[i]&&i<slen-2) {
7076                   regmap_pre[i+2][hr]=-1;
7077                   regs[i+2].wasconst&=~(1<<hr);
7078                 }
7079               }
7080             }
7081           }
7082         }
7083         else
7084         {
7085           // Non-branch
7086           if(i>0)
7087           {
7088             int d1=0,d2=0,map=-1,temp=-1;
7089             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
7090             {
7091               d1=dep1[i];
7092               d2=dep2[i];
7093             }
7094             if(itype[i]==STORE || itype[i]==STORELR ||
7095                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
7096               map=INVCP;
7097             }
7098             if(itype[i]==LOADLR || itype[i]==STORELR ||
7099                itype[i]==C1LS || itype[i]==C2LS)
7100               temp=FTEMP;
7101             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
7102                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
7103                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
7104                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
7105                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
7106                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
7107             {
7108               if(i<slen-1&&!is_ds[i]) {
7109                 assert(regs[i].regmap[hr]<64);
7110                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
7111                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
7112                 {
7113                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
7114                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
7115                 }
7116                 regmap_pre[i+1][hr]=-1;
7117                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
7118                 regs[i+1].wasconst&=~(1<<hr);
7119               }
7120               regs[i].regmap[hr]=-1;
7121               regs[i].isconst&=~(1<<hr);
7122             }
7123           }
7124         }
7125       }
7126     }
7127   }
7128
7129   /* Pass 5 - Pre-allocate registers */
7130
7131   // If a register is allocated during a loop, try to allocate it for the
7132   // entire loop, if possible.  This avoids loading/storing registers
7133   // inside of the loop.
7134
       // f_regmap[hr] = guest register we would like host register hr to hold
       // across a whole loop body (-1 = no preference).
7135   signed char f_regmap[HOST_REGS];
7136   clear_all_regs(f_regmap);
7137   for(i=0;i<slen-1;i++)
7138   {
7139     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
7140     {
         // Only consider backward branches whose target is inside this block
         // (i.e. this branch closes a loop) ...
7141       if(ba[i]>=start && ba[i]<(start+i*4))
         // ... and whose delay slot holds one of these simple instruction types.
7142       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
7143       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
7144       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
7145       ||itype[i+1]==SHIFT||itype[i+1]==COP1
7146       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
7147       {
           // t = instruction index of the loop's entry point (branch target).
7148         int t=(ba[i]-start)>>2;
7149         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
7150         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
           // Build f_regmap from the allocation in effect at the branch.
7151         for(hr=0;hr<HOST_REGS;hr++)
7152         {
             // Mappings with value >64 are only kept as candidates while clean
             // (NOTE(review): presumably temp/special register numbers — confirm
             // against the register numbering used by this dynarec).
7153           if(regs[i].regmap[hr]>64) {
7154             if(!((regs[i].dirty>>hr)&1))
7155               f_regmap[hr]=regs[i].regmap[hr];
7156             else f_regmap[hr]=-1;
7157           }
7158           else if(regs[i].regmap[hr]>=0) {
7159             if(f_regmap[hr]!=regs[i].regmap[hr]) {
7160               // dealloc old register
7161               int n;
7162               for(n=0;n<HOST_REGS;n++)
7163               {
7164                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
7165               }
7166               // and alloc new one
7167               f_regmap[hr]=regs[i].regmap[hr];
7168             }
7169           }
             // Same as above, but for the allocation inside the branch/delay slot.
7170           if(branch_regs[i].regmap[hr]>64) {
7171             if(!((branch_regs[i].dirty>>hr)&1))
7172               f_regmap[hr]=branch_regs[i].regmap[hr];
7173             else f_regmap[hr]=-1;
7174           }
7175           else if(branch_regs[i].regmap[hr]>=0) {
7176             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
7177               // dealloc old register
7178               int n;
7179               for(n=0;n<HOST_REGS;n++)
7180               {
7181                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
7182               }
7183               // and alloc new one
7184               f_regmap[hr]=branch_regs[i].regmap[hr];
7185             }
7186           }
7187           if(ooo[i]) {
7188             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
7189               f_regmap[hr]=branch_regs[i].regmap[hr];
7190           }else{
7191             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
7192               f_regmap[hr]=branch_regs[i].regmap[hr];
7193           }
7194           // Avoid dirty->clean transition
7195           #ifdef DESTRUCTIVE_WRITEBACK
7196           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
7197           #endif
7198           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
7199           // case above, however it's always a good idea.  We can't hoist the
7200           // load if the register was already allocated, so there's no point
7201           // wasting time analyzing most of these cases.  It only "succeeds"
7202           // when the mapping was different and the load can be replaced with
7203           // a mov, which is of negligible benefit.  So such cases are
7204           // skipped below.
7205           if(f_regmap[hr]>0) {
7206             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
               // Walk forward from the loop target t to the branch i, checking
               // that host reg hr can hold guest reg r over the whole range.
7207               int r=f_regmap[hr];
7208               for(j=t;j<=i;j++)
7209               {
7210                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
7211                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
7212                 assert(r < 64);
7213                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
7214                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
7215                   int k;
7216                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
7217                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
7218                     if(r>63) {
7219                       if(get_reg(regs[i].regmap,r&63)<0) break;
7220                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
7221                     }
                       // Walk backwards from the branch to find the earliest
                       // instruction where hr became unmapped, so we can extend
                       // the mapping across that gap.
7222                     k=i;
7223                     while(k>1&&regs[k-1].regmap[hr]==-1) {
7224                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
7225                         //printf("no free regs for store %x\n",start+(k-1)*4);
7226                         break;
7227                       }
7228                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
7229                         //printf("no-match due to different register\n");
7230                         break;
7231                       }
7232                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
7233                         //printf("no-match due to branch\n");
7234                         break;
7235                       }
7236                       // call/ret fast path assumes no registers allocated
7237                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
7238                         break;
7239                       }
7240                       assert(r < 64);
7241                       k--;
7242                     }
7243                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
7244                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
                         // Fill the mapping forward from k to the branch at i,
                         // propagating entry maps and dirty/const state.
7245                       while(k<i) {
7246                         regs[k].regmap_entry[hr]=f_regmap[hr];
7247                         regs[k].regmap[hr]=f_regmap[hr];
7248                         regmap_pre[k+1][hr]=f_regmap[hr];
7249                         regs[k].wasdirty&=~(1<<hr);
7250                         regs[k].dirty&=~(1<<hr);
7251                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
7252                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
7253                         regs[k].wasconst&=~(1<<hr);
7254                         regs[k].isconst&=~(1<<hr);
7255                         k++;
7256                       }
7257                     }
7258                     else {
7259                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
7260                       break;
7261                     }
7262                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
7263                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
7264                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
                         // Install the mapping at the branch itself and in its
                         // branch_regs (delay-slot) state.
7265                       regs[i].regmap_entry[hr]=f_regmap[hr];
7266                       regs[i].regmap[hr]=f_regmap[hr];
7267                       regs[i].wasdirty&=~(1<<hr);
7268                       regs[i].dirty&=~(1<<hr);
7269                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
7270                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
7271                       regs[i].wasconst&=~(1<<hr);
7272                       regs[i].isconst&=~(1<<hr);
7273                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
7274                       branch_regs[i].wasdirty&=~(1<<hr);
7275                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
7276                       branch_regs[i].regmap[hr]=f_regmap[hr];
7277                       branch_regs[i].dirty&=~(1<<hr);
7278                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
7279                       branch_regs[i].wasconst&=~(1<<hr);
7280                       branch_regs[i].isconst&=~(1<<hr);
7281                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7282                         regmap_pre[i+2][hr]=f_regmap[hr];
7283                         regs[i+2].wasdirty&=~(1<<hr);
7284                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
7285                       }
7286                     }
7287                   }
7288                   for(k=t;k<j;k++) {
7289                     // Alloc register clean at beginning of loop,
7290                     // but may dirty it in pass 6
7291                     regs[k].regmap_entry[hr]=f_regmap[hr];
7292                     regs[k].regmap[hr]=f_regmap[hr];
7293                     regs[k].dirty&=~(1<<hr);
7294                     regs[k].wasconst&=~(1<<hr);
7295                     regs[k].isconst&=~(1<<hr);
7296                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP) {
7297                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
7298                       branch_regs[k].regmap[hr]=f_regmap[hr];
7299                       branch_regs[k].dirty&=~(1<<hr);
7300                       branch_regs[k].wasconst&=~(1<<hr);
7301                       branch_regs[k].isconst&=~(1<<hr);
7302                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
7303                         regmap_pre[k+2][hr]=f_regmap[hr];
7304                         regs[k+2].wasdirty&=~(1<<hr);
7305                       }
7306                     }
7307                     else
7308                     {
7309                       regmap_pre[k+1][hr]=f_regmap[hr];
7310                       regs[k+1].wasdirty&=~(1<<hr);
7311                     }
7312                   }
7313                   if(regs[j].regmap[hr]==f_regmap[hr])
7314                     regs[j].regmap_entry[hr]=f_regmap[hr];
7315                   break;
7316                 }
                   // Abort conditions for the forward scan:
7317                 if(j==i) break;
7318                 if(regs[j].regmap[hr]>=0)
7319                   break;
7320                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
7321                   //printf("no-match due to different register\n");
7322                   break;
7323                 }
7324                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
7325                 {
7326                   // Stop on unconditional branch
7327                   break;
7328                 }
7329                 if(itype[j]==CJUMP||itype[j]==SJUMP)
7330                 {
7331                   if(ooo[j]) {
7332                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
7333                       break;
7334                   }else{
7335                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
7336                       break;
7337                   }
7338                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
7339                     //printf("no-match due to different register (branch)\n");
7340                     break;
7341                   }
7342                 }
7343                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
7344                   //printf("No free regs for store %x\n",start+j*4);
7345                   break;
7346                 }
7347                 assert(f_regmap[hr]<64);
7348               }
7349             }
7350           }
7351         }
7352       }
7353     }else{
7354       // Non branch or undetermined branch target
7355       for(hr=0;hr<HOST_REGS;hr++)
7356       {
7357         if(hr!=EXCLUDE_REG) {
7358           if(regs[i].regmap[hr]>64) {
7359             if(!((regs[i].dirty>>hr)&1))
7360               f_regmap[hr]=regs[i].regmap[hr];
7361           }
7362           else if(regs[i].regmap[hr]>=0) {
7363             if(f_regmap[hr]!=regs[i].regmap[hr]) {
7364               // dealloc old register
7365               int n;
7366               for(n=0;n<HOST_REGS;n++)
7367               {
7368                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
7369               }
7370               // and alloc new one
7371               f_regmap[hr]=regs[i].regmap[hr];
7372             }
7373           }
7374         }
7375       }
7376       // Try to restore cycle count at branch targets
7377       if(bt[i]) {
           // Find the next instruction j at/after the branch target where
           // HOST_CCREG is mapped (or stop when registers run short).
7378         for(j=i;j<slen-1;j++) {
7379           if(regs[j].regmap[HOST_CCREG]!=-1) break;
7380           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
7381             //printf("no free regs for store %x\n",start+j*4);
7382             break;
7383           }
7384         }
7385         if(regs[j].regmap[HOST_CCREG]==CCREG) {
7386           int k=i;
7387           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
7388           while(k<j) {
7389             regs[k].regmap_entry[HOST_CCREG]=CCREG;
7390             regs[k].regmap[HOST_CCREG]=CCREG;
7391             regmap_pre[k+1][HOST_CCREG]=CCREG;
7392             regs[k+1].wasdirty|=1<<HOST_CCREG;
7393             regs[k].dirty|=1<<HOST_CCREG;
7394             regs[k].wasconst&=~(1<<HOST_CCREG);
7395             regs[k].isconst&=~(1<<HOST_CCREG);
7396             k++;
7397           }
7398           regs[j].regmap_entry[HOST_CCREG]=CCREG;
7399         }
7400         // Work backwards from the branch target
7401         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
7402         {
7403           //printf("Extend backwards\n");
7404           int k;
7405           k=i;
7406           while(regs[k-1].regmap[HOST_CCREG]==-1) {
7407             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
7408               //printf("no free regs for store %x\n",start+(k-1)*4);
7409               break;
7410             }
7411             k--;
7412           }
7413           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
7414             //printf("Extend CC, %x ->\n",start+k*4);
7415             while(k<=i) {
7416               regs[k].regmap_entry[HOST_CCREG]=CCREG;
7417               regs[k].regmap[HOST_CCREG]=CCREG;
7418               regmap_pre[k+1][HOST_CCREG]=CCREG;
7419               regs[k+1].wasdirty|=1<<HOST_CCREG;
7420               regs[k].dirty|=1<<HOST_CCREG;
7421               regs[k].wasconst&=~(1<<HOST_CCREG);
7422               regs[k].isconst&=~(1<<HOST_CCREG);
7423               k++;
7424             }
7425           }
7426           else {
7427             //printf("Fail Extend CC, %x ->\n",start+k*4);
7428           }
7429         }
7430       }
         // For instruction types not listed here, take a fresh snapshot of the
         // current allocation as the new loop-preference map.
7431       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
7432          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
7433          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1)
7434       {
7435         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
7436       }
7437     }
7438   }
7439
7440   // This allocates registers (if possible) one instruction prior
7441   // to use, which can avoid a load-use penalty on certain CPUs.
7442   for(i=0;i<slen-1;i++)
7443   {
       // Skip instructions that are delay slots of a preceding branch.
7444     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
7445     {
         // Don't hoist across a branch target — the entry state must match.
7446       if(!bt[i+1])
7447       {
7448         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
7449            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
7450         {
             // If the next instruction's first source is already mapped there,
             // make that mapping effective one instruction early (at i).
7451           if(rs1[i+1]) {
7452             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
7453             {
7454               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7455               {
7456                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7457                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7458                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7459                 regs[i].isconst&=~(1<<hr);
7460                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7461                 constmap[i][hr]=constmap[i+1][hr];
7462                 regs[i+1].wasdirty&=~(1<<hr);
7463                 regs[i].dirty&=~(1<<hr);
7464               }
7465             }
7466           }
             // Same for the second source register.
7467           if(rs2[i+1]) {
7468             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
7469             {
7470               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7471               {
7472                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7473                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7474                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7475                 regs[i].isconst&=~(1<<hr);
7476                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7477                 constmap[i][hr]=constmap[i+1][hr];
7478                 regs[i+1].wasdirty&=~(1<<hr);
7479                 regs[i].dirty&=~(1<<hr);
7480               }
7481             }
7482           }
7483           // Preload target address for load instruction (non-constant)
           // Here the base (rs1) is not mapped, so borrow the destination's
           // host register to hold it one instruction early.
7484           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7485             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
7486             {
7487               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7488               {
7489                 regs[i].regmap[hr]=rs1[i+1];
7490                 regmap_pre[i+1][hr]=rs1[i+1];
7491                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7492                 regs[i].isconst&=~(1<<hr);
7493                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7494                 constmap[i][hr]=constmap[i+1][hr];
7495                 regs[i+1].wasdirty&=~(1<<hr);
7496                 regs[i].dirty&=~(1<<hr);
7497               }
7498             }
7499           }
7500           // Load source into target register
7501           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7502             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
7503             {
7504               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7505               {
7506                 regs[i].regmap[hr]=rs1[i+1];
7507                 regmap_pre[i+1][hr]=rs1[i+1];
7508                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7509                 regs[i].isconst&=~(1<<hr);
7510                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7511                 constmap[i][hr]=constmap[i+1][hr];
7512                 regs[i+1].wasdirty&=~(1<<hr);
7513                 regs[i].dirty&=~(1<<hr);
7514               }
7515             }
7516           }
7517           // Address for store instruction (non-constant)
7518           if(itype[i+1]==STORE||itype[i+1]==STORELR
7519              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
7520             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
                 // Pick a host reg free in both i and i+1; fall back to any free
                 // reg in i+1, otherwise claim it as an address-generation reg.
7521               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
7522               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
7523               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
7524               assert(hr>=0);
7525               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7526               {
7527                 regs[i].regmap[hr]=rs1[i+1];
7528                 regmap_pre[i+1][hr]=rs1[i+1];
7529                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7530                 regs[i].isconst&=~(1<<hr);
7531                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7532                 constmap[i][hr]=constmap[i+1][hr];
7533                 regs[i+1].wasdirty&=~(1<<hr);
7534                 regs[i].dirty&=~(1<<hr);
7535               }
7536             }
7537           }
7538           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
7539             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7540               int nr;
7541               hr=get_reg(regs[i+1].regmap,FTEMP);
7542               assert(hr>=0);
7543               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7544               {
7545                 regs[i].regmap[hr]=rs1[i+1];
7546                 regmap_pre[i+1][hr]=rs1[i+1];
7547                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7548                 regs[i].isconst&=~(1<<hr);
7549                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
7550                 constmap[i][hr]=constmap[i+1][hr];
7551                 regs[i+1].wasdirty&=~(1<<hr);
7552                 regs[i].dirty&=~(1<<hr);
7553               }
7554               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
7555               {
7556                 // move it to another register
                   // FTEMP is relocated from hr to nr so hr stays untouched at i,
                   // and rs1 is preloaded into nr one instruction early.
7557                 regs[i+1].regmap[hr]=-1;
7558                 regmap_pre[i+2][hr]=-1;
7559                 regs[i+1].regmap[nr]=FTEMP;
7560                 regmap_pre[i+2][nr]=FTEMP;
7561                 regs[i].regmap[nr]=rs1[i+1];
7562                 regmap_pre[i+1][nr]=rs1[i+1];
7563                 regs[i+1].regmap_entry[nr]=rs1[i+1];
7564                 regs[i].isconst&=~(1<<nr);
7565                 regs[i+1].isconst&=~(1<<nr);
7566                 regs[i].dirty&=~(1<<nr);
7567                 regs[i+1].wasdirty&=~(1<<nr);
7568                 regs[i+1].dirty&=~(1<<nr);
7569                 regs[i+2].wasdirty&=~(1<<nr);
7570               }
7571             }
7572           }
7573           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
7574             if(itype[i+1]==LOAD)
7575               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
7576             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
7577               hr=get_reg(regs[i+1].regmap,FTEMP);
7578             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
7579               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
7580               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
7581             }
             // If the base register held a known constant, pre-allocate the
             // address-generation register one instruction early.
7582             if(hr>=0&&regs[i].regmap[hr]<0) {
7583               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
7584               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
7585                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
7586                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
7587                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
7588                 regs[i].isconst&=~(1<<hr);
7589                 regs[i+1].wasdirty&=~(1<<hr);
7590                 regs[i].dirty&=~(1<<hr);
7591               }
7592             }
7593           }
7594         }
7595       }
7596     }
7597   }
7598
7599   /* Pass 6 - Optimize clean/dirty state */
7600   clean_registers(0,slen-1,1);
7601
7602   /* Pass 7 - Identify 32-bit registers */
   // NOTE(review): despite the pass name, all that remains here is marking
   // the instruction after each conditional branch's delay slot as a branch
   // target (the 32-bit analysis itself is not in this loop).
7603   for (i=slen-1;i>=0;i--)
7604   {
7605     if(itype[i]==CJUMP||itype[i]==SJUMP)
7606     {
7607       // Conditional branch
         // (source[i]>>16)==0x1000 is excluded — treated elsewhere as an
         // unconditional-branch encoding (see the same test in pass 5).
7608       if((source[i]>>16)!=0x1000&&i<slen-2) {
7609         // Mark this address as a branch target since it may be called
7610         // upon return from interrupt
7611         bt[i+2]=1;
7612       }
7613     }
7614   }
7615
   // A block ending in a page-spanning instruction must be restartable.
7616   if(itype[slen-1]==SPAN) {
7617     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
7618   }
7619
7620 #ifdef DISASM
7621   /* Debug/disassembly */
   // Dump, per instruction: unneeded guest regs, the pre/entry/exit host
   // register maps, dirty and constant bits, and the branch-state maps.
   // Register names are per host architecture (x86 vs ARM sections below).
7622   for(i=0;i<slen;i++)
7623   {
7624     printf("U:");
7625     int r;
7626     for(r=1;r<=CCREG;r++) {
7627       if((unneeded_reg[i]>>r)&1) {
7628         if(r==HIREG) printf(" HI");
7629         else if(r==LOREG) printf(" LO");
7630         else printf(" r%d",r);
7631       }
7632     }
7633     printf("\n");
7634     #if defined(__i386__) || defined(__x86_64__)
7635     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
7636     #endif
7637     #ifdef __arm__
7638     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
7639     #endif
7640     printf("needs: ");
7641     if(needed_reg[i]&1) printf("eax ");
7642     if((needed_reg[i]>>1)&1) printf("ecx ");
7643     if((needed_reg[i]>>2)&1) printf("edx ");
7644     if((needed_reg[i]>>3)&1) printf("ebx ");
7645     if((needed_reg[i]>>5)&1) printf("ebp ");
7646     if((needed_reg[i]>>6)&1) printf("esi ");
7647     if((needed_reg[i]>>7)&1) printf("edi ");
7648     printf("\n");
7649     #if defined(__i386__) || defined(__x86_64__)
7650     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
7651     printf("dirty: ");
7652     if(regs[i].wasdirty&1) printf("eax ");
7653     if((regs[i].wasdirty>>1)&1) printf("ecx ");
7654     if((regs[i].wasdirty>>2)&1) printf("edx ");
7655     if((regs[i].wasdirty>>3)&1) printf("ebx ");
7656     if((regs[i].wasdirty>>5)&1) printf("ebp ");
7657     if((regs[i].wasdirty>>6)&1) printf("esi ");
7658     if((regs[i].wasdirty>>7)&1) printf("edi ");
7659     #endif
7660     #ifdef __arm__
7661     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
7662     printf("dirty: ");
7663     if(regs[i].wasdirty&1) printf("r0 ");
7664     if((regs[i].wasdirty>>1)&1) printf("r1 ");
7665     if((regs[i].wasdirty>>2)&1) printf("r2 ");
7666     if((regs[i].wasdirty>>3)&1) printf("r3 ");
7667     if((regs[i].wasdirty>>4)&1) printf("r4 ");
7668     if((regs[i].wasdirty>>5)&1) printf("r5 ");
7669     if((regs[i].wasdirty>>6)&1) printf("r6 ");
7670     if((regs[i].wasdirty>>7)&1) printf("r7 ");
7671     if((regs[i].wasdirty>>8)&1) printf("r8 ");
7672     if((regs[i].wasdirty>>9)&1) printf("r9 ");
7673     if((regs[i].wasdirty>>10)&1) printf("r10 ");
7674     if((regs[i].wasdirty>>12)&1) printf("r12 ");
7675     #endif
7676     printf("\n");
7677     disassemble_inst(i);
7678     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
7679     #if defined(__i386__) || defined(__x86_64__)
7680     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
7681     if(regs[i].dirty&1) printf("eax ");
7682     if((regs[i].dirty>>1)&1) printf("ecx ");
7683     if((regs[i].dirty>>2)&1) printf("edx ");
7684     if((regs[i].dirty>>3)&1) printf("ebx ");
7685     if((regs[i].dirty>>5)&1) printf("ebp ");
7686     if((regs[i].dirty>>6)&1) printf("esi ");
7687     if((regs[i].dirty>>7)&1) printf("edi ");
7688     #endif
7689     #ifdef __arm__
7690     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
7691     if(regs[i].dirty&1) printf("r0 ");
7692     if((regs[i].dirty>>1)&1) printf("r1 ");
7693     if((regs[i].dirty>>2)&1) printf("r2 ");
7694     if((regs[i].dirty>>3)&1) printf("r3 ");
7695     if((regs[i].dirty>>4)&1) printf("r4 ");
7696     if((regs[i].dirty>>5)&1) printf("r5 ");
7697     if((regs[i].dirty>>6)&1) printf("r6 ");
7698     if((regs[i].dirty>>7)&1) printf("r7 ");
7699     if((regs[i].dirty>>8)&1) printf("r8 ");
7700     if((regs[i].dirty>>9)&1) printf("r9 ");
7701     if((regs[i].dirty>>10)&1) printf("r10 ");
7702     if((regs[i].dirty>>12)&1) printf("r12 ");
7703     #endif
7704     printf("\n");
7705     if(regs[i].isconst) {
7706       printf("constants: ");
7707       #if defined(__i386__) || defined(__x86_64__)
7708       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
7709       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
7710       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
7711       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
7712       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
7713       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
7714       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
7715       #endif
7716       #ifdef __arm__
7717       int r;
7718       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
7719         if ((regs[i].isconst >> r) & 1)
7720           printf(" r%d=%x", r, (u_int)constmap[i][r]);
7721       #endif
7722       printf("\n");
7723     }
7724     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
7725       #if defined(__i386__) || defined(__x86_64__)
7726       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
7727       if(branch_regs[i].dirty&1) printf("eax ");
7728       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
7729       if((branch_regs[i].dirty>>2)&1) printf("edx ");
7730       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
7731       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
7732       if((branch_regs[i].dirty>>6)&1) printf("esi ");
7733       if((branch_regs[i].dirty>>7)&1) printf("edi ");
7734       #endif
7735       #ifdef __arm__
7736       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
7737       if(branch_regs[i].dirty&1) printf("r0 ");
7738       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
7739       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
7740       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
7741       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
7742       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
7743       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
7744       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
7745       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
7746       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
7747       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
7748       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
7749       #endif
7750     }
7751   }
7752 #endif // DISASM
7753
7754   /* Pass 8 - Assembly */
   // Reset per-block emission state before generating host code.
7755   linkcount=0;stubcount=0;
7756   ds=0;is_delayslot=0;
7757   u_int dirty_pre=0;
7758   void *beginning=start_block();
   // NOTE(review): low bit of addr appears to flag entry into a delay slot of
   // a page-spanning branch — pagespan_ds() emits the special entry stub.
7759   if((u_int)addr&1) {
7760     ds=1;
7761     pagespan_ds();
7762   }
   // When non-NULL, this replaces the normal entry point of the block.
7763   void *instr_addr0_override = NULL;
7764
7765   if (start == 0x80030000) {
7766     // nasty hack for fastbios thing
7767     // override block entry to this code
7768     instr_addr0_override = out;
7769     emit_movimm(start,0);
7770     // abuse io address var as a flag that we
7771     // have already returned here once
7772     emit_readword(&address,1);
7773     emit_writeword(0,&pcaddr);
7774     emit_writeword(0,&address);
7775     emit_cmp(0,1);
       // Second visit: bail out of the recompiled code entirely.
7776     emit_jne(new_dyna_leave);
7777   }
7778   for(i=0;i<slen;i++)
7779   {
7780     //if(ds) printf("ds: ");
7781     disassemble_inst(i);
7782     if(ds) {
7783       ds=0; // Skip delay slot
7784       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
7785       instr_addr[i] = NULL;
7786     } else {
7787       speculate_register_values(i);
7788       #ifndef DESTRUCTIVE_WRITEBACK
7789       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
7790       {
7791         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,unneeded_reg[i]);
7792       }
7793       if((itype[i]==CJUMP||itype[i]==SJUMP)&&!likely[i]) {
7794         dirty_pre=branch_regs[i].dirty;
7795       }else{
7796         dirty_pre=regs[i].dirty;
7797       }
7798       #endif
7799       // write back
7800       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
7801       {
7802         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,unneeded_reg[i]);
7803         loop_preload(regmap_pre[i],regs[i].regmap_entry);
7804       }
7805       // branch target entry point
7806       instr_addr[i] = out;
7807       assem_debug("<->\n");
7808       drc_dbg_emit_do_cmp(i);
7809
7810       // load regs
7811       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
7812         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
7813       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i]);
7814       address_generation(i,&regs[i],regs[i].regmap_entry);
7815       load_consts(regmap_pre[i],regs[i].regmap,i);
7816       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
7817       {
7818         // Load the delay slot registers if necessary
7819         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
7820           load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
7821         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
7822           load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
7823         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
7824           load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
7825       }
7826       else if(i+1<slen)
7827       {
7828         // Preload registers for following instruction
7829         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
7830           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
7831             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1]);
7832         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
7833           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
7834             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1]);
7835       }
7836       // TODO: if(is_ooo(i)) address_generation(i+1);
7837       if(itype[i]==CJUMP)
7838         load_regs(regs[i].regmap_entry,regs[i].regmap,CCREG,CCREG);
7839       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
7840         load_regs(regs[i].regmap_entry,regs[i].regmap,INVCP,INVCP);
7841       // assemble
7842       switch(itype[i]) {
7843         case ALU:
7844           alu_assemble(i,&regs[i]);break;
7845         case IMM16:
7846           imm16_assemble(i,&regs[i]);break;
7847         case SHIFT:
7848           shift_assemble(i,&regs[i]);break;
7849         case SHIFTIMM:
7850           shiftimm_assemble(i,&regs[i]);break;
7851         case LOAD:
7852           load_assemble(i,&regs[i]);break;
7853         case LOADLR:
7854           loadlr_assemble(i,&regs[i]);break;
7855         case STORE:
7856           store_assemble(i,&regs[i]);break;
7857         case STORELR:
7858           storelr_assemble(i,&regs[i]);break;
7859         case COP0:
7860           cop0_assemble(i,&regs[i]);break;
7861         case COP1:
7862           cop1_assemble(i,&regs[i]);break;
7863         case C1LS:
7864           c1ls_assemble(i,&regs[i]);break;
7865         case COP2:
7866           cop2_assemble(i,&regs[i]);break;
7867         case C2LS:
7868           c2ls_assemble(i,&regs[i]);break;
7869         case C2OP:
7870           c2op_assemble(i,&regs[i]);break;
7871         case MULTDIV:
7872           multdiv_assemble(i,&regs[i]);break;
7873         case MOV:
7874           mov_assemble(i,&regs[i]);break;
7875         case SYSCALL:
7876           syscall_assemble(i,&regs[i]);break;
7877         case HLECALL:
7878           hlecall_assemble(i,&regs[i]);break;
7879         case INTCALL:
7880           intcall_assemble(i,&regs[i]);break;
7881         case UJUMP:
7882           ujump_assemble(i,&regs[i]);ds=1;break;
7883         case RJUMP:
7884           rjump_assemble(i,&regs[i]);ds=1;break;
7885         case CJUMP:
7886           cjump_assemble(i,&regs[i]);ds=1;break;
7887         case SJUMP:
7888           sjump_assemble(i,&regs[i]);ds=1;break;
7889         case SPAN:
7890           pagespan_assemble(i,&regs[i]);break;
7891       }
7892       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
7893         literal_pool(1024);
7894       else
7895         literal_pool_jumpover(256);
7896     }
7897   }
7898   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
7899   // If the block did not end with an unconditional branch,
7900   // add a jump to the next instruction.
7901   if(i>1) {
7902     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
7903       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
7904       assert(i==slen);
7905       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP) {
7906         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
7907         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
7908           emit_loadreg(CCREG,HOST_CCREG);
7909         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
7910       }
7911       else if(!likely[i-2])
7912       {
7913         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].dirty,start+i*4);
7914         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
7915       }
7916       else
7917       {
7918         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*4);
7919         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
7920       }
7921       add_to_linker(out,start+i*4,0);
7922       emit_jmp(0);
7923     }
7924   }
7925   else
7926   {
7927     assert(i>0);
7928     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP);
7929     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*4);
7930     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
7931       emit_loadreg(CCREG,HOST_CCREG);
7932     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
7933     add_to_linker(out,start+i*4,0);
7934     emit_jmp(0);
7935   }
7936
7937   // TODO: delay slot stubs?
7938   // Stubs
7939   for(i=0;i<stubcount;i++)
7940   {
7941     switch(stubs[i].type)
7942     {
7943       case LOADB_STUB:
7944       case LOADH_STUB:
7945       case LOADW_STUB:
7946       case LOADD_STUB:
7947       case LOADBU_STUB:
7948       case LOADHU_STUB:
7949         do_readstub(i);break;
7950       case STOREB_STUB:
7951       case STOREH_STUB:
7952       case STOREW_STUB:
7953       case STORED_STUB:
7954         do_writestub(i);break;
7955       case CC_STUB:
7956         do_ccstub(i);break;
7957       case INVCODE_STUB:
7958         do_invstub(i);break;
7959       case FP_STUB:
7960         do_cop1stub(i);break;
7961       case STORELR_STUB:
7962         do_unalignedwritestub(i);break;
7963     }
7964   }
7965
7966   if (instr_addr0_override)
7967     instr_addr[0] = instr_addr0_override;
7968
7969   /* Pass 9 - Linker */
7970   for(i=0;i<linkcount;i++)
7971   {
7972     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
7973     literal_pool(64);
7974     if (!link_addr[i].ext)
7975     {
7976       void *stub = out;
7977       void *addr = check_addr(link_addr[i].target);
7978       emit_extjump(link_addr[i].addr, link_addr[i].target);
7979       if (addr) {
7980         set_jump_target(link_addr[i].addr, addr);
7981         add_link(link_addr[i].target,stub);
7982       }
7983       else
7984         set_jump_target(link_addr[i].addr, stub);
7985     }
7986     else
7987     {
7988       // Internal branch
7989       int target=(link_addr[i].target-start)>>2;
7990       assert(target>=0&&target<slen);
7991       assert(instr_addr[target]);
7992       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
7993       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
7994       //#else
7995       set_jump_target(link_addr[i].addr, instr_addr[target]);
7996       //#endif
7997     }
7998   }
7999   // External Branch Targets (jump_in)
8000   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
8001   for(i=0;i<slen;i++)
8002   {
8003     if(bt[i]||i==0)
8004     {
8005       if(instr_addr[i]) // TODO - delay slots (=null)
8006       {
8007         u_int vaddr=start+i*4;
8008         u_int page=get_page(vaddr);
8009         u_int vpage=get_vpage(vaddr);
8010         literal_pool(256);
8011         {
8012           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
8013           assem_debug("jump_in: %x\n",start+i*4);
8014           ll_add(jump_dirty+vpage,vaddr,out);
8015           void *entry_point = do_dirty_stub(i);
8016           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
8017           // If there was an existing entry in the hash table,
8018           // replace it with the new address.
8019           // Don't add new entries.  We'll insert the
8020           // ones that actually get used in check_addr().
8021           struct ht_entry *ht_bin = hash_table_get(vaddr);
8022           if (ht_bin->vaddr[0] == vaddr)
8023             ht_bin->tcaddr[0] = entry_point;
8024           if (ht_bin->vaddr[1] == vaddr)
8025             ht_bin->tcaddr[1] = entry_point;
8026         }
8027       }
8028     }
8029   }
8030   // Write out the literal pool if necessary
8031   literal_pool(0);
8032   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8033   // Align code
8034   if(((u_int)out)&7) emit_addnop(13);
8035   #endif
8036   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
8037   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
8038   memcpy(copy,source,slen*4);
8039   copy+=slen*4;
8040
8041   end_block(beginning);
8042
8043   // If we're within 256K of the end of the buffer,
8044   // start over from the beginning. (Is 256K enough?)
8045   if (out > translation_cache+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE)
8046     out = translation_cache;
8047
8048   // Trap writes to any of the pages we compiled
8049   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
8050     invalid_code[i]=0;
8051   }
8052   inv_code_start=inv_code_end=~0;
8053
8054   // for PCSX we need to mark all mirrors too
8055   if(get_page(start)<(RAM_SIZE>>12))
8056     for(i=start>>12;i<=(start+slen*4)>>12;i++)
8057       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
8058       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
8059       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
8060
8061   /* Pass 10 - Free memory by expiring oldest blocks */
8062
8063   int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
8064   while(expirep!=end)
8065   {
8066     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
8067     uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<<shift); // Base address of this block
8068     inv_debug("EXP: Phase %d\n",expirep);
8069     switch((expirep>>11)&3)
8070     {
8071       case 0:
8072         // Clear jump_in and jump_dirty
8073         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
8074         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
8075         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
8076         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
8077         break;
8078       case 1:
8079         // Clear pointers
8080         ll_kill_pointers(jump_out[expirep&2047],base,shift);
8081         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
8082         break;
8083       case 2:
8084         // Clear hash table
8085         for(i=0;i<32;i++) {
8086           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
8087           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
8088              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
8089             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
8090             ht_bin->vaddr[1] = -1;
8091             ht_bin->tcaddr[1] = NULL;
8092           }
8093           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
8094              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
8095             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
8096             ht_bin->vaddr[0] = ht_bin->vaddr[1];
8097             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
8098             ht_bin->vaddr[1] = -1;
8099             ht_bin->tcaddr[1] = NULL;
8100           }
8101         }
8102         break;
8103       case 3:
8104         // Clear jump_out
8105         #ifdef __arm__
8106         if((expirep&2047)==0)
8107           do_clear_cache();
8108         #endif
8109         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
8110         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
8111         break;
8112     }
8113     expirep=(expirep+1)&65535;
8114   }
8115   return 0;
8116 }
8117
8118 // vim:shiftwidth=2:expandtab