1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t was32;
87   uint64_t is32;
88   uint64_t wasdirty;
89   uint64_t dirty;
90   uint64_t u;
91   uint64_t uu;
92   u_int wasconst;
93   u_int isconst;
94   u_int loadedconst;             // host regs that have constants loaded
95   u_int waswritten;              // MIPS regs that were used as store base before
96 };
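// Rough meaning of the fields above, inferred from their use in this file:
// regmap_entry/regmap map each host register to the MIPS register it holds
// at block entry / currently (-1 = free); was32/is32 are per-MIPS-reg flags
// for values known to be 32-bit sign-extended; wasdirty/dirty mark host regs
// that still need writing back; u/uu are the "unneeded" masks for the lower
// and upper halves; wasconst/isconst flag host regs holding known constants.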
97
98 // note: asm depends on this layout
99 struct ll_entry
100 {
101   u_int vaddr;
102   u_int reg_sv_flags;
103   void *addr;
104   struct ll_entry *next;
105 };
106
107 struct ht_entry
108 {
109   u_int vaddr[2];
110   void *tcaddr[2];
111 };
112
113 struct code_stub
114 {
115   enum stub_type type;
116   void *addr;
117   void *retaddr;
118   u_int a;
119   uintptr_t b;
120   uintptr_t c;
121   u_int d;
122   u_int e;
123 };
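// The a..e members are generic per-stub arguments; what they hold depends on
// the stub_type and on what add_stub()/add_stub_r() (declared further down)
// pack into them, e.g. an address register, a regstat pointer, the cycle
// adjustment and a register list for the load/store stubs.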
124
125   // used by asm:
126   u_char *out;
127   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
128   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
129   struct ll_entry *jump_dirty[4096];
130
131   static struct ll_entry *jump_out[4096];
132   static u_int start;
133   static u_int *source;
134   static char insn[MAXBLOCK][10];
135   static u_char itype[MAXBLOCK];
136   static u_char opcode[MAXBLOCK];
137   static u_char opcode2[MAXBLOCK];
138   static u_char bt[MAXBLOCK];
139   static u_char rs1[MAXBLOCK];
140   static u_char rs2[MAXBLOCK];
141   static u_char rt1[MAXBLOCK];
142   static u_char rt2[MAXBLOCK];
143   static u_char us1[MAXBLOCK];
144   static u_char us2[MAXBLOCK];
145   static u_char dep1[MAXBLOCK];
146   static u_char dep2[MAXBLOCK];
147   static u_char lt1[MAXBLOCK];
148   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
149   static uint64_t gte_rt[MAXBLOCK];
150   static uint64_t gte_unneeded[MAXBLOCK];
151   static u_int smrv[32]; // speculated MIPS register values
152   static u_int smrv_strong; // mask of regs that are likely to have correct values
153   static u_int smrv_weak; // same, but somewhat less likely
154   static u_int smrv_strong_next; // same, but after current insn executes
155   static u_int smrv_weak_next;
156   static int imm[MAXBLOCK];
157   static u_int ba[MAXBLOCK];
158   static char likely[MAXBLOCK];
159   static char is_ds[MAXBLOCK];
160   static char ooo[MAXBLOCK];
161   static uint64_t unneeded_reg[MAXBLOCK];
162   static uint64_t unneeded_reg_upper[MAXBLOCK];
163   static uint64_t branch_unneeded_reg[MAXBLOCK];
164   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
165   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
166   static uint64_t current_constmap[HOST_REGS];
167   static uint64_t constmap[MAXBLOCK][HOST_REGS];
168   static struct regstat regs[MAXBLOCK];
169   static struct regstat branch_regs[MAXBLOCK];
170   static signed char minimum_free_regs[MAXBLOCK];
171   static u_int needed_reg[MAXBLOCK];
172   static u_int wont_dirty[MAXBLOCK];
173   static u_int will_dirty[MAXBLOCK];
174   static int ccadj[MAXBLOCK];
175   static int slen;
176   static void *instr_addr[MAXBLOCK];
177   static u_int link_addr[MAXBLOCK][3];
178   static int linkcount;
179   static struct code_stub stubs[MAXBLOCK*3];
180   static int stubcount;
181   static u_int literals[1024][2];
182   static int literalcount;
183   static int is_delayslot;
184   static int cop1_usable;
185   static char shadow[1048576]  __attribute__((aligned(16)));
186   static void *copy;
187   static int expirep;
188   static u_int stop_after_jal;
189 #ifndef RAM_FIXED
190   static u_int ram_offset;
191 #else
192   static const u_int ram_offset=0;
193 #endif
194
195   int new_dynarec_hacks;
196   int new_dynarec_did_compile;
197   extern u_char restore_candidate[512];
198   extern int cycle_count;
199
200   /* registers that may be allocated */
201   /* 1-31 gpr */
202 #define HIREG 32 // hi
203 #define LOREG 33 // lo
204 #define FSREG 34 // FPU status (FCSR)
205 #define CSREG 35 // Coprocessor status
206 #define CCREG 36 // Cycle count
207 #define INVCP 37 // Pointer to invalid_code
208 //#define MMREG 38 // Pointer to memory_map
209 #define ROREG 39 // ram offset (if rdram!=0x80000000)
210 #define TEMPREG 40
211 #define FTEMP 40 // FPU temporary register
212 #define PTEMP 41 // Prefetch temporary register
213 //#define TLREG 42 // TLB mapping offset
214 #define RHASH 43 // Return address hash
215 #define RHTBL 44 // Return address hash table address
216 #define RTEMP 45 // JR/JALR address register
217 #define MAXREG 45
218 #define AGEN1 46 // Address generation temporary register
219 //#define AGEN2 47 // Address generation temporary register
220 //#define MGEN1 48 // Maptable address generation temporary register
221 //#define MGEN2 49 // Maptable address generation temporary register
222 #define BTREG 50 // Branch target temporary register
223
224   /* instruction types */
225 #define NOP 0     // No operation
226 #define LOAD 1    // Load
227 #define STORE 2   // Store
228 #define LOADLR 3  // Unaligned load
229 #define STORELR 4 // Unaligned store
230 #define MOV 5     // Move
231 #define ALU 6     // Arithmetic/logic
232 #define MULTDIV 7 // Multiply/divide
233 #define SHIFT 8   // Shift by register
234 #define SHIFTIMM 9 // Shift by immediate
235 #define IMM16 10  // 16-bit immediate
236 #define RJUMP 11  // Unconditional jump to register
237 #define UJUMP 12  // Unconditional jump
238 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
239 #define SJUMP 14  // Conditional branch (regimm format)
240 #define COP0 15   // Coprocessor 0
241 #define COP1 16   // Coprocessor 1
242 #define C1LS 17   // Coprocessor 1 load/store
243 #define FJUMP 18  // Conditional branch (floating point)
244 #define FLOAT 19  // Floating point unit
245 #define FCONV 20  // Convert integer to float
246 #define FCOMP 21  // Floating point compare (sets FSREG)
247 #define SYSCALL 22 // SYSCALL
248 #define OTHER 23  // Other
249 #define SPAN 24   // Branch/delay slot spans 2 pages
250 #define NI 25     // Not implemented
251 #define HLECALL 26 // PCSX fake opcodes for HLE
252 #define COP2 27   // Coprocessor 2 move
253 #define C2LS 28   // Coprocessor 2 load/store
254 #define C2OP 29   // Coprocessor 2 operation
255 #define INTCALL 30 // Call interpreter to handle rare corner cases
256
257   /* branch codes */
258 #define TAKEN 1
259 #define NOTTAKEN 2
260 #define NULLDS 3
261
262 // asm linkage
263 int new_recompile_block(int addr);
264 void *get_addr_ht(u_int vaddr);
265 void invalidate_block(u_int block);
266 void invalidate_addr(u_int addr);
267 void remove_hash(int vaddr);
268 void dyna_linker();
269 void dyna_linker_ds();
270 void verify_code();
271 void verify_code_vm();
272 void verify_code_ds();
273 void cc_interrupt();
274 void fp_exception();
275 void fp_exception_ds();
276 void jump_syscall_hle();
277 void jump_hlecall();
278 void jump_intcall();
279 void new_dyna_leave();
280
281 // Needed by assembler
282 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
283 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
284 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
285 static void load_all_regs(signed char i_regmap[]);
286 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
287 static void load_regs_entry(int t);
288 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
289
290 static int verify_dirty(u_int *ptr);
291 static int get_final_value(int hr, int i, int *value);
292 static void add_stub(enum stub_type type, void *addr, void *retaddr,
293   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
294 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
295   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
296 static void add_to_linker(int addr,int target,int ext);
297
298 static int tracedebug=0;
299
300 static void mprotect_w_x(void *start, void *end, int is_x)
301 {
302 #ifdef NO_WRITE_EXEC
303   #if defined(VITA)
304   // *Open* enables write on all memory that was
305   // allocated by sceKernelAllocMemBlockForVM()?
306   if (is_x)
307     sceKernelCloseVMDomain();
308   else
309     sceKernelOpenVMDomain();
310   #else
311   u_long mstart = (u_long)start & ~4095ul;
312   u_long mend = (u_long)end;
313   if (mprotect((void *)mstart, mend - mstart,
314                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
315     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
316   #endif
317 #endif
318 }
319
320 static void start_tcache_write(void *start, void *end)
321 {
322   mprotect_w_x(start, end, 0);
323 }
324
325 static void end_tcache_write(void *start, void *end)
326 {
327 #ifdef __arm__
328   size_t len = (char *)end - (char *)start;
329   #if   defined(__BLACKBERRY_QNX__)
330   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
331   #elif defined(__MACH__)
332   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
333   #elif defined(VITA)
334   sceKernelSyncVMDomain(sceBlock, start, len);
335   #elif defined(_3DS)
336   ctr_flush_invalidate_cache();
337   #else
338   __clear_cache(start, end);
339   #endif
340   (void)len;
341 #endif
342
343   mprotect_w_x(start, end, 1);
344 }
345
346 static void *start_block(void)
347 {
348   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
349   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
350     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
351   start_tcache_write(out, end);
352   return out;
353 }
354
355 static void end_block(void *start)
356 {
357   end_tcache_write(start, out);
358 }
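// Expected usage of the pair above (a sketch, not a quote of the code below):
//   void *beg = start_block();   // make the output region writable
//   ... emit instructions, advancing 'out' ...
//   end_block(beg);              // flush the icache, restore execute permission
// When NO_WRITE_EXEC is not defined, mprotect_w_x() compiles to a no-op and
// only the cache flush in end_tcache_write() does anything.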
359
360 //#define DEBUG_CYCLE_COUNT 1
361
362 #define NO_CYCLE_PENALTY_THR 12
363
364 int cycle_multiplier; // 100 for 1.0
365
366 static int CLOCK_ADJUST(int x)
367 {
368   int s=(x>>31)|1;
369   return (x * cycle_multiplier + s * 50) / 100;
370 }
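// Worked example, with an assumed cycle_multiplier of 150 (1.5x cycle cost):
// CLOCK_ADJUST(7) = (7*150 + 50)/100 = 11, and CLOCK_ADJUST(-7) = -11, i.e.
// the s term makes the division round to nearest with halves going away from
// zero, instead of always truncating toward zero.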
371
372 static u_int get_page(u_int vaddr)
373 {
374   u_int page=vaddr&~0xe0000000;
375   if (page < 0x1000000)
376     page &= ~0x0e00000; // RAM mirrors
377   page>>=12;
378   if(page>2048) page=2048+(page&2047);
379   return page;
380 }
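// Example, using typical PSX addresses: 0x80030000 and its RAM mirror
// 0x80230000 both map to page 0x30, since the KSEG bits are stripped and the
// mirror bits masked off; addresses outside the first 16MB, such as the BIOS
// vector 0xbfc00180, fold into the shared page range 2048..4095.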
381
382 // no virtual mem in PCSX
383 static u_int get_vpage(u_int vaddr)
384 {
385   return get_page(vaddr);
386 }
387
388 static struct ht_entry *hash_table_get(u_int vaddr)
389 {
390   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
391 }
392
393 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
394 {
395   ht_bin->vaddr[1] = ht_bin->vaddr[0];
396   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
397   ht_bin->vaddr[0] = vaddr;
398   ht_bin->tcaddr[0] = tcaddr;
399 }
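// Each hash bin holds two entries: a new entry goes into slot 0 and the old
// slot-0 entry is demoted to slot 1, so a bin behaves like a tiny 2-way,
// most-recently-added-first cache in front of the jump_in/jump_dirty lists.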
400
401 // some messy code from ari64; it seems to rely on unsigned 32-bit overflow
402 static int doesnt_expire_soon(void *tcaddr)
403 {
404   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
405   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
406 }
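// The shift scales the offset from the current output pointer so that the
// whole translation cache spans the full 32-bit range, letting unsigned
// overflow handle wraparound.  Blocks sitting within roughly 3/8 of the
// cache (0x60000000 scaled), plus one maximum-size block, ahead of 'out'
// are considered about to expire.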
407
408 // Get address from virtual address
409 // This is called from the recompiled JR/JALR instructions
410 void *get_addr(u_int vaddr)
411 {
412   u_int page=get_page(vaddr);
413   u_int vpage=get_vpage(vaddr);
414   struct ll_entry *head;
415   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
416   head=jump_in[page];
417   while(head!=NULL) {
418     if(head->vaddr==vaddr) {
419   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
420       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
421       return head->addr;
422     }
423     head=head->next;
424   }
425   head=jump_dirty[vpage];
426   while(head!=NULL) {
427     if(head->vaddr==vaddr) {
428       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
429       // Don't restore blocks which are about to expire from the cache
430       if (doesnt_expire_soon(head->addr))
431       if (verify_dirty(head->addr)) {
432         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
433         invalid_code[vaddr>>12]=0;
434         inv_code_start=inv_code_end=~0;
435         if(vpage<2048) {
436           restore_candidate[vpage>>3]|=1<<(vpage&7);
437         }
438         else restore_candidate[page>>3]|=1<<(page&7);
439         struct ht_entry *ht_bin = hash_table_get(vaddr);
440         if (ht_bin->vaddr[0] == vaddr)
441           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
442         else
443           hash_table_add(ht_bin, vaddr, head->addr);
444
445         return head->addr;
446       }
447     }
448     head=head->next;
449   }
450   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
451   int r=new_recompile_block(vaddr);
452   if(r==0) return get_addr(vaddr);
453   // Execute in unmapped page, generate pagefault exception
454   Status|=2;
455   Cause=(vaddr<<31)|0x8;
456   EPC=(vaddr&1)?vaddr-5:vaddr;
457   BadVAddr=(vaddr&~1);
458   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
459   EntryHi=BadVAddr&0xFFFFE000;
460   return get_addr_ht(0x80000000);
461 }
462 // Look up address in hash table first
463 void *get_addr_ht(u_int vaddr)
464 {
465   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
466   const struct ht_entry *ht_bin = hash_table_get(vaddr);
467   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
468   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
469   return get_addr(vaddr);
470 }
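// So the overall lookup order for a branch target is: the two-entry hash bin
// (fast path), then get_addr() walking jump_in for the page, then jump_dirty
// with verify_dirty() revalidation, and finally new_recompile_block() on a
// complete miss.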
471
472 void clear_all_regs(signed char regmap[])
473 {
474   int hr;
475   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
476 }
477
478 signed char get_reg(signed char regmap[],int r)
479 {
480   int hr;
481   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
482   return -1;
483 }
484
485 // Find a register that is available for two consecutive cycles
486 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
487 {
488   int hr;
489   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
490   return -1;
491 }
492
493 int count_free_regs(signed char regmap[])
494 {
495   int count=0;
496   int hr;
497   for(hr=0;hr<HOST_REGS;hr++)
498   {
499     if(hr!=EXCLUDE_REG) {
500       if(regmap[hr]<0) count++;
501     }
502   }
503   return count;
504 }
505
506 void dirty_reg(struct regstat *cur,signed char reg)
507 {
508   int hr;
509   if(!reg) return;
510   for (hr=0;hr<HOST_REGS;hr++) {
511     if((cur->regmap[hr]&63)==reg) {
512       cur->dirty|=1<<hr;
513     }
514   }
515 }
516
517 // If we dirty the lower half of a 64 bit register which is now being
518 // sign-extended, we need to dump the upper half.
519 // Note: Do this only after completion of the instruction, because
520 // some instructions may need to read the full 64-bit value even if
521 // overwriting it (eg SLTI, DSRA32).
522 static void flush_dirty_uppers(struct regstat *cur)
523 {
524   int hr,reg;
525   for (hr=0;hr<HOST_REGS;hr++) {
526     if((cur->dirty>>hr)&1) {
527       reg=cur->regmap[hr];
528       if(reg>=64)
529         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
530     }
531   }
532 }
533
534 void set_const(struct regstat *cur,signed char reg,uint64_t value)
535 {
536   int hr;
537   if(!reg) return;
538   for (hr=0;hr<HOST_REGS;hr++) {
539     if(cur->regmap[hr]==reg) {
540       cur->isconst|=1<<hr;
541       current_constmap[hr]=value;
542     }
543     else if((cur->regmap[hr]^64)==reg) {
544       cur->isconst|=1<<hr;
545       current_constmap[hr]=value>>32;
546     }
547   }
548 }
549
550 void clear_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if((cur->regmap[hr]&63)==reg) {
556       cur->isconst&=~(1<<hr);
557     }
558   }
559 }
560
561 int is_const(struct regstat *cur,signed char reg)
562 {
563   int hr;
564   if(reg<0) return 0;
565   if(!reg) return 1;
566   for (hr=0;hr<HOST_REGS;hr++) {
567     if((cur->regmap[hr]&63)==reg) {
568       return (cur->isconst>>hr)&1;
569     }
570   }
571   return 0;
572 }
573 uint64_t get_const(struct regstat *cur,signed char reg)
574 {
575   int hr;
576   if(!reg) return 0;
577   for (hr=0;hr<HOST_REGS;hr++) {
578     if(cur->regmap[hr]==reg) {
579       return current_constmap[hr];
580     }
581   }
582   SysPrintf("Unknown constant in r%d\n",reg);
583   exit(1);
584 }
585
586 // Least soon needed registers
587 // Look at the next ten instructions and see which registers
588 // will be used.  Try not to reallocate these.
589 void lsn(u_char hsn[], int i, int *preferred_reg)
590 {
591   int j;
592   int b=-1;
593   for(j=0;j<9;j++)
594   {
595     if(i+j>=slen) {
596       j=slen-i-1;
597       break;
598     }
599     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
600     {
601       // Don't go past an unconditional jump
602       j++;
603       break;
604     }
605   }
606   for(;j>=0;j--)
607   {
608     if(rs1[i+j]) hsn[rs1[i+j]]=j;
609     if(rs2[i+j]) hsn[rs2[i+j]]=j;
610     if(rt1[i+j]) hsn[rt1[i+j]]=j;
611     if(rt2[i+j]) hsn[rt2[i+j]]=j;
612     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
613       // Stores can allocate zero
614       hsn[rs1[i+j]]=j;
615       hsn[rs2[i+j]]=j;
616     }
617     // On some architectures stores need invc_ptr
618     #if defined(HOST_IMM8)
619     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
620       hsn[INVCP]=j;
621     }
622     #endif
623     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
624     {
625       hsn[CCREG]=j;
626       b=j;
627     }
628   }
629   if(b>=0)
630   {
631     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
632     {
633       // Follow first branch
634       int t=(ba[i+b]-start)>>2;
635       j=7-b;if(t+j>=slen) j=slen-t-1;
636       for(;j>=0;j--)
637       {
638         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
639         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
640         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
641         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
642       }
643     }
644     // TODO: preferred register based on backward branch
645   }
646   // Delay slot should preferably not overwrite branch conditions or cycle count
647   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
648     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
649     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
650     hsn[CCREG]=1;
651     // ...or hash tables
652     hsn[RHASH]=1;
653     hsn[RHTBL]=1;
654   }
655   // Coprocessor load/store needs FTEMP, even if not declared
656   if(itype[i]==C1LS||itype[i]==C2LS) {
657     hsn[FTEMP]=0;
658   }
659   // Load L/R also uses FTEMP as a temporary register
660   if(itype[i]==LOADLR) {
661     hsn[FTEMP]=0;
662   }
663   // Also SWL/SWR/SDL/SDR
664   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
665     hsn[FTEMP]=0;
666   }
667   // Don't remove the miniht registers
668   if(itype[i]==UJUMP||itype[i]==RJUMP)
669   {
670     hsn[RHASH]=0;
671     hsn[RHTBL]=0;
672   }
673 }
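// hsn[] (apparently "how soon needed") ends up holding the distance, in
// instructions, to each register's next expected use; the allocator prefers
// to evict registers with large hsn values, while entries pinned to 0 or 1
// above (FTEMP, RHASH, RHTBL, CCREG) are effectively kept resident.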
674
675 // We only want to allocate registers if we're going to use them again soon
676 int needed_again(int r, int i)
677 {
678   int j;
679   int b=-1;
680   int rn=10;
681
682   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
683   {
684     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
685       return 0; // Don't need any registers if exiting the block
686   }
687   for(j=0;j<9;j++)
688   {
689     if(i+j>=slen) {
690       j=slen-i-1;
691       break;
692     }
693     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
694     {
695       // Don't go past an unconditional jump
696       j++;
697       break;
698     }
699     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
700     {
701       break;
702     }
703   }
704   for(;j>=1;j--)
705   {
706     if(rs1[i+j]==r) rn=j;
707     if(rs2[i+j]==r) rn=j;
708     if((unneeded_reg[i+j]>>r)&1) rn=10;
709     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
710     {
711       b=j;
712     }
713   }
714   /*
715   if(b>=0)
716   {
717     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
718     {
719       // Follow first branch
720       int o=rn;
721       int t=(ba[i+b]-start)>>2;
722       j=7-b;if(t+j>=slen) j=slen-t-1;
723       for(;j>=0;j--)
724       {
725         if(!((unneeded_reg[t+j]>>r)&1)) {
726           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
727           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
728         }
729         else rn=o;
730       }
731     }
732   }*/
733   if(rn<10) return 1;
734   (void)b;
735   return 0;
736 }
737
738 // Try to match register allocations at the end of a loop with those
739 // at the beginning
740 int loop_reg(int i, int r, int hr)
741 {
742   int j,k;
743   for(j=0;j<9;j++)
744   {
745     if(i+j>=slen) {
746       j=slen-i-1;
747       break;
748     }
749     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
750     {
751       // Don't go past an unconditional jump
752       j++;
753       break;
754     }
755   }
756   k=0;
757   if(i>0){
758     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
759       k--;
760   }
761   for(;k<j;k++)
762   {
763     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
764     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
765     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
766     {
767       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
768       {
769         int t=(ba[i+k]-start)>>2;
770         int reg=get_reg(regs[t].regmap_entry,r);
771         if(reg>=0) return reg;
772         //reg=get_reg(regs[t+1].regmap_entry,r);
773         //if(reg>=0) return reg;
774       }
775     }
776   }
777   return hr;
778 }
779
780
781 // Allocate every register, preserving source/target regs
782 void alloc_all(struct regstat *cur,int i)
783 {
784   int hr;
785
786   for(hr=0;hr<HOST_REGS;hr++) {
787     if(hr!=EXCLUDE_REG) {
788       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
789          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
790       {
791         cur->regmap[hr]=-1;
792         cur->dirty&=~(1<<hr);
793       }
794       // Don't need zeros
795       if((cur->regmap[hr]&63)==0)
796       {
797         cur->regmap[hr]=-1;
798         cur->dirty&=~(1<<hr);
799       }
800     }
801   }
802 }
803
804 #ifdef __i386__
805 #include "assem_x86.c"
806 #endif
807 #ifdef __x86_64__
808 #include "assem_x64.c"
809 #endif
810 #ifdef __arm__
811 #include "assem_arm.c"
812 #endif
813
814 // Add virtual address mapping to linked list
815 void ll_add(struct ll_entry **head,int vaddr,void *addr)
816 {
817   struct ll_entry *new_entry;
818   new_entry=malloc(sizeof(struct ll_entry));
819   assert(new_entry!=NULL);
820   new_entry->vaddr=vaddr;
821   new_entry->reg_sv_flags=0;
822   new_entry->addr=addr;
823   new_entry->next=*head;
824   *head=new_entry;
825 }
826
827 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
828 {
829   ll_add(head,vaddr,addr);
830   (*head)->reg_sv_flags=reg_sv_flags;
831 }
832
833 // Check if an address is already compiled
834 // but don't return addresses which are about to expire from the cache
835 void *check_addr(u_int vaddr)
836 {
837   struct ht_entry *ht_bin = hash_table_get(vaddr);
838   size_t i;
839   for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
840     if (ht_bin->vaddr[i] == vaddr)
841       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
842         if (isclean(ht_bin->tcaddr[i]))
843           return ht_bin->tcaddr[i];
844   }
845   u_int page=get_page(vaddr);
846   struct ll_entry *head;
847   head=jump_in[page];
848   while (head != NULL) {
849     if (head->vaddr == vaddr) {
850       if (doesnt_expire_soon(head->addr)) {
851         // Update existing entry with current address
852         if (ht_bin->vaddr[0] == vaddr) {
853           ht_bin->tcaddr[0] = head->addr;
854           return head->addr;
855         }
856         if (ht_bin->vaddr[1] == vaddr) {
857           ht_bin->tcaddr[1] = head->addr;
858           return head->addr;
859         }
860         // Insert into hash table with low priority.
861         // Don't evict existing entries, as they are probably
862         // addresses that are being accessed frequently.
863         if (ht_bin->vaddr[0] == -1) {
864           ht_bin->vaddr[0] = vaddr;
865           ht_bin->tcaddr[0] = head->addr;
866         }
867         else if (ht_bin->vaddr[1] == -1) {
868           ht_bin->vaddr[1] = vaddr;
869           ht_bin->tcaddr[1] = head->addr;
870         }
871         return head->addr;
872       }
873     }
874     head=head->next;
875   }
876   return 0;
877 }
878
879 void remove_hash(int vaddr)
880 {
881   //printf("remove hash: %x\n",vaddr);
882   struct ht_entry *ht_bin = hash_table_get(vaddr);
883   if (ht_bin->vaddr[1] == vaddr) {
884     ht_bin->vaddr[1] = -1;
885     ht_bin->tcaddr[1] = NULL;
886   }
887   if (ht_bin->vaddr[0] == vaddr) {
888     ht_bin->vaddr[0] = ht_bin->vaddr[1];
889     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
890     ht_bin->vaddr[1] = -1;
891     ht_bin->tcaddr[1] = NULL;
892   }
893 }
894
895 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
896 {
897   struct ll_entry *next;
898   while(*head) {
899     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
900        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
901     {
902       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
903       remove_hash((*head)->vaddr);
904       next=(*head)->next;
905       free(*head);
906       *head=next;
907     }
908     else
909     {
910       head=&((*head)->next);
911     }
912   }
913 }
914
915 // Remove all entries from linked list
916 void ll_clear(struct ll_entry **head)
917 {
918   struct ll_entry *cur;
919   struct ll_entry *next;
920   if((cur=*head)) {
921     *head=0;
922     while(cur) {
923       next=cur->next;
924       free(cur);
925       cur=next;
926     }
927   }
928 }
929
930 // Dereference the jump pointers and unlink them if the target matches
931 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
932 {
933   while(head) {
934     int ptr=get_pointer(head->addr);
935     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
936     if(((ptr>>shift)==(addr>>shift)) ||
937        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
938     {
939       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
940       void *host_addr=find_extjump_insn(head->addr);
941       #ifdef __arm__
942         mark_clear_cache(host_addr);
943       #endif
944       set_jump_target(host_addr, head->addr);
945     }
946     head=head->next;
947   }
948 }
949
950 // This is called when we write to a compiled block (see do_invstub)
951 void invalidate_page(u_int page)
952 {
953   struct ll_entry *head;
954   struct ll_entry *next;
955   head=jump_in[page];
956   jump_in[page]=0;
957   while(head!=NULL) {
958     inv_debug("INVALIDATE: %x\n",head->vaddr);
959     remove_hash(head->vaddr);
960     next=head->next;
961     free(head);
962     head=next;
963   }
964   head=jump_out[page];
965   jump_out[page]=0;
966   while(head!=NULL) {
967     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
968     void *host_addr=find_extjump_insn(head->addr);
969     #ifdef __arm__
970       mark_clear_cache(host_addr);
971     #endif
972     set_jump_target(host_addr, head->addr);
973     next=head->next;
974     free(head);
975     head=next;
976   }
977 }
978
979 static void invalidate_block_range(u_int block, u_int first, u_int last)
980 {
981   u_int page=get_page(block<<12);
982   //printf("first=%d last=%d\n",first,last);
983   invalidate_page(page);
984   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
985   assert(last<page+5);
986   // Invalidate the adjacent pages if a block crosses a 4K boundary
987   while(first<page) {
988     invalidate_page(first);
989     first++;
990   }
991   for(first=page+1;first<last;first++) {
992     invalidate_page(first);
993   }
994   #ifdef __arm__
995     do_clear_cache();
996   #endif
997
998   // Don't trap writes
999   invalid_code[block]=1;
1000
1001   #ifdef USE_MINI_HT
1002   memset(mini_ht,-1,sizeof(mini_ht));
1003   #endif
1004 }
1005
1006 void invalidate_block(u_int block)
1007 {
1008   u_int page=get_page(block<<12);
1009   u_int vpage=get_vpage(block<<12);
1010   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1011   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1012   u_int first,last;
1013   first=last=page;
1014   struct ll_entry *head;
1015   head=jump_dirty[vpage];
1016   //printf("page=%d vpage=%d\n",page,vpage);
1017   while(head!=NULL) {
1018     u_int start,end;
1019     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1020       get_bounds((int)head->addr,&start,&end);
1021       //printf("start: %x end: %x\n",start,end);
1022       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1023         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1024           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1025           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1026         }
1027       }
1028     }
1029     head=head->next;
1030   }
1031   invalidate_block_range(block,first,last);
1032 }
1033
1034 void invalidate_addr(u_int addr)
1035 {
1036   //static int rhits;
1037   // this check is done by the caller
1038   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1039   u_int page=get_vpage(addr);
1040   if(page<2048) { // RAM
1041     struct ll_entry *head;
1042     u_int addr_min=~0, addr_max=0;
1043     u_int mask=RAM_SIZE-1;
1044     u_int addr_main=0x80000000|(addr&mask);
1045     int pg1;
1046     inv_code_start=addr_main&~0xfff;
1047     inv_code_end=addr_main|0xfff;
1048     pg1=page;
1049     if (pg1>0) {
1050       // must check previous page too because of spans..
1051       pg1--;
1052       inv_code_start-=0x1000;
1053     }
1054     for(;pg1<=page;pg1++) {
1055       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1056         u_int start,end;
1057         get_bounds((int)head->addr,&start,&end);
1058         if(ram_offset) {
1059           start-=ram_offset;
1060           end-=ram_offset;
1061         }
1062         if(start<=addr_main&&addr_main<end) {
1063           if(start<addr_min) addr_min=start;
1064           if(end>addr_max) addr_max=end;
1065         }
1066         else if(addr_main<start) {
1067           if(start<inv_code_end)
1068             inv_code_end=start-1;
1069         }
1070         else {
1071           if(end>inv_code_start)
1072             inv_code_start=end;
1073         }
1074       }
1075     }
1076     if (addr_min!=~0) {
1077       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1078       inv_code_start=inv_code_end=~0;
1079       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1080       return;
1081     }
1082     else {
1083       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1084       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1085       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1086       return;
1087     }
1088   }
1089   invalidate_block(addr>>12);
1090 }
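// inv_code_start/inv_code_end cache a range around 'addr' that is known to
// contain no compiled code, so the caller (see the commented-out check at
// the top of this function) can skip invalidation work for further writes
// into that range.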
1091
1092 // This is called when loading a save state.
1093 // Anything could have changed, so invalidate everything.
1094 void invalidate_all_pages()
1095 {
1096   u_int page;
1097   for(page=0;page<4096;page++)
1098     invalidate_page(page);
1099   for(page=0;page<1048576;page++)
1100     if(!invalid_code[page]) {
1101       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1102       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1103     }
1104   #ifdef USE_MINI_HT
1105   memset(mini_ht,-1,sizeof(mini_ht));
1106   #endif
1107 }
1108
1109 // Add an entry to jump_out after making a link
1110 void add_link(u_int vaddr,void *src)
1111 {
1112   u_int page=get_page(vaddr);
1113   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1114   int *ptr=(int *)(src+4);
1115   assert((*ptr&0x0fff0000)==0x059f0000);
1116   (void)ptr;
1117   ll_add(jump_out+page,vaddr,src);
1118   //int ptr=get_pointer(src);
1119   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1120 }
1121
1122 // If a code block was found to be unmodified (bit was set in
1123 // restore_candidate) and it remains unmodified (bit is clear
1124 // in invalid_code) then move the entries for that 4K page from
1125 // the dirty list to the clean list.
1126 void clean_blocks(u_int page)
1127 {
1128   struct ll_entry *head;
1129   inv_debug("INV: clean_blocks page=%d\n",page);
1130   head=jump_dirty[page];
1131   while(head!=NULL) {
1132     if(!invalid_code[head->vaddr>>12]) {
1133       // Don't restore blocks which are about to expire from the cache
1134       if (doesnt_expire_soon(head->addr)) {
1135         u_int start,end;
1136         if(verify_dirty(head->addr)) {
1137           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1138           u_int i;
1139           u_int inv=0;
1140           get_bounds((int)head->addr,&start,&end);
1141           if(start-(u_int)rdram<RAM_SIZE) {
1142             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1143               inv|=invalid_code[i];
1144             }
1145           }
1146           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1147             inv=1;
1148           }
1149           if(!inv) {
1150             void *clean_addr = get_clean_addr(head->addr);
1151             if (doesnt_expire_soon(clean_addr)) {
1152               u_int ppage=page;
1153               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1154               //printf("page=%x, addr=%x\n",page,head->vaddr);
1155               //assert(head->vaddr>>12==(page|0x80000));
1156               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1157               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1158               if (ht_bin->vaddr[0] == head->vaddr)
1159                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1160               if (ht_bin->vaddr[1] == head->vaddr)
1161                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1162             }
1163           }
1164         }
1165       }
1166     }
1167     head=head->next;
1168   }
1169 }
1170
1171
1172 void mov_alloc(struct regstat *current,int i)
1173 {
1174   // Note: Don't need to actually alloc the source registers
1175   if((~current->is32>>rs1[i])&1) {
1176     //alloc_reg64(current,i,rs1[i]);
1177     alloc_reg64(current,i,rt1[i]);
1178     current->is32&=~(1LL<<rt1[i]);
1179   } else {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is it really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, so we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713 void c2op_alloc(struct regstat *current,int i)
1714 {
1715   alloc_reg_temp(current,i,-1);
1716 }
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
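// Instructions handled by exiting to a C handler (SYSCALL etc.):
// allocate the cycle counter, flush all host registers and drop constants.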
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
1734
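// Allocate registers for the instruction sitting in a branch delay slot,
// dispatching on itype.  A jump in the delay slot is not supported;
// speculative precompilation is disabled instead.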
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i]) {
1738     case UJUMP:
1739     case CJUMP:
1740     case SJUMP:
1741     case RJUMP:
1742     case FJUMP:
1743     case SYSCALL:
1744     case HLECALL:
1745     case SPAN:
1746       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1747       SysPrintf("Disabled speculative precompilation\n");
1748       stop_after_jal=1;
1749       break;
1750     case IMM16:
1751       imm16_alloc(current,i);
1752       break;
1753     case LOAD:
1754     case LOADLR:
1755       load_alloc(current,i);
1756       break;
1757     case STORE:
1758     case STORELR:
1759       store_alloc(current,i);
1760       break;
1761     case ALU:
1762       alu_alloc(current,i);
1763       break;
1764     case SHIFT:
1765       shift_alloc(current,i);
1766       break;
1767     case MULTDIV:
1768       multdiv_alloc(current,i);
1769       break;
1770     case SHIFTIMM:
1771       shiftimm_alloc(current,i);
1772       break;
1773     case MOV:
1774       mov_alloc(current,i);
1775       break;
1776     case COP0:
1777       cop0_alloc(current,i);
1778       break;
1779     case COP1:
1780     case COP2:
1781       cop1_alloc(current,i);
1782       break;
1783     case C1LS:
1784       c1ls_alloc(current,i);
1785       break;
1786     case C2LS:
1787       c2ls_alloc(current,i);
1788       break;
1789     case FCONV:
1790       fconv_alloc(current,i);
1791       break;
1792     case FLOAT:
1793       float_alloc(current,i);
1794       break;
1795     case FCOMP:
1796       fcomp_alloc(current,i);
1797       break;
1798     case C2OP:
1799       c2op_alloc(current,i);
1800       break;
1801   }
1802 }
1803
1804 // Special case where a branch and delay slot span two pages in virtual memory
1805 static void pagespan_alloc(struct regstat *current,int i)
1806 {
1807   current->isconst=0;
1808   current->wasconst=0;
1809   regs[i].wasconst=0;
1810   minimum_free_regs[i]=HOST_REGS;
1811   alloc_all(current,i);
1812   alloc_cc(current,i);
1813   dirty_reg(current,CCREG);
1814   if(opcode[i]==3) // JAL
1815   {
1816     alloc_reg(current,i,31);
1817     dirty_reg(current,31);
1818   }
1819   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1820   {
1821     alloc_reg(current,i,rs1[i]);
1822     if (rt1[i]!=0) {
1823       alloc_reg(current,i,rt1[i]);
1824       dirty_reg(current,rt1[i]);
1825     }
1826   }
1827   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1828   {
1829     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1830     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1831     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1832     {
1833       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1834       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1835     }
1836   }
1837   else
1838   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1839   {
1840     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1841     if(!((current->is32>>rs1[i])&1))
1842     {
1843       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1844     }
1845   }
1846   else
1847   if(opcode[i]==0x11) // BC1
1848   {
1849     alloc_reg(current,i,FSREG);
1850     alloc_reg(current,i,CSREG);
1851   }
1852   //else ...
1853 }
1854
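// Record an out-of-line stub to be generated after the main block.
// As used below, 'addr' is the fast-path branch that gets patched to enter
// the stub, 'retaddr' is where the stub returns to, and a..e are
// type-specific parameters consumed by the do_*stub emitters.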
1855 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1856   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1857 {
1858   assert(stubcount < ARRAY_SIZE(stubs));
1859   stubs[stubcount].type = type;
1860   stubs[stubcount].addr = addr;
1861   stubs[stubcount].retaddr = retaddr;
1862   stubs[stubcount].a = a;
1863   stubs[stubcount].b = b;
1864   stubs[stubcount].c = c;
1865   stubs[stubcount].d = d;
1866   stubs[stubcount].e = e;
1867   stubcount++;
1868 }
1869
1870 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1871   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1872 {
1873   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1874 }
1875
1876 // Write out a single register
1877 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1878 {
1879   int hr;
1880   for(hr=0;hr<HOST_REGS;hr++) {
1881     if(hr!=EXCLUDE_REG) {
1882       if((regmap[hr]&63)==r) {
1883         if((dirty>>hr)&1) {
1884           if(regmap[hr]<64) {
1885             emit_storereg(r,hr);
1886           }else{
1887             emit_storereg(r|64,hr);
1888           }
1889         }
1890       }
1891     }
1892   }
1893 }
1894
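// Debug/trace helpers: mchecksum() and rchecksum() hash RDRAM and the
// register file, rlist() dumps the registers, memdebug() prints a trace
// line for a chosen cycle range.  Normally only referenced from the
// commented-out debug code further down.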
1895 int mchecksum()
1896 {
1897   //if(!tracedebug) return 0;
1898   int i;
1899   int sum=0;
1900   for(i=0;i<2097152;i++) {
1901     unsigned int temp=sum;
1902     sum<<=1;
1903     sum|=(~temp)>>31;
1904     sum^=((u_int *)rdram)[i];
1905   }
1906   return sum;
1907 }
1908 int rchecksum()
1909 {
1910   int i;
1911   int sum=0;
1912   for(i=0;i<64;i++)
1913     sum^=((u_int *)reg)[i];
1914   return sum;
1915 }
1916 void rlist()
1917 {
1918   int i;
1919   printf("TRACE: ");
1920   for(i=0;i<32;i++)
1921     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1922   printf("\n");
1923 }
1924
1925 void enabletrace()
1926 {
1927   tracedebug=1;
1928 }
1929
1930 void memdebug(int i)
1931 {
1932   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1933   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1934   //rlist();
1935   //if(tracedebug) {
1936   //if(Count>=-2084597794) {
1937   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1938   //if(0) {
1939     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1940     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1941     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1942     rlist();
1943     #ifdef __i386__
1944     printf("TRACE: %x\n",(&i)[-1]);
1945     #endif
1946     #ifdef __arm__
1947     int j;
1948     printf("TRACE: %x \n",(&j)[10]);
1949     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1950     #endif
1951     //fflush(stdout);
1952   }
1953   //printf("TRACE: %x\n",(&i)[-1]);
1954 }
1955
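// Emit native code for R-type ALU operations: 32-bit ADD/SUB, 64-bit
// DADD/DSUB on register pairs, SLT/SLTU and AND/OR/XOR/NOR.  Sources not
// currently mapped to a host register are reloaded with emit_loadreg;
// writes to r0 are skipped entirely.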
1956 void alu_assemble(int i,struct regstat *i_regs)
1957 {
1958   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1959     if(rt1[i]) {
1960       signed char s1,s2,t;
1961       t=get_reg(i_regs->regmap,rt1[i]);
1962       if(t>=0) {
1963         s1=get_reg(i_regs->regmap,rs1[i]);
1964         s2=get_reg(i_regs->regmap,rs2[i]);
1965         if(rs1[i]&&rs2[i]) {
1966           assert(s1>=0);
1967           assert(s2>=0);
1968           if(opcode2[i]&2) emit_sub(s1,s2,t);
1969           else emit_add(s1,s2,t);
1970         }
1971         else if(rs1[i]) {
1972           if(s1>=0) emit_mov(s1,t);
1973           else emit_loadreg(rs1[i],t);
1974         }
1975         else if(rs2[i]) {
1976           if(s2>=0) {
1977             if(opcode2[i]&2) emit_neg(s2,t);
1978             else emit_mov(s2,t);
1979           }
1980           else {
1981             emit_loadreg(rs2[i],t);
1982             if(opcode2[i]&2) emit_neg(t,t);
1983           }
1984         }
1985         else emit_zeroreg(t);
1986       }
1987     }
1988   }
1989   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1990     if(rt1[i]) {
1991       signed char s1l,s2l,s1h,s2h,tl,th;
1992       tl=get_reg(i_regs->regmap,rt1[i]);
1993       th=get_reg(i_regs->regmap,rt1[i]|64);
1994       if(tl>=0) {
1995         s1l=get_reg(i_regs->regmap,rs1[i]);
1996         s2l=get_reg(i_regs->regmap,rs2[i]);
1997         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1998         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1999         if(rs1[i]&&rs2[i]) {
2000           assert(s1l>=0);
2001           assert(s2l>=0);
2002           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2003           else emit_adds(s1l,s2l,tl);
2004           if(th>=0) {
2005             #ifdef INVERTED_CARRY
2006             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2007             #else
2008             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2009             #endif
2010             else emit_adc(s1h,s2h,th);
2011           }
2012         }
2013         else if(rs1[i]) {
2014           if(s1l>=0) emit_mov(s1l,tl);
2015           else emit_loadreg(rs1[i],tl);
2016           if(th>=0) {
2017             if(s1h>=0) emit_mov(s1h,th);
2018             else emit_loadreg(rs1[i]|64,th);
2019           }
2020         }
2021         else if(rs2[i]) {
2022           if(s2l>=0) {
2023             if(opcode2[i]&2) emit_negs(s2l,tl);
2024             else emit_mov(s2l,tl);
2025           }
2026           else {
2027             emit_loadreg(rs2[i],tl);
2028             if(opcode2[i]&2) emit_negs(tl,tl);
2029           }
2030           if(th>=0) {
2031             #ifdef INVERTED_CARRY
2032             if(s2h>=0) emit_mov(s2h,th);
2033             else emit_loadreg(rs2[i]|64,th);
2034             if(opcode2[i]&2) {
2035               emit_adcimm(-1,th); // x86 has inverted carry flag
2036               emit_not(th,th);
2037             }
2038             #else
2039             if(opcode2[i]&2) {
2040               if(s2h>=0) emit_rscimm(s2h,0,th);
2041               else {
2042                 emit_loadreg(rs2[i]|64,th);
2043                 emit_rscimm(th,0,th);
2044               }
2045             }else{
2046               if(s2h>=0) emit_mov(s2h,th);
2047               else emit_loadreg(rs2[i]|64,th);
2048             }
2049             #endif
2050           }
2051         }
2052         else {
2053           emit_zeroreg(tl);
2054           if(th>=0) emit_zeroreg(th);
2055         }
2056       }
2057     }
2058   }
2059   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2060     if(rt1[i]) {
2061       signed char s1l,s1h,s2l,s2h,t;
2062       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2063       {
2064         t=get_reg(i_regs->regmap,rt1[i]);
2065         //assert(t>=0);
2066         if(t>=0) {
2067           s1l=get_reg(i_regs->regmap,rs1[i]);
2068           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2069           s2l=get_reg(i_regs->regmap,rs2[i]);
2070           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2071           if(rs2[i]==0) // rx<r0
2072           {
2073             assert(s1h>=0);
2074             if(opcode2[i]==0x2a) // SLT
2075               emit_shrimm(s1h,31,t);
2076             else // SLTU (unsigned can not be less than zero)
2077               emit_zeroreg(t);
2078           }
2079           else if(rs1[i]==0) // r0<rx
2080           {
2081             assert(s2h>=0);
2082             if(opcode2[i]==0x2a) // SLT
2083               emit_set_gz64_32(s2h,s2l,t);
2084             else // SLTU (set if not zero)
2085               emit_set_nz64_32(s2h,s2l,t);
2086           }
2087           else {
2088             assert(s1l>=0);assert(s1h>=0);
2089             assert(s2l>=0);assert(s2h>=0);
2090             if(opcode2[i]==0x2a) // SLT
2091               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2092             else // SLTU
2093               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2094           }
2095         }
2096       } else {
2097         t=get_reg(i_regs->regmap,rt1[i]);
2098         //assert(t>=0);
2099         if(t>=0) {
2100           s1l=get_reg(i_regs->regmap,rs1[i]);
2101           s2l=get_reg(i_regs->regmap,rs2[i]);
2102           if(rs2[i]==0) // rx<r0
2103           {
2104             assert(s1l>=0);
2105             if(opcode2[i]==0x2a) // SLT
2106               emit_shrimm(s1l,31,t);
2107             else // SLTU (unsigned can not be less than zero)
2108               emit_zeroreg(t);
2109           }
2110           else if(rs1[i]==0) // r0<rx
2111           {
2112             assert(s2l>=0);
2113             if(opcode2[i]==0x2a) // SLT
2114               emit_set_gz32(s2l,t);
2115             else // SLTU (set if not zero)
2116               emit_set_nz32(s2l,t);
2117           }
2118           else{
2119             assert(s1l>=0);assert(s2l>=0);
2120             if(opcode2[i]==0x2a) // SLT
2121               emit_set_if_less32(s1l,s2l,t);
2122             else // SLTU
2123               emit_set_if_carry32(s1l,s2l,t);
2124           }
2125         }
2126       }
2127     }
2128   }
2129   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2130     if(rt1[i]) {
2131       signed char s1l,s1h,s2l,s2h,th,tl;
2132       tl=get_reg(i_regs->regmap,rt1[i]);
2133       th=get_reg(i_regs->regmap,rt1[i]|64);
2134       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2135       {
2136         assert(tl>=0);
2137         if(tl>=0) {
2138           s1l=get_reg(i_regs->regmap,rs1[i]);
2139           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2140           s2l=get_reg(i_regs->regmap,rs2[i]);
2141           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2142           if(rs1[i]&&rs2[i]) {
2143             assert(s1l>=0);assert(s1h>=0);
2144             assert(s2l>=0);assert(s2h>=0);
2145             if(opcode2[i]==0x24) { // AND
2146               emit_and(s1l,s2l,tl);
2147               emit_and(s1h,s2h,th);
2148             } else
2149             if(opcode2[i]==0x25) { // OR
2150               emit_or(s1l,s2l,tl);
2151               emit_or(s1h,s2h,th);
2152             } else
2153             if(opcode2[i]==0x26) { // XOR
2154               emit_xor(s1l,s2l,tl);
2155               emit_xor(s1h,s2h,th);
2156             } else
2157             if(opcode2[i]==0x27) { // NOR
2158               emit_or(s1l,s2l,tl);
2159               emit_or(s1h,s2h,th);
2160               emit_not(tl,tl);
2161               emit_not(th,th);
2162             }
2163           }
2164           else
2165           {
2166             if(opcode2[i]==0x24) { // AND
2167               emit_zeroreg(tl);
2168               emit_zeroreg(th);
2169             } else
2170             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2171               if(rs1[i]){
2172                 if(s1l>=0) emit_mov(s1l,tl);
2173                 else emit_loadreg(rs1[i],tl);
2174                 if(s1h>=0) emit_mov(s1h,th);
2175                 else emit_loadreg(rs1[i]|64,th);
2176               }
2177               else
2178               if(rs2[i]){
2179                 if(s2l>=0) emit_mov(s2l,tl);
2180                 else emit_loadreg(rs2[i],tl);
2181                 if(s2h>=0) emit_mov(s2h,th);
2182                 else emit_loadreg(rs2[i]|64,th);
2183               }
2184               else{
2185                 emit_zeroreg(tl);
2186                 emit_zeroreg(th);
2187               }
2188             } else
2189             if(opcode2[i]==0x27) { // NOR
2190               if(rs1[i]){
2191                 if(s1l>=0) emit_not(s1l,tl);
2192                 else{
2193                   emit_loadreg(rs1[i],tl);
2194                   emit_not(tl,tl);
2195                 }
2196                 if(s1h>=0) emit_not(s1h,th);
2197                 else{
2198                   emit_loadreg(rs1[i]|64,th);
2199                   emit_not(th,th);
2200                 }
2201               }
2202               else
2203               if(rs2[i]){
2204                 if(s2l>=0) emit_not(s2l,tl);
2205                 else{
2206                   emit_loadreg(rs2[i],tl);
2207                   emit_not(tl,tl);
2208                 }
2209                 if(s2h>=0) emit_not(s2h,th);
2210                 else{
2211                   emit_loadreg(rs2[i]|64,th);
2212                   emit_not(th,th);
2213                 }
2214               }
2215               else {
2216                 emit_movimm(-1,tl);
2217                 emit_movimm(-1,th);
2218               }
2219             }
2220           }
2221         }
2222       }
2223       else
2224       {
2225         // 32 bit
2226         if(tl>=0) {
2227           s1l=get_reg(i_regs->regmap,rs1[i]);
2228           s2l=get_reg(i_regs->regmap,rs2[i]);
2229           if(rs1[i]&&rs2[i]) {
2230             assert(s1l>=0);
2231             assert(s2l>=0);
2232             if(opcode2[i]==0x24) { // AND
2233               emit_and(s1l,s2l,tl);
2234             } else
2235             if(opcode2[i]==0x25) { // OR
2236               emit_or(s1l,s2l,tl);
2237             } else
2238             if(opcode2[i]==0x26) { // XOR
2239               emit_xor(s1l,s2l,tl);
2240             } else
2241             if(opcode2[i]==0x27) { // NOR
2242               emit_or(s1l,s2l,tl);
2243               emit_not(tl,tl);
2244             }
2245           }
2246           else
2247           {
2248             if(opcode2[i]==0x24) { // AND
2249               emit_zeroreg(tl);
2250             } else
2251             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2252               if(rs1[i]){
2253                 if(s1l>=0) emit_mov(s1l,tl);
2254                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2255               }
2256               else
2257               if(rs2[i]){
2258                 if(s2l>=0) emit_mov(s2l,tl);
2259                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2260               }
2261               else emit_zeroreg(tl);
2262             } else
2263             if(opcode2[i]==0x27) { // NOR
2264               if(rs1[i]){
2265                 if(s1l>=0) emit_not(s1l,tl);
2266                 else {
2267                   emit_loadreg(rs1[i],tl);
2268                   emit_not(tl,tl);
2269                 }
2270               }
2271               else
2272               if(rs2[i]){
2273                 if(s2l>=0) emit_not(s2l,tl);
2274                 else {
2275                   emit_loadreg(rs2[i],tl);
2276                   emit_not(tl,tl);
2277                 }
2278               }
2279               else emit_movimm(-1,tl);
2280             }
2281           }
2282         }
2283       }
2284     }
2285   }
2286 }
2287
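// Emit code for the 16-bit-immediate ops (LUI, ADDI/ADDIU, DADDI/DADDIU,
// SLTI/SLTIU, ANDI/ORI/XORI).  When the source register is a known
// constant the result is folded at compile time via constmap instead of
// doing the arithmetic at run time.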
2288 void imm16_assemble(int i,struct regstat *i_regs)
2289 {
2290   if (opcode[i]==0x0f) { // LUI
2291     if(rt1[i]) {
2292       signed char t;
2293       t=get_reg(i_regs->regmap,rt1[i]);
2294       //assert(t>=0);
2295       if(t>=0) {
2296         if(!((i_regs->isconst>>t)&1))
2297           emit_movimm(imm[i]<<16,t);
2298       }
2299     }
2300   }
2301   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2302     if(rt1[i]) {
2303       signed char s,t;
2304       t=get_reg(i_regs->regmap,rt1[i]);
2305       s=get_reg(i_regs->regmap,rs1[i]);
2306       if(rs1[i]) {
2307         //assert(t>=0);
2308         //assert(s>=0);
2309         if(t>=0) {
2310           if(!((i_regs->isconst>>t)&1)) {
2311             if(s<0) {
2312               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2313               emit_addimm(t,imm[i],t);
2314             }else{
2315               if(!((i_regs->wasconst>>s)&1))
2316                 emit_addimm(s,imm[i],t);
2317               else
2318                 emit_movimm(constmap[i][s]+imm[i],t);
2319             }
2320           }
2321         }
2322       } else {
2323         if(t>=0) {
2324           if(!((i_regs->isconst>>t)&1))
2325             emit_movimm(imm[i],t);
2326         }
2327       }
2328     }
2329   }
2330   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2331     if(rt1[i]) {
2332       signed char sh,sl,th,tl;
2333       th=get_reg(i_regs->regmap,rt1[i]|64);
2334       tl=get_reg(i_regs->regmap,rt1[i]);
2335       sh=get_reg(i_regs->regmap,rs1[i]|64);
2336       sl=get_reg(i_regs->regmap,rs1[i]);
2337       if(tl>=0) {
2338         if(rs1[i]) {
2339           assert(sh>=0);
2340           assert(sl>=0);
2341           if(th>=0) {
2342             emit_addimm64_32(sh,sl,imm[i],th,tl);
2343           }
2344           else {
2345             emit_addimm(sl,imm[i],tl);
2346           }
2347         } else {
2348           emit_movimm(imm[i],tl);
2349           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2350         }
2351       }
2352     }
2353   }
2354   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2355     if(rt1[i]) {
2356       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2357       signed char sh,sl,t;
2358       t=get_reg(i_regs->regmap,rt1[i]);
2359       sh=get_reg(i_regs->regmap,rs1[i]|64);
2360       sl=get_reg(i_regs->regmap,rs1[i]);
2361       //assert(t>=0);
2362       if(t>=0) {
2363         if(rs1[i]>0) {
2364           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2365           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2366             if(opcode[i]==0x0a) { // SLTI
2367               if(sl<0) {
2368                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2369                 emit_slti32(t,imm[i],t);
2370               }else{
2371                 emit_slti32(sl,imm[i],t);
2372               }
2373             }
2374             else { // SLTIU
2375               if(sl<0) {
2376                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2377                 emit_sltiu32(t,imm[i],t);
2378               }else{
2379                 emit_sltiu32(sl,imm[i],t);
2380               }
2381             }
2382           }else{ // 64-bit
2383             assert(sl>=0);
2384             if(opcode[i]==0x0a) // SLTI
2385               emit_slti64_32(sh,sl,imm[i],t);
2386             else // SLTIU
2387               emit_sltiu64_32(sh,sl,imm[i],t);
2388           }
2389         }else{
2390           // SLTI(U) with r0 is just stupid,
2391           // nonetheless examples can be found
2392           if(opcode[i]==0x0a) // SLTI
2393             if(0<imm[i]) emit_movimm(1,t);
2394             else emit_zeroreg(t);
2395           else // SLTIU
2396           {
2397             if(imm[i]) emit_movimm(1,t);
2398             else emit_zeroreg(t);
2399           }
2400         }
2401       }
2402     }
2403   }
2404   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2405     if(rt1[i]) {
2406       signed char sh,sl,th,tl;
2407       th=get_reg(i_regs->regmap,rt1[i]|64);
2408       tl=get_reg(i_regs->regmap,rt1[i]);
2409       sh=get_reg(i_regs->regmap,rs1[i]|64);
2410       sl=get_reg(i_regs->regmap,rs1[i]);
2411       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2412         if(opcode[i]==0x0c) //ANDI
2413         {
2414           if(rs1[i]) {
2415             if(sl<0) {
2416               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2417               emit_andimm(tl,imm[i],tl);
2418             }else{
2419               if(!((i_regs->wasconst>>sl)&1))
2420                 emit_andimm(sl,imm[i],tl);
2421               else
2422                 emit_movimm(constmap[i][sl]&imm[i],tl);
2423             }
2424           }
2425           else
2426             emit_zeroreg(tl);
2427           if(th>=0) emit_zeroreg(th);
2428         }
2429         else
2430         {
2431           if(rs1[i]) {
2432             if(sl<0) {
2433               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2434             }
2435             if(th>=0) {
2436               if(sh<0) {
2437                 emit_loadreg(rs1[i]|64,th);
2438               }else{
2439                 emit_mov(sh,th);
2440               }
2441             }
2442             if(opcode[i]==0x0d) { // ORI
2443               if(sl<0) {
2444                 emit_orimm(tl,imm[i],tl);
2445               }else{
2446                 if(!((i_regs->wasconst>>sl)&1))
2447                   emit_orimm(sl,imm[i],tl);
2448                 else
2449                   emit_movimm(constmap[i][sl]|imm[i],tl);
2450               }
2451             }
2452             if(opcode[i]==0x0e) { // XORI
2453               if(sl<0) {
2454                 emit_xorimm(tl,imm[i],tl);
2455               }else{
2456                 if(!((i_regs->wasconst>>sl)&1))
2457                   emit_xorimm(sl,imm[i],tl);
2458                 else
2459                   emit_movimm(constmap[i][sl]^imm[i],tl);
2460               }
2461             }
2462           }
2463           else {
2464             emit_movimm(imm[i],tl);
2465             if(th>=0) emit_zeroreg(th);
2466           }
2467         }
2468       }
2469     }
2470   }
2471 }
2472
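// Shift-by-immediate: SLL/SRL/SRA plus the 64-bit DSLL/DSRL/DSRA and the
// ...32 forms.  The ...32 variants just move one word across and apply the
// remaining (imm&31) shift when imm>32.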
2473 void shiftimm_assemble(int i,struct regstat *i_regs)
2474 {
2475   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2476   {
2477     if(rt1[i]) {
2478       signed char s,t;
2479       t=get_reg(i_regs->regmap,rt1[i]);
2480       s=get_reg(i_regs->regmap,rs1[i]);
2481       //assert(t>=0);
2482       if(t>=0&&!((i_regs->isconst>>t)&1)){
2483         if(rs1[i]==0)
2484         {
2485           emit_zeroreg(t);
2486         }
2487         else
2488         {
2489           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2490           if(imm[i]) {
2491             if(opcode2[i]==0) // SLL
2492             {
2493               emit_shlimm(s<0?t:s,imm[i],t);
2494             }
2495             if(opcode2[i]==2) // SRL
2496             {
2497               emit_shrimm(s<0?t:s,imm[i],t);
2498             }
2499             if(opcode2[i]==3) // SRA
2500             {
2501               emit_sarimm(s<0?t:s,imm[i],t);
2502             }
2503           }else{
2504             // Shift by zero
2505             if(s>=0 && s!=t) emit_mov(s,t);
2506           }
2507         }
2508       }
2509       //emit_storereg(rt1[i],t); //DEBUG
2510     }
2511   }
2512   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2513   {
2514     if(rt1[i]) {
2515       signed char sh,sl,th,tl;
2516       th=get_reg(i_regs->regmap,rt1[i]|64);
2517       tl=get_reg(i_regs->regmap,rt1[i]);
2518       sh=get_reg(i_regs->regmap,rs1[i]|64);
2519       sl=get_reg(i_regs->regmap,rs1[i]);
2520       if(tl>=0) {
2521         if(rs1[i]==0)
2522         {
2523           emit_zeroreg(tl);
2524           if(th>=0) emit_zeroreg(th);
2525         }
2526         else
2527         {
2528           assert(sl>=0);
2529           assert(sh>=0);
2530           if(imm[i]) {
2531             if(opcode2[i]==0x38) // DSLL
2532             {
2533               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2534               emit_shlimm(sl,imm[i],tl);
2535             }
2536             if(opcode2[i]==0x3a) // DSRL
2537             {
2538               emit_shrdimm(sl,sh,imm[i],tl);
2539               if(th>=0) emit_shrimm(sh,imm[i],th);
2540             }
2541             if(opcode2[i]==0x3b) // DSRA
2542             {
2543               emit_shrdimm(sl,sh,imm[i],tl);
2544               if(th>=0) emit_sarimm(sh,imm[i],th);
2545             }
2546           }else{
2547             // Shift by zero
2548             if(sl!=tl) emit_mov(sl,tl);
2549             if(th>=0&&sh!=th) emit_mov(sh,th);
2550           }
2551         }
2552       }
2553     }
2554   }
2555   if(opcode2[i]==0x3c) // DSLL32
2556   {
2557     if(rt1[i]) {
2558       signed char sl,tl,th;
2559       tl=get_reg(i_regs->regmap,rt1[i]);
2560       th=get_reg(i_regs->regmap,rt1[i]|64);
2561       sl=get_reg(i_regs->regmap,rs1[i]);
2562       if(th>=0||tl>=0){
2563         assert(tl>=0);
2564         assert(th>=0);
2565         assert(sl>=0);
2566         emit_mov(sl,th);
2567         emit_zeroreg(tl);
2568         if(imm[i]>32)
2569         {
2570           emit_shlimm(th,imm[i]&31,th);
2571         }
2572       }
2573     }
2574   }
2575   if(opcode2[i]==0x3e) // DSRL32
2576   {
2577     if(rt1[i]) {
2578       signed char sh,tl,th;
2579       tl=get_reg(i_regs->regmap,rt1[i]);
2580       th=get_reg(i_regs->regmap,rt1[i]|64);
2581       sh=get_reg(i_regs->regmap,rs1[i]|64);
2582       if(tl>=0){
2583         assert(sh>=0);
2584         emit_mov(sh,tl);
2585         if(th>=0) emit_zeroreg(th);
2586         if(imm[i]>32)
2587         {
2588           emit_shrimm(tl,imm[i]&31,tl);
2589         }
2590       }
2591     }
2592   }
2593   if(opcode2[i]==0x3f) // DSRA32
2594   {
2595     if(rt1[i]) {
2596       signed char sh,tl;
2597       tl=get_reg(i_regs->regmap,rt1[i]);
2598       sh=get_reg(i_regs->regmap,rs1[i]|64);
2599       if(tl>=0){
2600         assert(sh>=0);
2601         emit_mov(sh,tl);
2602         if(imm[i]>32)
2603         {
2604           emit_sarimm(tl,imm[i]&31,tl);
2605         }
2606       }
2607     }
2608   }
2609 }
2610
2611 #ifndef shift_assemble
2612 void shift_assemble(int i,struct regstat *i_regs)
2613 {
2614   printf("Need shift_assemble for this architecture.\n");
2615   exit(1);
2616 }
2617 #endif
2618
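// Emit code for loads.  The common case is an inline RAM access guarded by
// emit_fastpath_cmp_jump(); addresses outside RAM (I/O etc.) go through a
// LOADx_STUB, or through inline_readstub when the address is a
// compile-time constant known not to hit RAM.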
2619 void load_assemble(int i,struct regstat *i_regs)
2620 {
2621   int s,th,tl,addr,map=-1;
2622   int offset;
2623   void *jaddr=0;
2624   int memtarget=0,c=0;
2625   int fastload_reg_override=0;
2626   u_int hr,reglist=0;
2627   th=get_reg(i_regs->regmap,rt1[i]|64);
2628   tl=get_reg(i_regs->regmap,rt1[i]);
2629   s=get_reg(i_regs->regmap,rs1[i]);
2630   offset=imm[i];
2631   for(hr=0;hr<HOST_REGS;hr++) {
2632     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2633   }
2634   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2635   if(s>=0) {
2636     c=(i_regs->wasconst>>s)&1;
2637     if (c) {
2638       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2639     }
2640   }
2641   //printf("load_assemble: c=%d\n",c);
2642   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2643   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2644   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2645     ||rt1[i]==0) {
2646       // could be FIFO, must perform the read
2647       // ||dummy read
2648       assem_debug("(forced read)\n");
2649       tl=get_reg(i_regs->regmap,-1);
2650       assert(tl>=0);
2651   }
2652   if(offset||s<0||c) addr=tl;
2653   else addr=s;
2654   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2655  if(tl>=0) {
2656   //printf("load_assemble: c=%d\n",c);
2657   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2658   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2659   reglist&=~(1<<tl);
2660   if(th>=0) reglist&=~(1<<th);
2661   if(!c) {
2662     #ifdef RAM_OFFSET
2663     map=get_reg(i_regs->regmap,ROREG);
2664     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2665     #endif
2666     #ifdef R29_HACK
2667     // Strmnnrmn's speed hack
2668     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2669     #endif
2670     {
2671       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2672     }
2673   }
2674   else if(ram_offset&&memtarget) {
2675     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2676     fastload_reg_override=HOST_TEMPREG;
2677   }
2678   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2679   if (opcode[i]==0x20) { // LB
2680     if(!c||memtarget) {
2681       if(!dummy) {
2682         #ifdef HOST_IMM_ADDR32
2683         if(c)
2684           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2685         else
2686         #endif
2687         {
2688           //emit_xorimm(addr,3,tl);
2689           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2690           int x=0,a=tl;
2691 #ifdef BIG_ENDIAN_MIPS
2692           if(!c) emit_xorimm(addr,3,tl);
2693           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2694 #else
2695           if(!c) a=addr;
2696 #endif
2697           if(fastload_reg_override) a=fastload_reg_override;
2698
2699           emit_movsbl_indexed_tlb(x,a,map,tl);
2700         }
2701       }
2702       if(jaddr)
2703         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2704     }
2705     else
2706       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2707   }
2708   if (opcode[i]==0x21) { // LH
2709     if(!c||memtarget) {
2710       if(!dummy) {
2711         #ifdef HOST_IMM_ADDR32
2712         if(c)
2713           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2714         else
2715         #endif
2716         {
2717           int x=0,a=tl;
2718 #ifdef BIG_ENDIAN_MIPS
2719           if(!c) emit_xorimm(addr,2,tl);
2720           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2721 #else
2722           if(!c) a=addr;
2723 #endif
2724           if(fastload_reg_override) a=fastload_reg_override;
2725           //#ifdef
2726           //emit_movswl_indexed_tlb(x,tl,map,tl);
2727           //else
2728           if(map>=0) {
2729             emit_movswl_indexed(x,a,tl);
2730           }else{
2731             #if 1 //def RAM_OFFSET
2732             emit_movswl_indexed(x,a,tl);
2733             #else
2734             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2735             #endif
2736           }
2737         }
2738       }
2739       if(jaddr)
2740         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2741     }
2742     else
2743       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2744   }
2745   if (opcode[i]==0x23) { // LW
2746     if(!c||memtarget) {
2747       if(!dummy) {
2748         int a=addr;
2749         if(fastload_reg_override) a=fastload_reg_override;
2750         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2751         #ifdef HOST_IMM_ADDR32
2752         if(c)
2753           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2754         else
2755         #endif
2756         emit_readword_indexed_tlb(0,a,map,tl);
2757       }
2758       if(jaddr)
2759         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2760     }
2761     else
2762       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2763   }
2764   if (opcode[i]==0x24) { // LBU
2765     if(!c||memtarget) {
2766       if(!dummy) {
2767         #ifdef HOST_IMM_ADDR32
2768         if(c)
2769           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2770         else
2771         #endif
2772         {
2773           //emit_xorimm(addr,3,tl);
2774           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2775           int x=0,a=tl;
2776 #ifdef BIG_ENDIAN_MIPS
2777           if(!c) emit_xorimm(addr,3,tl);
2778           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2779 #else
2780           if(!c) a=addr;
2781 #endif
2782           if(fastload_reg_override) a=fastload_reg_override;
2783
2784           emit_movzbl_indexed_tlb(x,a,map,tl);
2785         }
2786       }
2787       if(jaddr)
2788         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2789     }
2790     else
2791       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2792   }
2793   if (opcode[i]==0x25) { // LHU
2794     if(!c||memtarget) {
2795       if(!dummy) {
2796         #ifdef HOST_IMM_ADDR32
2797         if(c)
2798           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2799         else
2800         #endif
2801         {
2802           int x=0,a=tl;
2803 #ifdef BIG_ENDIAN_MIPS
2804           if(!c) emit_xorimm(addr,2,tl);
2805           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2806 #else
2807           if(!c) a=addr;
2808 #endif
2809           if(fastload_reg_override) a=fastload_reg_override;
2810           //#ifdef
2811           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2812           //#else
2813           if(map>=0) {
2814             emit_movzwl_indexed(x,a,tl);
2815           }else{
2816             #if 1 //def RAM_OFFSET
2817             emit_movzwl_indexed(x,a,tl);
2818             #else
2819             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2820             #endif
2821           }
2822         }
2823       }
2824       if(jaddr)
2825         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2826     }
2827     else
2828       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2829   }
2830   if (opcode[i]==0x27) { // LWU
2831     assert(th>=0);
2832     if(!c||memtarget) {
2833       if(!dummy) {
2834         int a=addr;
2835         if(fastload_reg_override) a=fastload_reg_override;
2836         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2837         #ifdef HOST_IMM_ADDR32
2838         if(c)
2839           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2840         else
2841         #endif
2842         emit_readword_indexed_tlb(0,a,map,tl);
2843       }
2844       if(jaddr)
2845         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2846     }
2847     else {
2848       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2849     }
2850     emit_zeroreg(th);
2851   }
2852   if (opcode[i]==0x37) { // LD
2853     if(!c||memtarget) {
2854       if(!dummy) {
2855         int a=addr;
2856         if(fastload_reg_override) a=fastload_reg_override;
2857         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2858         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2859         #ifdef HOST_IMM_ADDR32
2860         if(c)
2861           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2862         else
2863         #endif
2864         emit_readdword_indexed_tlb(0,a,map,th,tl);
2865       }
2866       if(jaddr)
2867         add_stub_r(LOADD_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2868     }
2869     else
2870       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2871   }
2872  }
2873   //emit_storereg(rt1[i],tl); // DEBUG
2874   //if(opcode[i]==0x23)
2875   //if(opcode[i]==0x24)
2876   //if(opcode[i]==0x23||opcode[i]==0x24)
2877   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2878   {
2879     //emit_pusha();
2880     save_regs(0x100f);
2881         emit_readword((int)&last_count,ECX);
2882         #ifdef __i386__
2883         if(get_reg(i_regs->regmap,CCREG)<0)
2884           emit_loadreg(CCREG,HOST_CCREG);
2885         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2886         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2887         emit_writeword(HOST_CCREG,(int)&Count);
2888         #endif
2889         #ifdef __arm__
2890         if(get_reg(i_regs->regmap,CCREG)<0)
2891           emit_loadreg(CCREG,0);
2892         else
2893           emit_mov(HOST_CCREG,0);
2894         emit_add(0,ECX,0);
2895         emit_addimm(0,2*ccadj[i],0);
2896         emit_writeword(0,(int)&Count);
2897         #endif
2898     emit_call((int)memdebug);
2899     //emit_popa();
2900     restore_regs(0x100f);
2901   }*/
2902 }
2903
2904 #ifndef loadlr_assemble
2905 void loadlr_assemble(int i,struct regstat *i_regs)
2906 {
2907   printf("Need loadlr_assemble for this architecture.\n");
2908   exit(1);
2909 }
2910 #endif
2911
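// Emit code for stores.  Besides the fast/slow path split this also emits
// the invalid_code check, so a store that lands on already-compiled code
// invalidates the affected block (self-modifying code), and it detects
// writes into the block currently being compiled.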
2912 void store_assemble(int i,struct regstat *i_regs)
2913 {
2914   int s,th,tl,map=-1;
2915   int addr,temp;
2916   int offset;
2917   void *jaddr=0;
2918   enum stub_type type;
2919   int memtarget=0,c=0;
2920   int agr=AGEN1+(i&1);
2921   int faststore_reg_override=0;
2922   u_int hr,reglist=0;
2923   th=get_reg(i_regs->regmap,rs2[i]|64);
2924   tl=get_reg(i_regs->regmap,rs2[i]);
2925   s=get_reg(i_regs->regmap,rs1[i]);
2926   temp=get_reg(i_regs->regmap,agr);
2927   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2928   offset=imm[i];
2929   if(s>=0) {
2930     c=(i_regs->wasconst>>s)&1;
2931     if(c) {
2932       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2933     }
2934   }
2935   assert(tl>=0);
2936   assert(temp>=0);
2937   for(hr=0;hr<HOST_REGS;hr++) {
2938     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2939   }
2940   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2941   if(offset||s<0||c) addr=temp;
2942   else addr=s;
2943   if(!c) {
2944     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2945   }
2946   else if(ram_offset&&memtarget) {
2947     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2948     faststore_reg_override=HOST_TEMPREG;
2949   }
2950
2951   if (opcode[i]==0x28) { // SB
2952     if(!c||memtarget) {
2953       int x=0,a=temp;
2954 #ifdef BIG_ENDIAN_MIPS
2955       if(!c) emit_xorimm(addr,3,temp);
2956       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2957 #else
2958       if(!c) a=addr;
2959 #endif
2960       if(faststore_reg_override) a=faststore_reg_override;
2961       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2962       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2963     }
2964     type=STOREB_STUB;
2965   }
2966   if (opcode[i]==0x29) { // SH
2967     if(!c||memtarget) {
2968       int x=0,a=temp;
2969 #ifdef BIG_ENDIAN_MIPS
2970       if(!c) emit_xorimm(addr,2,temp);
2971       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2972 #else
2973       if(!c) a=addr;
2974 #endif
2975       if(faststore_reg_override) a=faststore_reg_override;
2976       //#ifdef
2977       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2978       //#else
2979       if(map>=0) {
2980         emit_writehword_indexed(tl,x,a);
2981       }else
2982         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2983         emit_writehword_indexed(tl,x,a);
2984     }
2985     type=STOREH_STUB;
2986   }
2987   if (opcode[i]==0x2B) { // SW
2988     if(!c||memtarget) {
2989       int a=addr;
2990       if(faststore_reg_override) a=faststore_reg_override;
2991       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2992       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2993     }
2994     type=STOREW_STUB;
2995   }
2996   if (opcode[i]==0x3F) { // SD
2997     if(!c||memtarget) {
2998       int a=addr;
2999       if(faststore_reg_override) a=faststore_reg_override;
3000       if(rs2[i]) {
3001         assert(th>=0);
3002         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3003         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3004         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3005       }else{
3006         // Store zero
3007         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3008         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3009         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3010       }
3011     }
3012     type=STORED_STUB;
3013   }
3014   if(jaddr) {
3015     // PCSX store handlers don't check invcode again
3016     reglist|=1<<addr;
3017     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
3018     jaddr=0;
3019   }
3020   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3021     if(!c||memtarget) {
3022       #ifdef DESTRUCTIVE_SHIFT
3023       // The x86 shift operation is 'destructive'; it overwrites the
3024       // source register, so we need to make a copy first and use that.
3025       addr=temp;
3026       #endif
3027       #if defined(HOST_IMM8)
3028       int ir=get_reg(i_regs->regmap,INVCP);
3029       assert(ir>=0);
3030       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3031       #else
3032       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3033       #endif
3034       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3035       emit_callne(invalidate_addr_reg[addr]);
3036       #else
3037       void *jaddr2 = out;
3038       emit_jne(0);
3039       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3040       #endif
3041     }
3042   }
3043   u_int addr_val=constmap[i][s]+offset;
3044   if(jaddr) {
3045     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
3046   } else if(c&&!memtarget) {
3047     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3048   }
3049   // basic current block modification detection..
3050   // not looking back as that should be in mips cache already
3051   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3052     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3053     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3054     if(i_regs->regmap==regs[i].regmap) {
3055       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3056       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3057       emit_movimm(start+i*4+4,0);
3058       emit_writeword(0,(int)&pcaddr);
3059       emit_jmp(do_interrupt);
3060     }
3061   }
3062   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3063   //if(opcode[i]==0x2B || opcode[i]==0x28)
3064   //if(opcode[i]==0x2B || opcode[i]==0x29)
3065   //if(opcode[i]==0x2B)
3066   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3067   {
3068     #ifdef __i386__
3069     emit_pusha();
3070     #endif
3071     #ifdef __arm__
3072     save_regs(0x100f);
3073     #endif
3074         emit_readword((int)&last_count,ECX);
3075         #ifdef __i386__
3076         if(get_reg(i_regs->regmap,CCREG)<0)
3077           emit_loadreg(CCREG,HOST_CCREG);
3078         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3079         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3080         emit_writeword(HOST_CCREG,(int)&Count);
3081         #endif
3082         #ifdef __arm__
3083         if(get_reg(i_regs->regmap,CCREG)<0)
3084           emit_loadreg(CCREG,0);
3085         else
3086           emit_mov(HOST_CCREG,0);
3087         emit_add(0,ECX,0);
3088         emit_addimm(0,2*ccadj[i],0);
3089         emit_writeword(0,(int)&Count);
3090         #endif
3091     emit_call((int)memdebug);
3092     #ifdef __i386__
3093     emit_popa();
3094     #endif
3095     #ifdef __arm__
3096     restore_regs(0x100f);
3097     #endif
3098   }*/
3099 }
3100
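// Unaligned stores (SWL/SWR/SDL/SDR).  The two low address bits select one
// of four inline cases writing the appropriate bytes of the source
// register; non-RAM addresses fall through to STORELR_STUB, and the same
// invalid_code check as in store_assemble is emitted afterwards.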
3101 void storelr_assemble(int i,struct regstat *i_regs)
3102 {
3103   int s,th,tl;
3104   int temp;
3105   int temp2=-1;
3106   int offset;
3107   void *jaddr=0;
3108   void *case1, *case2, *case3;
3109   void *done0, *done1, *done2;
3110   int memtarget=0,c=0;
3111   int agr=AGEN1+(i&1);
3112   u_int hr,reglist=0;
3113   th=get_reg(i_regs->regmap,rs2[i]|64);
3114   tl=get_reg(i_regs->regmap,rs2[i]);
3115   s=get_reg(i_regs->regmap,rs1[i]);
3116   temp=get_reg(i_regs->regmap,agr);
3117   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3118   offset=imm[i];
3119   if(s>=0) {
3120     c=(i_regs->isconst>>s)&1;
3121     if(c) {
3122       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3123     }
3124   }
3125   assert(tl>=0);
3126   for(hr=0;hr<HOST_REGS;hr++) {
3127     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3128   }
3129   assert(temp>=0);
3130   if(!c) {
3131     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3132     if(!offset&&s!=temp) emit_mov(s,temp);
3133     jaddr=out;
3134     emit_jno(0);
3135   }
3136   else
3137   {
3138     if(!memtarget||!rs1[i]) {
3139       jaddr=out;
3140       emit_jmp(0);
3141     }
3142   }
3143   #ifdef RAM_OFFSET
3144   int map=get_reg(i_regs->regmap,ROREG);
3145   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3146   #else
3147   if((u_int)rdram!=0x80000000)
3148     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3149   #endif
3150
3151   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3152     temp2=get_reg(i_regs->regmap,FTEMP);
3153     if(!rs2[i]) temp2=th=tl;
3154   }
3155
3156 #ifndef BIG_ENDIAN_MIPS
3157     emit_xorimm(temp,3,temp);
3158 #endif
3159   emit_testimm(temp,2);
3160   case2=out;
3161   emit_jne(0);
3162   emit_testimm(temp,1);
3163   case1=out;
3164   emit_jne(0);
3165   // 0
3166   if (opcode[i]==0x2A) { // SWL
3167     emit_writeword_indexed(tl,0,temp);
3168   }
3169   if (opcode[i]==0x2E) { // SWR
3170     emit_writebyte_indexed(tl,3,temp);
3171   }
3172   if (opcode[i]==0x2C) { // SDL
3173     emit_writeword_indexed(th,0,temp);
3174     if(rs2[i]) emit_mov(tl,temp2);
3175   }
3176   if (opcode[i]==0x2D) { // SDR
3177     emit_writebyte_indexed(tl,3,temp);
3178     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3179   }
3180   done0=out;
3181   emit_jmp(0);
3182   // 1
3183   set_jump_target(case1, out);
3184   if (opcode[i]==0x2A) { // SWL
3185     // Write 3 msb into three least significant bytes
3186     if(rs2[i]) emit_rorimm(tl,8,tl);
3187     emit_writehword_indexed(tl,-1,temp);
3188     if(rs2[i]) emit_rorimm(tl,16,tl);
3189     emit_writebyte_indexed(tl,1,temp);
3190     if(rs2[i]) emit_rorimm(tl,8,tl);
3191   }
3192   if (opcode[i]==0x2E) { // SWR
3193     // Write two lsb into two most significant bytes
3194     emit_writehword_indexed(tl,1,temp);
3195   }
3196   if (opcode[i]==0x2C) { // SDL
3197     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3198     // Write 3 msb into three least significant bytes
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200     emit_writehword_indexed(th,-1,temp);
3201     if(rs2[i]) emit_rorimm(th,16,th);
3202     emit_writebyte_indexed(th,1,temp);
3203     if(rs2[i]) emit_rorimm(th,8,th);
3204   }
3205   if (opcode[i]==0x2D) { // SDR
3206     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3207     // Write two lsb into two most significant bytes
3208     emit_writehword_indexed(tl,1,temp);
3209   }
3210   done1=out;
3211   emit_jmp(0);
3212   // 2
3213   set_jump_target(case2, out);
3214   emit_testimm(temp,1);
3215   case3=out;
3216   emit_jne(0);
3217   if (opcode[i]==0x2A) { // SWL
3218     // Write two msb into two least significant bytes
3219     if(rs2[i]) emit_rorimm(tl,16,tl);
3220     emit_writehword_indexed(tl,-2,temp);
3221     if(rs2[i]) emit_rorimm(tl,16,tl);
3222   }
3223   if (opcode[i]==0x2E) { // SWR
3224     // Write 3 lsb into three most significant bytes
3225     emit_writebyte_indexed(tl,-1,temp);
3226     if(rs2[i]) emit_rorimm(tl,8,tl);
3227     emit_writehword_indexed(tl,0,temp);
3228     if(rs2[i]) emit_rorimm(tl,24,tl);
3229   }
3230   if (opcode[i]==0x2C) { // SDL
3231     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3232     // Write two msb into two least significant bytes
3233     if(rs2[i]) emit_rorimm(th,16,th);
3234     emit_writehword_indexed(th,-2,temp);
3235     if(rs2[i]) emit_rorimm(th,16,th);
3236   }
3237   if (opcode[i]==0x2D) { // SDR
3238     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3239     // Write 3 lsb into three most significant bytes
3240     emit_writebyte_indexed(tl,-1,temp);
3241     if(rs2[i]) emit_rorimm(tl,8,tl);
3242     emit_writehword_indexed(tl,0,temp);
3243     if(rs2[i]) emit_rorimm(tl,24,tl);
3244   }
3245   done2=out;
3246   emit_jmp(0);
3247   // 3
3248   set_jump_target(case3, out);
3249   if (opcode[i]==0x2A) { // SWL
3250     // Write msb into least significant byte
3251     if(rs2[i]) emit_rorimm(tl,24,tl);
3252     emit_writebyte_indexed(tl,-3,temp);
3253     if(rs2[i]) emit_rorimm(tl,8,tl);
3254   }
3255   if (opcode[i]==0x2E) { // SWR
3256     // Write entire word
3257     emit_writeword_indexed(tl,-3,temp);
3258   }
3259   if (opcode[i]==0x2C) { // SDL
3260     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3261     // Write msb into least significant byte
3262     if(rs2[i]) emit_rorimm(th,24,th);
3263     emit_writebyte_indexed(th,-3,temp);
3264     if(rs2[i]) emit_rorimm(th,8,th);
3265   }
3266   if (opcode[i]==0x2D) { // SDR
3267     if(rs2[i]) emit_mov(th,temp2);
3268     // Write entire word
3269     emit_writeword_indexed(tl,-3,temp);
3270   }
3271   set_jump_target(done0, out);
3272   set_jump_target(done1, out);
3273   set_jump_target(done2, out);
3274   if (opcode[i]==0x2C) { // SDL
3275     emit_testimm(temp,4);
3276     done0=out;
3277     emit_jne(0);
3278     emit_andimm(temp,~3,temp);
3279     emit_writeword_indexed(temp2,4,temp);
3280     set_jump_target(done0, out);
3281   }
3282   if (opcode[i]==0x2D) { // SDR
3283     emit_testimm(temp,4);
3284     done0=out;
3285     emit_jeq(0);
3286     emit_andimm(temp,~3,temp);
3287     emit_writeword_indexed(temp2,-4,temp);
3288     set_jump_target(done0, out);
3289   }
3290   if(!c||!memtarget)
3291     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
3292   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3293     #ifdef RAM_OFFSET
3294     int map=get_reg(i_regs->regmap,ROREG);
3295     if(map<0) map=HOST_TEMPREG;
3296     gen_orig_addr_w(temp,map);
3297     #else
3298     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3299     #endif
3300     #if defined(HOST_IMM8)
3301     int ir=get_reg(i_regs->regmap,INVCP);
3302     assert(ir>=0);
3303     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3304     #else
3305     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3306     #endif
3307     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3308     emit_callne(invalidate_addr_reg[temp]);
3309     #else
3310     void *jaddr2 = out;
3311     emit_jne(0);
3312     add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3313     #endif
3314   }
3315   /*
3316     emit_pusha();
3317     //save_regs(0x100f);
3318         emit_readword((int)&last_count,ECX);
3319         if(get_reg(i_regs->regmap,CCREG)<0)
3320           emit_loadreg(CCREG,HOST_CCREG);
3321         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3322         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3323         emit_writeword(HOST_CCREG,(int)&Count);
3324     emit_call((int)memdebug);
3325     emit_popa();
3326     //restore_regs(0x100f);
3327   */
3328 }
3329
3330 void c1ls_assemble(int i,struct regstat *i_regs)
3331 {
3332   cop1_unusable(i, i_regs);
3333 }
3334
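// LWC2/SWC2: transfer a word between memory and a GTE (COP2) data
// register.  The value passes through FTEMP and
// cop2_get_dreg()/cop2_put_dreg(); the memory access follows the same
// fast-path/stub pattern as ordinary loads and stores.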
3335 void c2ls_assemble(int i,struct regstat *i_regs)
3336 {
3337   int s,tl;
3338   int ar;
3339   int offset;
3340   int memtarget=0,c=0;
3341   void *jaddr2=NULL;
3342   enum stub_type type;
3343   int agr=AGEN1+(i&1);
3344   int fastio_reg_override=0;
3345   u_int hr,reglist=0;
3346   u_int copr=(source[i]>>16)&0x1f;
3347   s=get_reg(i_regs->regmap,rs1[i]);
3348   tl=get_reg(i_regs->regmap,FTEMP);
3349   offset=imm[i];
3350   assert(rs1[i]>0);
3351   assert(tl>=0);
3352
3353   for(hr=0;hr<HOST_REGS;hr++) {
3354     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3355   }
3356   if(i_regs->regmap[HOST_CCREG]==CCREG)
3357     reglist&=~(1<<HOST_CCREG);
3358
3359   // get the address
3360   if (opcode[i]==0x3a) { // SWC2
3361     ar=get_reg(i_regs->regmap,agr);
3362     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3363     reglist|=1<<ar;
3364   } else { // LWC2
3365     ar=tl;
3366   }
3367   if(s>=0) c=(i_regs->wasconst>>s)&1;
3368   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3369   if (!offset&&!c&&s>=0) ar=s;
3370   assert(ar>=0);
3371
3372   if (opcode[i]==0x3a) { // SWC2
3373     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3374     type=STOREW_STUB;
3375   }
3376   else
3377     type=LOADW_STUB;
3378
3379   if(c&&!memtarget) {
3380     jaddr2=out;
3381     emit_jmp(0); // inline_readstub/inline_writestub?
3382   }
3383   else {
3384     if(!c) {
3385       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3386     }
3387     else if(ram_offset&&memtarget) {
3388       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3389       fastio_reg_override=HOST_TEMPREG;
3390     }
3391     if (opcode[i]==0x32) { // LWC2
3392       #ifdef HOST_IMM_ADDR32
3393       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3394       else
3395       #endif
3396       int a=ar;
3397       if(fastio_reg_override) a=fastio_reg_override;
3398       emit_readword_indexed(0,a,tl);
3399     }
3400     if (opcode[i]==0x3a) { // SWC2
3401       #ifdef DESTRUCTIVE_SHIFT
3402       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3403       #endif
3404       int a=ar;
3405       if(fastio_reg_override) a=fastio_reg_override;
3406       emit_writeword_indexed(tl,0,a);
3407     }
3408   }
3409   if(jaddr2)
3410     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
3411   if(opcode[i]==0x3a) // SWC2
3412   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3413 #if defined(HOST_IMM8)
3414     int ir=get_reg(i_regs->regmap,INVCP);
3415     assert(ir>=0);
3416     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3417 #else
3418     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3419 #endif
3420     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3421     emit_callne(invalidate_addr_reg[ar]);
3422     #else
3423     void *jaddr3 = out;
3424     emit_jne(0);
3425     add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3426     #endif
3427   }
3428   if (opcode[i]==0x32) { // LWC2
3429     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3430   }
3431 }
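/* Note on the c2ls fast path above (derived from the code, not a spec): a
 * constant address is treated as a direct RAM access only when it falls in
 * [0x80000000, 0x80000000+RAM_SIZE), which is what
 *   memtarget = (signed int)(constmap[i][s]+offset) < (signed int)0x80000000+RAM_SIZE;
 * selects.  Non-constant addresses get an emit_fastpath_cmp_jump() check, and
 * anything that misses RAM goes through the LOADW/STOREW stub added with
 * add_stub_r().  After an SWC2 the invalid_code check is emitted so that a
 * store into a page holding recompiled code invalidates that code (SMC). */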
3432
3433 #ifndef multdiv_assemble
3434 void multdiv_assemble(int i,struct regstat *i_regs)
3435 {
3436   printf("Need multdiv_assemble for this architecture.\n");
3437   exit(1);
3438 }
3439 #endif
3440
3441 void mov_assemble(int i,struct regstat *i_regs)
3442 {
3443   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3444   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3445   if(rt1[i]) {
3446     signed char sh,sl,th,tl;
3447     th=get_reg(i_regs->regmap,rt1[i]|64);
3448     tl=get_reg(i_regs->regmap,rt1[i]);
3449     //assert(tl>=0);
3450     if(tl>=0) {
3451       sh=get_reg(i_regs->regmap,rs1[i]|64);
3452       sl=get_reg(i_regs->regmap,rs1[i]);
3453       if(sl>=0) emit_mov(sl,tl);
3454       else emit_loadreg(rs1[i],tl);
3455       if(th>=0) {
3456         if(sh>=0) emit_mov(sh,th);
3457         else emit_loadreg(rs1[i]|64,th);
3458       }
3459     }
3460   }
3461 }
3462
3463 #ifndef fconv_assemble
3464 void fconv_assemble(int i,struct regstat *i_regs)
3465 {
3466   printf("Need fconv_assemble for this architecture.\n");
3467   exit(1);
3468 }
3469 #endif
3470
3471 #if 0
3472 void float_assemble(int i,struct regstat *i_regs)
3473 {
3474   printf("Need float_assemble for this architecture.\n");
3475   exit(1);
3476 }
3477 #endif
3478
3479 void syscall_assemble(int i,struct regstat *i_regs)
3480 {
3481   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3482   assert(ccreg==HOST_CCREG);
3483   assert(!is_delayslot);
3484   (void)ccreg;
3485   emit_movimm(start+i*4,EAX); // Get PC
3486   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3487   emit_jmp(jump_syscall_hle); // XXX
3488 }
3489
3490 void hlecall_assemble(int i,struct regstat *i_regs)
3491 {
3492   extern void psxNULL();
3493   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3494   assert(ccreg==HOST_CCREG);
3495   assert(!is_delayslot);
3496   (void)ccreg;
3497   emit_movimm(start+i*4+4,0); // Get PC
3498   uint32_t hleCode = source[i] & 0x03ffffff;
3499   if (hleCode >= ARRAY_SIZE(psxHLEt))
3500     emit_movimm((int)psxNULL,1);
3501   else
3502     emit_movimm((int)psxHLEt[hleCode],1);
3503   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3504   emit_jmp(jump_hlecall);
3505 }
3506
3507 void intcall_assemble(int i,struct regstat *i_regs)
3508 {
3509   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3510   assert(ccreg==HOST_CCREG);
3511   assert(!is_delayslot);
3512   (void)ccreg;
3513   emit_movimm(start+i*4,0); // Get PC
3514   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3515   emit_jmp(jump_intcall);
3516 }
3517
3518 void ds_assemble(int i,struct regstat *i_regs)
3519 {
3520   speculate_register_values(i);
3521   is_delayslot=1;
3522   switch(itype[i]) {
3523     case ALU:
3524       alu_assemble(i,i_regs);break;
3525     case IMM16:
3526       imm16_assemble(i,i_regs);break;
3527     case SHIFT:
3528       shift_assemble(i,i_regs);break;
3529     case SHIFTIMM:
3530       shiftimm_assemble(i,i_regs);break;
3531     case LOAD:
3532       load_assemble(i,i_regs);break;
3533     case LOADLR:
3534       loadlr_assemble(i,i_regs);break;
3535     case STORE:
3536       store_assemble(i,i_regs);break;
3537     case STORELR:
3538       storelr_assemble(i,i_regs);break;
3539     case COP0:
3540       cop0_assemble(i,i_regs);break;
3541     case COP1:
3542       cop1_assemble(i,i_regs);break;
3543     case C1LS:
3544       c1ls_assemble(i,i_regs);break;
3545     case COP2:
3546       cop2_assemble(i,i_regs);break;
3547     case C2LS:
3548       c2ls_assemble(i,i_regs);break;
3549     case C2OP:
3550       c2op_assemble(i,i_regs);break;
3551     case FCONV:
3552       fconv_assemble(i,i_regs);break;
3553     case FLOAT:
3554       float_assemble(i,i_regs);break;
3555     case FCOMP:
3556       fcomp_assemble(i,i_regs);break;
3557     case MULTDIV:
3558       multdiv_assemble(i,i_regs);break;
3559     case MOV:
3560       mov_assemble(i,i_regs);break;
3561     case SYSCALL:
3562     case HLECALL:
3563     case INTCALL:
3564     case SPAN:
3565     case UJUMP:
3566     case RJUMP:
3567     case CJUMP:
3568     case SJUMP:
3569     case FJUMP:
3570       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3571   }
3572   is_delayslot=0;
3573 }
3574
3575 // Is the branch target a valid internal jump?
3576 int internal_branch(uint64_t i_is32,int addr)
3577 {
3578   if(addr&1) return 0; // Indirect (register) jump
3579   if(addr>=start && addr<start+slen*4-4)
3580   {
3581     //int t=(addr-start)>>2;
3582     // Delay slots are not valid branch targets
3583     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3584     // 64 -> 32 bit transition requires a recompile
3585     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3586     {
3587       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3588       else printf("optimizable: yes\n");
3589     }*/
3590     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3591     return 1;
3592   }
3593   return 0;
3594 }
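/* Worked example for internal_branch() above (addresses made up): with
 * start=0x80030000 and slen=100, targets 0x80030000 through 0x80030188 count
 * as internal (the last instruction of the block is excluded), so
 * internal_branch(is32,0x80030040) returns 1 and the branch can be wired
 * directly to code in this block, while internal_branch(is32,0x80040000)
 * returns 0 and must go through the linker / hash table.  Odd addresses mark
 * register-indirect targets and are never internal. */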
3595
3596 #ifndef wb_invalidate
3597 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3598   uint64_t u,uint64_t uu)
3599 {
3600   int hr;
3601   for(hr=0;hr<HOST_REGS;hr++) {
3602     if(hr!=EXCLUDE_REG) {
3603       if(pre[hr]!=entry[hr]) {
3604         if(pre[hr]>=0) {
3605           if((dirty>>hr)&1) {
3606             if(get_reg(entry,pre[hr])<0) {
3607               if(pre[hr]<64) {
3608                 if(!((u>>pre[hr])&1)) {
3609                   emit_storereg(pre[hr],hr);
3610                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3611                     emit_sarimm(hr,31,hr);
3612                     emit_storereg(pre[hr]|64,hr);
3613                   }
3614                 }
3615               }else{
3616                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3617                   emit_storereg(pre[hr],hr);
3618                 }
3619               }
3620             }
3621           }
3622         }
3623       }
3624     }
3625   }
3626   // Move from one register to another (no writeback)
3627   for(hr=0;hr<HOST_REGS;hr++) {
3628     if(hr!=EXCLUDE_REG) {
3629       if(pre[hr]!=entry[hr]) {
3630         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3631           int nr;
3632           if((nr=get_reg(entry,pre[hr]))>=0) {
3633             emit_mov(hr,nr);
3634           }
3635         }
3636       }
3637     }
3638   }
3639 }
3640 #endif
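/* Example of what the generic wb_invalidate() above produces: if host reg 3
 * held MIPS $a0 (reg 4) dirty before the mapping change and no host register
 * holds $a0 in 'entry', the first pass emits emit_storereg(4,3) so the value
 * reaches its memory slot (plus a sign-extended upper store when the 32-bit
 * value's upper half is still needed); if $a0 had merely moved to host reg 5,
 * the second pass would emit emit_mov(3,5) and no memory write-back is needed. */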
3641
3642 // Load the specified registers
3643 // This only loads the registers given as arguments because
3644 // we don't want to load things that will be overwritten
3645 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3646 {
3647   int hr;
3648   // Load 32-bit regs
3649   for(hr=0;hr<HOST_REGS;hr++) {
3650     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3651       if(entry[hr]!=regmap[hr]) {
3652         if(regmap[hr]==rs1||regmap[hr]==rs2)
3653         {
3654           if(regmap[hr]==0) {
3655             emit_zeroreg(hr);
3656           }
3657           else
3658           {
3659             emit_loadreg(regmap[hr],hr);
3660           }
3661         }
3662       }
3663     }
3664   }
3665   // Load 64-bit regs
3666   for(hr=0;hr<HOST_REGS;hr++) {
3667     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3668       if(entry[hr]!=regmap[hr]) {
3669         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3670         {
3671           assert(regmap[hr]!=64);
3672           if((is32>>(regmap[hr]&63))&1) {
3673             int lr=get_reg(regmap,regmap[hr]-64);
3674             if(lr>=0)
3675               emit_sarimm(lr,31,hr);
3676             else
3677               emit_loadreg(regmap[hr],hr);
3678           }
3679           else
3680           {
3681             emit_loadreg(regmap[hr],hr);
3682           }
3683         }
3684       }
3685     }
3686   }
3687 }
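/* Example for load_regs() above: calling it with rs1[i],rs2[i] before an ALU
 * op loads only the source registers that are newly mapped: $zero becomes
 * emit_zeroreg(hr), anything else emit_loadreg(reg,hr).  The second loop
 * fills 64-bit upper halves, and when the register is known to be 32-bit it
 * uses emit_sarimm(lr,31,hr) to sign-extend from the low half instead of
 * touching memory. */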
3688
3689 // Load registers prior to the start of a loop
3690 // so that they are not loaded within the loop
3691 static void loop_preload(signed char pre[],signed char entry[])
3692 {
3693   int hr;
3694   for(hr=0;hr<HOST_REGS;hr++) {
3695     if(hr!=EXCLUDE_REG) {
3696       if(pre[hr]!=entry[hr]) {
3697         if(entry[hr]>=0) {
3698           if(get_reg(pre,entry[hr])<0) {
3699             assem_debug("loop preload:\n");
3700             //printf("loop preload: %d\n",hr);
3701             if(entry[hr]==0) {
3702               emit_zeroreg(hr);
3703             }
3704             else if(entry[hr]<TEMPREG)
3705             {
3706               emit_loadreg(entry[hr],hr);
3707             }
3708             else if(entry[hr]-64<TEMPREG)
3709             {
3710               emit_loadreg(entry[hr],hr);
3711             }
3712           }
3713         }
3714       }
3715     }
3716   }
3717 }
3718
3719 // Generate address for load/store instruction
3720 // the address goes to an AGEN register for writes, and to FTEMP for LOADLR and cop1/cop2 loads
3721 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3722 {
3723   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3724     int ra=-1;
3725     int agr=AGEN1+(i&1);
3726     if(itype[i]==LOAD) {
3727       ra=get_reg(i_regs->regmap,rt1[i]);
3728       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3729       assert(ra>=0);
3730     }
3731     if(itype[i]==LOADLR) {
3732       ra=get_reg(i_regs->regmap,FTEMP);
3733     }
3734     if(itype[i]==STORE||itype[i]==STORELR) {
3735       ra=get_reg(i_regs->regmap,agr);
3736       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3737     }
3738     if(itype[i]==C1LS||itype[i]==C2LS) {
3739       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3740         ra=get_reg(i_regs->regmap,FTEMP);
3741       else { // SWC1/SDC1/SWC2/SDC2
3742         ra=get_reg(i_regs->regmap,agr);
3743         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3744       }
3745     }
3746     int rs=get_reg(i_regs->regmap,rs1[i]);
3747     if(ra>=0) {
3748       int offset=imm[i];
3749       int c=(i_regs->wasconst>>rs)&1;
3750       if(rs1[i]==0) {
3751         // Using r0 as a base address
3752         if(!entry||entry[ra]!=agr) {
3753           if (opcode[i]==0x22||opcode[i]==0x26) {
3754             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3755           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3756             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3757           }else{
3758             emit_movimm(offset,ra);
3759           }
3760         } // else did it in the previous cycle
3761       }
3762       else if(rs<0) {
3763         if(!entry||entry[ra]!=rs1[i])
3764           emit_loadreg(rs1[i],ra);
3765         //if(!entry||entry[ra]!=rs1[i])
3766         //  printf("poor load scheduling!\n");
3767       }
3768       else if(c) {
3769         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3770           if(!entry||entry[ra]!=agr) {
3771             if (opcode[i]==0x22||opcode[i]==0x26) {
3772               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3773             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3774               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3775             }else{
3776               #ifdef HOST_IMM_ADDR32
3777               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3778               #endif
3779               emit_movimm(constmap[i][rs]+offset,ra);
3780               regs[i].loadedconst|=1<<ra;
3781             }
3782           } // else did it in the previous cycle
3783         } // else load_consts already did it
3784       }
3785       if(offset&&!c&&rs1[i]) {
3786         if(rs>=0) {
3787           emit_addimm(rs,offset,ra);
3788         }else{
3789           emit_addimm(ra,offset,ra);
3790         }
3791       }
3792     }
3793   }
3794   // Preload constants for next instruction
3795   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3796     int agr,ra;
3797     // Actual address
3798     agr=AGEN1+((i+1)&1);
3799     ra=get_reg(i_regs->regmap,agr);
3800     if(ra>=0) {
3801       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3802       int offset=imm[i+1];
3803       int c=(regs[i+1].wasconst>>rs)&1;
3804       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3805         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3806           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3807         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3808           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3809         }else{
3810           #ifdef HOST_IMM_ADDR32
3811           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3812           #endif
3813           emit_movimm(constmap[i+1][rs]+offset,ra);
3814           regs[i+1].loadedconst|=1<<ra;
3815         }
3816       }
3817       else if(rs1[i+1]==0) {
3818         // Using r0 as a base address
3819         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3820           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3821         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3822           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3823         }else{
3824           emit_movimm(offset,ra);
3825         }
3826       }
3827     }
3828   }
3829 }
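/* Worked example for address_generation() above (register values made up):
 * for "lw $t0,4($sp)" where $sp was resolved to the constant 0x801ffff0
 * (wasconst set), ra is the destination's host register and the whole address
 * calculation collapses to emit_movimm(0x801ffff4,ra), which is also recorded
 * in regs[i].loadedconst.  LWL/LWR mask off the low two address bits
 * (&0xFFFFFFFC) and LDL/LDR the low three, since those opcodes address the
 * containing word/doubleword. */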
3830
3831 static int get_final_value(int hr, int i, int *value)
3832 {
3833   int reg=regs[i].regmap[hr];
3834   while(i<slen-1) {
3835     if(regs[i+1].regmap[hr]!=reg) break;
3836     if(!((regs[i+1].isconst>>hr)&1)) break;
3837     if(bt[i+1]) break;
3838     i++;
3839   }
3840   if(i<slen-1) {
3841     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3842       *value=constmap[i][hr];
3843       return 1;
3844     }
3845     if(!bt[i+1]) {
3846       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3847         // Load in delay slot, out-of-order execution
3848         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3849         {
3850           // Precompute load address
3851           *value=constmap[i][hr]+imm[i+2];
3852           return 1;
3853         }
3854       }
3855       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3856       {
3857         // Precompute load address
3858         *value=constmap[i][hr]+imm[i+1];
3859         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3860         return 1;
3861       }
3862     }
3863   }
3864   *value=constmap[i][hr];
3865   //printf("c=%x\n",(int)constmap[i][hr]);
3866   if(i==slen-1) return 1;
3867   if(reg<64) {
3868     return !((unneeded_reg[i+1]>>reg)&1);
3869   }else{
3870     return !((unneeded_reg_upper[i+1]>>reg)&1);
3871   }
3872 }
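/* Example for get_final_value() above (constants made up): if host reg hr
 * holds 0x1f800000 at instruction i and keeps it (same mapping, still
 * constant, no branch target in between) up to a "lw rN,0x10(rN)" that reads
 * and writes the same register, the function hands back 0x1f800010 so the
 * final address is materialised in a single move.  It returns 0 when the
 * constant is unneeded past this point, letting load_consts() skip emitting
 * it altogether. */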
3873
3874 // Load registers with known constants
3875 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3876 {
3877   int hr,hr2;
3878   // propagate loaded constant flags
3879   if(i==0||bt[i])
3880     regs[i].loadedconst=0;
3881   else {
3882     for(hr=0;hr<HOST_REGS;hr++) {
3883       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3884          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3885       {
3886         regs[i].loadedconst|=1<<hr;
3887       }
3888     }
3889   }
3890   // Load 32-bit regs
3891   for(hr=0;hr<HOST_REGS;hr++) {
3892     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3893       //if(entry[hr]!=regmap[hr]) {
3894       if(!((regs[i].loadedconst>>hr)&1)) {
3895         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3896           int value,similar=0;
3897           if(get_final_value(hr,i,&value)) {
3898             // see if some other register has similar value
3899             for(hr2=0;hr2<HOST_REGS;hr2++) {
3900               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3901                 if(is_similar_value(value,constmap[i][hr2])) {
3902                   similar=1;
3903                   break;
3904                 }
3905               }
3906             }
3907             if(similar) {
3908               int value2;
3909               if(get_final_value(hr2,i,&value2)) // is this needed?
3910                 emit_movimm_from(value2,hr2,value,hr);
3911               else
3912                 emit_movimm(value,hr);
3913             }
3914             else if(value==0) {
3915               emit_zeroreg(hr);
3916             }
3917             else {
3918               emit_movimm(value,hr);
3919             }
3920           }
3921           regs[i].loadedconst|=1<<hr;
3922         }
3923       }
3924     }
3925   }
3926   // Load 64-bit regs
3927   for(hr=0;hr<HOST_REGS;hr++) {
3928     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3929       //if(entry[hr]!=regmap[hr]) {
3930       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3931         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3932           if((is32>>(regmap[hr]&63))&1) {
3933             int lr=get_reg(regmap,regmap[hr]-64);
3934             assert(lr>=0);
3935             emit_sarimm(lr,31,hr);
3936           }
3937           else
3938           {
3939             int value;
3940             if(get_final_value(hr,i,&value)) {
3941               if(value==0) {
3942                 emit_zeroreg(hr);
3943               }
3944               else {
3945                 emit_movimm(value,hr);
3946               }
3947             }
3948           }
3949         }
3950       }
3951     }
3952   }
3953 }
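/* Example for load_consts() above: if a register must become 0x80012340 and
 * another host register already holds the similar constant 0x80012300,
 * is_similar_value() lets the code use
 * emit_movimm_from(0x80012300,hr2,0x80012340,hr), which can be cheaper than
 * rebuilding the full 32-bit immediate (typically a single add of the
 * difference on ARM); an exact zero still becomes emit_zeroreg(hr).  The
 * loadedconst bitmap keeps the same constant from being re-emitted on every
 * instruction that holds it. */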
3954 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3955 {
3956   int hr;
3957   // Load 32-bit regs
3958   for(hr=0;hr<HOST_REGS;hr++) {
3959     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3960       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3961         int value=constmap[i][hr];
3962         if(value==0) {
3963           emit_zeroreg(hr);
3964         }
3965         else {
3966           emit_movimm(value,hr);
3967         }
3968       }
3969     }
3970   }
3971   // Load 64-bit regs
3972   for(hr=0;hr<HOST_REGS;hr++) {
3973     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3974       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3975         if((is32>>(regmap[hr]&63))&1) {
3976           int lr=get_reg(regmap,regmap[hr]-64);
3977           assert(lr>=0);
3978           emit_sarimm(lr,31,hr);
3979         }
3980         else
3981         {
3982           int value=constmap[i][hr];
3983           if(value==0) {
3984             emit_zeroreg(hr);
3985           }
3986           else {
3987             emit_movimm(value,hr);
3988           }
3989         }
3990       }
3991     }
3992   }
3993 }
3994
3995 // Write out all dirty registers (except cycle count)
3996 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3997 {
3998   int hr;
3999   for(hr=0;hr<HOST_REGS;hr++) {
4000     if(hr!=EXCLUDE_REG) {
4001       if(i_regmap[hr]>0) {
4002         if(i_regmap[hr]!=CCREG) {
4003           if((i_dirty>>hr)&1) {
4004             if(i_regmap[hr]<64) {
4005               emit_storereg(i_regmap[hr],hr);
4006             }else{
4007               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4008                 emit_storereg(i_regmap[hr],hr);
4009               }
4010             }
4011           }
4012         }
4013       }
4014     }
4015   }
4016 }
4017 // Write out the dirty registers that will need to be reloaded afterwards (used together with load_needed_regs)
4018 // This writes the registers not written by store_regs_bt
4019 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4020 {
4021   int hr;
4022   int t=(addr-start)>>2;
4023   for(hr=0;hr<HOST_REGS;hr++) {
4024     if(hr!=EXCLUDE_REG) {
4025       if(i_regmap[hr]>0) {
4026         if(i_regmap[hr]!=CCREG) {
4027           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4028             if((i_dirty>>hr)&1) {
4029               if(i_regmap[hr]<64) {
4030                 emit_storereg(i_regmap[hr],hr);
4031               }else{
4032                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4033                   emit_storereg(i_regmap[hr],hr);
4034                 }
4035               }
4036             }
4037           }
4038         }
4039       }
4040     }
4041   }
4042 }
4043
4044 // Load all registers (except cycle count)
4045 void load_all_regs(signed char i_regmap[])
4046 {
4047   int hr;
4048   for(hr=0;hr<HOST_REGS;hr++) {
4049     if(hr!=EXCLUDE_REG) {
4050       if(i_regmap[hr]==0) {
4051         emit_zeroreg(hr);
4052       }
4053       else
4054       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4055       {
4056         emit_loadreg(i_regmap[hr],hr);
4057       }
4058     }
4059   }
4060 }
4061
4062 // Load all current registers that are also needed by the next instruction
4063 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4064 {
4065   int hr;
4066   for(hr=0;hr<HOST_REGS;hr++) {
4067     if(hr!=EXCLUDE_REG) {
4068       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4069         if(i_regmap[hr]==0) {
4070           emit_zeroreg(hr);
4071         }
4072         else
4073         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4074         {
4075           emit_loadreg(i_regmap[hr],hr);
4076         }
4077       }
4078     }
4079   }
4080 }
4081
4082 // Load all regs, storing cycle count if necessary
4083 void load_regs_entry(int t)
4084 {
4085   int hr;
4086   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4087   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4088   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4089     emit_storereg(CCREG,HOST_CCREG);
4090   }
4091   // Load 32-bit regs
4092   for(hr=0;hr<HOST_REGS;hr++) {
4093     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4094       if(regs[t].regmap_entry[hr]==0) {
4095         emit_zeroreg(hr);
4096       }
4097       else if(regs[t].regmap_entry[hr]!=CCREG)
4098       {
4099         emit_loadreg(regs[t].regmap_entry[hr],hr);
4100       }
4101     }
4102   }
4103   // Load 64-bit regs
4104   for(hr=0;hr<HOST_REGS;hr++) {
4105     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4106       assert(regs[t].regmap_entry[hr]!=64);
4107       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4108         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4109         if(lr<0) {
4110           emit_loadreg(regs[t].regmap_entry[hr],hr);
4111         }
4112         else
4113         {
4114           emit_sarimm(lr,31,hr);
4115         }
4116       }
4117       else
4118       {
4119         emit_loadreg(regs[t].regmap_entry[hr],hr);
4120       }
4121     }
4122   }
4123 }
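/* Note on the cycle adjustment in load_regs_entry() above (interpretation of
 * the code, not verified against the scheduler): ccadj[] counts cycles from
 * the start of the block, and the checks emitted at branches charge
 * CLOCK_ADJUST(ccadj[i]+2) from that same origin, so entering the block in
 * the middle pre-subtracts CLOCK_ADJUST(ccadj[t]) (or adds one cycle for a
 * delay-slot entry) to avoid charging for the instructions that were not
 * executed here; the count is spilled to the CCREG slot whenever the entry
 * map does not keep it in HOST_CCREG. */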
4124
4125 // Store dirty registers prior to branch
4126 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4127 {
4128   if(internal_branch(i_is32,addr))
4129   {
4130     int t=(addr-start)>>2;
4131     int hr;
4132     for(hr=0;hr<HOST_REGS;hr++) {
4133       if(hr!=EXCLUDE_REG) {
4134         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4135           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4136             if((i_dirty>>hr)&1) {
4137               if(i_regmap[hr]<64) {
4138                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4139                   emit_storereg(i_regmap[hr],hr);
4140                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4141                     #ifdef DESTRUCTIVE_WRITEBACK
4142                     emit_sarimm(hr,31,hr);
4143                     emit_storereg(i_regmap[hr]|64,hr);
4144                     #else
4145                     emit_sarimm(hr,31,HOST_TEMPREG);
4146                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4147                     #endif
4148                   }
4149                 }
4150               }else{
4151                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4152                   emit_storereg(i_regmap[hr],hr);
4153                 }
4154               }
4155             }
4156           }
4157         }
4158       }
4159     }
4160   }
4161   else
4162   {
4163     // Branch out of this block, write out all dirty regs
4164     wb_dirtys(i_regmap,i_is32,i_dirty);
4165   }
4166 }
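/* Example for store_regs_bt() above: for a branch staying inside the block,
 * only the dirty values the target's entry state does not already expect to
 * be dirty (or whose 32/64-bit width no longer matches) are written back, so
 * a tight inner loop with a stable register allocation stores nothing at all
 * on the back edge; a branch leaving the block falls back to wb_dirtys() and
 * spills every dirty register except the cycle count. */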
4167
4168 // Load all needed registers for branch target
4169 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4170 {
4171   //if(addr>=start && addr<(start+slen*4))
4172   if(internal_branch(i_is32,addr))
4173   {
4174     int t=(addr-start)>>2;
4175     int hr;
4176     // Store the cycle count before loading something else
4177     if(i_regmap[HOST_CCREG]!=CCREG) {
4178       assert(i_regmap[HOST_CCREG]==-1);
4179     }
4180     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4181       emit_storereg(CCREG,HOST_CCREG);
4182     }
4183     // Load 32-bit regs
4184     for(hr=0;hr<HOST_REGS;hr++) {
4185       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4186         #ifdef DESTRUCTIVE_WRITEBACK
4187         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4188         #else
4189         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4190         #endif
4191           if(regs[t].regmap_entry[hr]==0) {
4192             emit_zeroreg(hr);
4193           }
4194           else if(regs[t].regmap_entry[hr]!=CCREG)
4195           {
4196             emit_loadreg(regs[t].regmap_entry[hr],hr);
4197           }
4198         }
4199       }
4200     }
4201     // Load 64-bit regs
4202     for(hr=0;hr<HOST_REGS;hr++) {
4203       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4204         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4205           assert(regs[t].regmap_entry[hr]!=64);
4206           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4207             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4208             if(lr<0) {
4209               emit_loadreg(regs[t].regmap_entry[hr],hr);
4210             }
4211             else
4212             {
4213               emit_sarimm(lr,31,hr);
4214             }
4215           }
4216           else
4217           {
4218             emit_loadreg(regs[t].regmap_entry[hr],hr);
4219           }
4220         }
4221         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4222           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4223           assert(lr>=0);
4224           emit_sarimm(lr,31,hr);
4225         }
4226       }
4227     }
4228   }
4229 }
4230
4231 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4232 {
4233   if(addr>=start && addr<start+slen*4-4)
4234   {
4235     int t=(addr-start)>>2;
4236     int hr;
4237     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4238     for(hr=0;hr<HOST_REGS;hr++)
4239     {
4240       if(hr!=EXCLUDE_REG)
4241       {
4242         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4243         {
4244           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4245           {
4246             return 0;
4247           }
4248           else
4249           if((i_dirty>>hr)&1)
4250           {
4251             if(i_regmap[hr]<TEMPREG)
4252             {
4253               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4254                 return 0;
4255             }
4256             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4257             {
4258               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4259                 return 0;
4260             }
4261           }
4262         }
4263         else // Same register but is it 32-bit or dirty?
4264         if(i_regmap[hr]>=0)
4265         {
4266           if(!((regs[t].dirty>>hr)&1))
4267           {
4268             if((i_dirty>>hr)&1)
4269             {
4270               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4271               {
4272                 //printf("%x: dirty no match\n",addr);
4273                 return 0;
4274               }
4275             }
4276           }
4277           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4278           {
4279             //printf("%x: is32 no match\n",addr);
4280             return 0;
4281           }
4282         }
4283       }
4284     }
4285     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4286     // Delay slots are not valid branch targets
4287     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4288     // Delay slots require additional processing, so do not match
4289     if(is_ds[t]) return 0;
4290   }
4291   else
4292   {
4293     int hr;
4294     for(hr=0;hr<HOST_REGS;hr++)
4295     {
4296       if(hr!=EXCLUDE_REG)
4297       {
4298         if(i_regmap[hr]>=0)
4299         {
4300           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4301           {
4302             if((i_dirty>>hr)&1)
4303             {
4304               return 0;
4305             }
4306           }
4307         }
4308       }
4309     }
4310   }
4311   return 1;
4312 }
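/* match_bt() above answers "can this branch jump straight to already-assembled
 * code at addr with no fix-up?".  It refuses when the target expects a MIPS
 * register in a host register we do not have there, when we hold dirty values
 * the target assumes are clean (and that are still needed), when the 32/64-bit
 * width assumptions differ, or when the target is a delay slot; a branch out
 * of the block only requires that nothing but the cycle counter be dirty.
 * The conditional branch assemblers below use the result to decide whether to
 * emit the inverted (fix-up) form of the branch. */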
4313
4314 #ifdef DRC_DBG
4315 static void drc_dbg_emit_do_cmp(int i)
4316 {
4317   extern void do_insn_cmp();
4318   extern int cycle;
4319   u_int hr,reglist=0;
4320
4321   for(hr=0;hr<HOST_REGS;hr++)
4322     if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
4323   save_regs(reglist);
4324   emit_movimm(start+i*4,0);
4325   emit_writeword(0,(int)&pcaddr);
4326   emit_call((int)do_insn_cmp);
4327   //emit_readword((int)&cycle,0);
4328   //emit_addimm(0,2,0);
4329   //emit_writeword(0,(int)&cycle);
4330   restore_regs(reglist);
4331 }
4332 #else
4333 #define drc_dbg_emit_do_cmp(x)
4334 #endif
4335
4336 // Used when a branch jumps into the delay slot of another branch
4337 void ds_assemble_entry(int i)
4338 {
4339   int t=(ba[i]-start)>>2;
4340   if (!instr_addr[t])
4341     instr_addr[t] = out;
4342   assem_debug("Assemble delay slot at %x\n",ba[i]);
4343   assem_debug("<->\n");
4344   drc_dbg_emit_do_cmp(t);
4345   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4346     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4347   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4348   address_generation(t,&regs[t],regs[t].regmap_entry);
4349   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4350     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4351   cop1_usable=0;
4352   is_delayslot=0;
4353   switch(itype[t]) {
4354     case ALU:
4355       alu_assemble(t,&regs[t]);break;
4356     case IMM16:
4357       imm16_assemble(t,&regs[t]);break;
4358     case SHIFT:
4359       shift_assemble(t,&regs[t]);break;
4360     case SHIFTIMM:
4361       shiftimm_assemble(t,&regs[t]);break;
4362     case LOAD:
4363       load_assemble(t,&regs[t]);break;
4364     case LOADLR:
4365       loadlr_assemble(t,&regs[t]);break;
4366     case STORE:
4367       store_assemble(t,&regs[t]);break;
4368     case STORELR:
4369       storelr_assemble(t,&regs[t]);break;
4370     case COP0:
4371       cop0_assemble(t,&regs[t]);break;
4372     case COP1:
4373       cop1_assemble(t,&regs[t]);break;
4374     case C1LS:
4375       c1ls_assemble(t,&regs[t]);break;
4376     case COP2:
4377       cop2_assemble(t,&regs[t]);break;
4378     case C2LS:
4379       c2ls_assemble(t,&regs[t]);break;
4380     case C2OP:
4381       c2op_assemble(t,&regs[t]);break;
4382     case FCONV:
4383       fconv_assemble(t,&regs[t]);break;
4384     case FLOAT:
4385       float_assemble(t,&regs[t]);break;
4386     case FCOMP:
4387       fcomp_assemble(t,&regs[t]);break;
4388     case MULTDIV:
4389       multdiv_assemble(t,&regs[t]);break;
4390     case MOV:
4391       mov_assemble(t,&regs[t]);break;
4392     case SYSCALL:
4393     case HLECALL:
4394     case INTCALL:
4395     case SPAN:
4396     case UJUMP:
4397     case RJUMP:
4398     case CJUMP:
4399     case SJUMP:
4400     case FJUMP:
4401       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4402   }
4403   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4404   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4405   if(internal_branch(regs[t].is32,ba[i]+4))
4406     assem_debug("branch: internal\n");
4407   else
4408     assem_debug("branch: external\n");
4409   assert(internal_branch(regs[t].is32,ba[i]+4));
4410   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4411   emit_jmp(0);
4412 }
4413
4414 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4415 {
4416   int count;
4417   void *jaddr;
4418   void *idle=NULL;
4419   int t=0;
4420   if(itype[i]==RJUMP)
4421   {
4422     *adj=0;
4423   }
4424   //if(ba[i]>=start && ba[i]<(start+slen*4))
4425   if(internal_branch(branch_regs[i].is32,ba[i]))
4426   {
4427     t=(ba[i]-start)>>2;
4428     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4429     else *adj=ccadj[t];
4430   }
4431   else
4432   {
4433     *adj=0;
4434   }
4435   count=ccadj[i];
4436   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4437     // Idle loop
4438     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4439     idle=out;
4440     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4441     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4442     jaddr=out;
4443     emit_jmp(0);
4444   }
4445   else if(*adj==0||invert) {
4446     int cycles=CLOCK_ADJUST(count+2);
4447     // faster loop HACK
4448     if (t&&*adj) {
4449       int rel=t-i;
4450       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4451         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4452     }
4453     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4454     jaddr=out;
4455     emit_jns(0);
4456   }
4457   else
4458   {
4459     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4460     jaddr=out;
4461     emit_jns(0);
4462   }
4463   add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4464 }
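/* Sketch of the check do_cc() emits for an ordinary non-inverted branch whose
 * internal target gives *adj=ccadj[t] (pseudo-assembly, not literal output):
 *
 *   cmp  HOST_CCREG, #-CLOCK_ADJUST(count+2)   ; emit_cmpimm
 *   jns  <CC_STUB>                             ; event due, take the stub
 *
 * The *adj==0/invert form instead adds CLOCK_ADJUST(count+2) to the counter
 * with flags and takes the stub on a non-negative result.  The idle-loop case
 * (a branch to itself with a nop in the delay slot) ands the counter with 3
 * and jumps to the stub unconditionally, which hands the remaining idle time
 * to cc_interrupt instead of spinning in recompiled code. */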
4465
4466 static void do_ccstub(int n)
4467 {
4468   literal_pool(256);
4469   assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
4470   set_jump_target(stubs[n].addr, out);
4471   int i=stubs[n].b;
4472   if(stubs[n].d==NULLDS) {
4473     // Delay slot instruction is nullified ("likely" branch)
4474     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4475   }
4476   else if(stubs[n].d!=TAKEN) {
4477     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4478   }
4479   else {
4480     if(internal_branch(branch_regs[i].is32,ba[i]))
4481       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4482   }
4483   if(stubs[n].c!=-1)
4484   {
4485     // Save PC as return address
4486     emit_movimm(stubs[n].c,EAX);
4487     emit_writeword(EAX,(int)&pcaddr);
4488   }
4489   else
4490   {
4491     // Return address depends on which way the branch goes
4492     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4493     {
4494       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4495       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4496       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4497       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4498       if(rs1[i]==0)
4499       {
4500         s1l=s2l;s1h=s2h;
4501         s2l=s2h=-1;
4502       }
4503       else if(rs2[i]==0)
4504       {
4505         s2l=s2h=-1;
4506       }
4507       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4508         s1h=s2h=-1;
4509       }
4510       assert(s1l>=0);
4511       #ifdef DESTRUCTIVE_WRITEBACK
4512       if(rs1[i]) {
4513         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4514           emit_loadreg(rs1[i],s1l);
4515       }
4516       else {
4517         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4518           emit_loadreg(rs2[i],s1l);
4519       }
4520       if(s2l>=0)
4521         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4522           emit_loadreg(rs2[i],s2l);
4523       #endif
4524       int hr=0;
4525       int addr=-1,alt=-1,ntaddr=-1;
4526       while(hr<HOST_REGS)
4527       {
4528         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4529            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4530            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4531         {
4532           addr=hr++;break;
4533         }
4534         hr++;
4535       }
4536       while(hr<HOST_REGS)
4537       {
4538         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4539            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4540            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4541         {
4542           alt=hr++;break;
4543         }
4544         hr++;
4545       }
4546       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4547       {
4548         while(hr<HOST_REGS)
4549         {
4550           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4551              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4552              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4553           {
4554             ntaddr=hr;break;
4555           }
4556           hr++;
4557         }
4558         assert(hr<HOST_REGS);
4559       }
4560       if((opcode[i]&0x2f)==4) // BEQ
4561       {
4562         #ifdef HAVE_CMOV_IMM
4563         if(s1h<0) {
4564           if(s2l>=0) emit_cmp(s1l,s2l);
4565           else emit_test(s1l,s1l);
4566           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4567         }
4568         else
4569         #endif
4570         {
4571           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4572           if(s1h>=0) {
4573             if(s2h>=0) emit_cmp(s1h,s2h);
4574             else emit_test(s1h,s1h);
4575             emit_cmovne_reg(alt,addr);
4576           }
4577           if(s2l>=0) emit_cmp(s1l,s2l);
4578           else emit_test(s1l,s1l);
4579           emit_cmovne_reg(alt,addr);
4580         }
4581       }
4582       if((opcode[i]&0x2f)==5) // BNE
4583       {
4584         #ifdef HAVE_CMOV_IMM
4585         if(s1h<0) {
4586           if(s2l>=0) emit_cmp(s1l,s2l);
4587           else emit_test(s1l,s1l);
4588           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4589         }
4590         else
4591         #endif
4592         {
4593           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4594           if(s1h>=0) {
4595             if(s2h>=0) emit_cmp(s1h,s2h);
4596             else emit_test(s1h,s1h);
4597             emit_cmovne_reg(alt,addr);
4598           }
4599           if(s2l>=0) emit_cmp(s1l,s2l);
4600           else emit_test(s1l,s1l);
4601           emit_cmovne_reg(alt,addr);
4602         }
4603       }
4604       if((opcode[i]&0x2f)==6) // BLEZ
4605       {
4606         //emit_movimm(ba[i],alt);
4607         //emit_movimm(start+i*4+8,addr);
4608         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4609         emit_cmpimm(s1l,1);
4610         if(s1h>=0) emit_mov(addr,ntaddr);
4611         emit_cmovl_reg(alt,addr);
4612         if(s1h>=0) {
4613           emit_test(s1h,s1h);
4614           emit_cmovne_reg(ntaddr,addr);
4615           emit_cmovs_reg(alt,addr);
4616         }
4617       }
4618       if((opcode[i]&0x2f)==7) // BGTZ
4619       {
4620         //emit_movimm(ba[i],addr);
4621         //emit_movimm(start+i*4+8,ntaddr);
4622         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4623         emit_cmpimm(s1l,1);
4624         if(s1h>=0) emit_mov(addr,alt);
4625         emit_cmovl_reg(ntaddr,addr);
4626         if(s1h>=0) {
4627           emit_test(s1h,s1h);
4628           emit_cmovne_reg(alt,addr);
4629           emit_cmovs_reg(ntaddr,addr);
4630         }
4631       }
4632       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4633       {
4634         //emit_movimm(ba[i],alt);
4635         //emit_movimm(start+i*4+8,addr);
4636         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4637         if(s1h>=0) emit_test(s1h,s1h);
4638         else emit_test(s1l,s1l);
4639         emit_cmovs_reg(alt,addr);
4640       }
4641       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4642       {
4643         //emit_movimm(ba[i],addr);
4644         //emit_movimm(start+i*4+8,alt);
4645         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4646         if(s1h>=0) emit_test(s1h,s1h);
4647         else emit_test(s1l,s1l);
4648         emit_cmovs_reg(alt,addr);
4649       }
4650       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4651         if(source[i]&0x10000) // BC1T
4652         {
4653           //emit_movimm(ba[i],alt);
4654           //emit_movimm(start+i*4+8,addr);
4655           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4656           emit_testimm(s1l,0x800000);
4657           emit_cmovne_reg(alt,addr);
4658         }
4659         else // BC1F
4660         {
4661           //emit_movimm(ba[i],addr);
4662           //emit_movimm(start+i*4+8,alt);
4663           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4664           emit_testimm(s1l,0x800000);
4665           emit_cmovne_reg(alt,addr);
4666         }
4667       }
4668       emit_writeword(addr,(int)&pcaddr);
4669     }
4670     else
4671     if(itype[i]==RJUMP)
4672     {
4673       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4674       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4675         r=get_reg(branch_regs[i].regmap,RTEMP);
4676       }
4677       emit_writeword(r,(int)&pcaddr);
4678     }
4679     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4680   }
4681   // Update cycle count
4682   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4683   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n].a),HOST_CCREG);
4684   emit_call((int)cc_interrupt);
4685   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n].a),HOST_CCREG);
4686   if(stubs[n].d==TAKEN) {
4687     if(internal_branch(branch_regs[i].is32,ba[i]))
4688       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4689     else if(itype[i]==RJUMP) {
4690       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4691         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4692       else
4693         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4694     }
4695   }else if(stubs[n].d==NOTTAKEN) {
4696     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4697     else load_all_regs(branch_regs[i].regmap);
4698   }else if(stubs[n].d==NULLDS) {
4699     // Delay slot instruction is nullified ("likely" branch)
4700     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4701     else load_all_regs(regs[i].regmap);
4702   }else{
4703     load_all_regs(branch_regs[i].regmap);
4704   }
4705   emit_jmp(stubs[n].retaddr);
4706
4707   /* This works but uses a lot of memory...
4708   emit_readword((int)&last_count,ECX);
4709   emit_add(HOST_CCREG,ECX,EAX);
4710   emit_writeword(EAX,(int)&Count);
4711   emit_call((int)gen_interupt);
4712   emit_readword((int)&Count,HOST_CCREG);
4713   emit_readword((int)&next_interupt,EAX);
4714   emit_readword((int)&pending_exception,EBX);
4715   emit_writeword(EAX,(int)&last_count);
4716   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4717   emit_test(EBX,EBX);
4718   int jne_instr=(int)out;
4719   emit_jne(0);
4720   if(stubs[n].a) emit_addimm(HOST_CCREG,-2*stubs[n].a,HOST_CCREG);
4721   load_all_regs(branch_regs[i].regmap);
4722   emit_jmp(stubs[n].retaddr); // return address
4723   set_jump_target(jne_instr,(int)out);
4724   emit_readword((int)&pcaddr,EAX);
4725   // Call get_addr_ht instead of doing the hash table here.
4726   // This code is executed infrequently and takes up a lot of space
4727   // so smaller is better.
4728   emit_storereg(CCREG,HOST_CCREG);
4729   emit_pushreg(EAX);
4730   emit_call((int)get_addr_ht);
4731   emit_loadreg(CCREG,HOST_CCREG);
4732   emit_addimm(ESP,4,ESP);
4733   emit_jmpreg(EAX);*/
4734 }
4735
4736 static void add_to_linker(int addr,int target,int ext)
4737 {
4738   link_addr[linkcount][0]=addr;
4739   link_addr[linkcount][1]=target;
4740   link_addr[linkcount][2]=ext;
4741   linkcount++;
4742 }
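/* Each linker entry recorded by add_to_linker() is a patch site: addr points
 * at the jump just emitted (its target still 0), target is the MIPS address it
 * must reach, and ext flags targets outside the current block.  Once the block
 * is finished the recorded jumps are patched, either straight to code in the
 * same block or to the lookup path that finds (or compiles) the external
 * target. */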
4743
4744 static void ujump_assemble_write_ra(int i)
4745 {
4746   int rt;
4747   unsigned int return_address;
4748   rt=get_reg(branch_regs[i].regmap,31);
4749   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4750   //assert(rt>=0);
4751   return_address=start+i*4+8;
4752   if(rt>=0) {
4753     #ifdef USE_MINI_HT
4754     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4755       int temp=-1; // note: must be ds-safe
4756       #ifdef HOST_TEMPREG
4757       temp=HOST_TEMPREG;
4758       #endif
4759       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4760       else emit_movimm(return_address,rt);
4761     }
4762     else
4763     #endif
4764     {
4765       #ifdef REG_PREFETCH
4766       if(temp>=0)
4767       {
4768         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4769       }
4770       #endif
4771       emit_movimm(return_address,rt); // PC into link register
4772       #ifdef IMM_PREFETCH
4773       emit_prefetch(hash_table_get(return_address));
4774       #endif
4775     }
4776   }
4777 }
4778
4779 void ujump_assemble(int i,struct regstat *i_regs)
4780 {
4781   int ra_done=0;
4782   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4783   address_generation(i+1,i_regs,regs[i].regmap_entry);
4784   #ifdef REG_PREFETCH
4785   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4786   if(rt1[i]==31&&temp>=0)
4787   {
4788     signed char *i_regmap=i_regs->regmap;
4789     int return_address=start+i*4+8;
4790     if(get_reg(branch_regs[i].regmap,31)>0)
4791     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4792   }
4793   #endif
4794   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4795     ujump_assemble_write_ra(i); // writeback ra for DS
4796     ra_done=1;
4797   }
4798   ds_assemble(i+1,i_regs);
4799   uint64_t bc_unneeded=branch_regs[i].u;
4800   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4801   bc_unneeded|=1|(1LL<<rt1[i]);
4802   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4803   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4804                 bc_unneeded,bc_unneeded_upper);
4805   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4806   if(!ra_done&&rt1[i]==31)
4807     ujump_assemble_write_ra(i);
4808   int cc,adj;
4809   cc=get_reg(branch_regs[i].regmap,CCREG);
4810   assert(cc==HOST_CCREG);
4811   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4812   #ifdef REG_PREFETCH
4813   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4814   #endif
4815   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4816   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4817   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4818   if(internal_branch(branch_regs[i].is32,ba[i]))
4819     assem_debug("branch: internal\n");
4820   else
4821     assem_debug("branch: external\n");
4822   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4823     ds_assemble_entry(i);
4824   }
4825   else {
4826     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4827     emit_jmp(0);
4828   }
4829 }
4830
4831 static void rjump_assemble_write_ra(int i)
4832 {
4833   int rt,return_address;
4834   assert(rt1[i+1]!=rt1[i]);
4835   assert(rt2[i+1]!=rt1[i]);
4836   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4837   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4838   assert(rt>=0);
4839   return_address=start+i*4+8;
4840   #ifdef REG_PREFETCH
4841   if(temp>=0)
4842   {
4843     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4844   }
4845   #endif
4846   emit_movimm(return_address,rt); // PC into link register
4847   #ifdef IMM_PREFETCH
4848   emit_prefetch(hash_table_get(return_address));
4849   #endif
4850 }
4851
4852 void rjump_assemble(int i,struct regstat *i_regs)
4853 {
4854   int temp;
4855   int rs,cc;
4856   int ra_done=0;
4857   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4858   assert(rs>=0);
4859   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4860     // Delay slot abuse, make a copy of the branch address register
4861     temp=get_reg(branch_regs[i].regmap,RTEMP);
4862     assert(temp>=0);
4863     assert(regs[i].regmap[temp]==RTEMP);
4864     emit_mov(rs,temp);
4865     rs=temp;
4866   }
4867   address_generation(i+1,i_regs,regs[i].regmap_entry);
4868   #ifdef REG_PREFETCH
4869   if(rt1[i]==31)
4870   {
4871     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4872       signed char *i_regmap=i_regs->regmap;
4873       int return_address=start+i*4+8;
4874       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4875     }
4876   }
4877   #endif
4878   #ifdef USE_MINI_HT
4879   if(rs1[i]==31) {
4880     int rh=get_reg(regs[i].regmap,RHASH);
4881     if(rh>=0) do_preload_rhash(rh);
4882   }
4883   #endif
4884   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4885     rjump_assemble_write_ra(i);
4886     ra_done=1;
4887   }
4888   ds_assemble(i+1,i_regs);
4889   uint64_t bc_unneeded=branch_regs[i].u;
4890   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4891   bc_unneeded|=1|(1LL<<rt1[i]);
4892   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4893   bc_unneeded&=~(1LL<<rs1[i]);
4894   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4895                 bc_unneeded,bc_unneeded_upper);
4896   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4897   if(!ra_done&&rt1[i]!=0)
4898     rjump_assemble_write_ra(i);
4899   cc=get_reg(branch_regs[i].regmap,CCREG);
4900   assert(cc==HOST_CCREG);
4901   (void)cc;
4902   #ifdef USE_MINI_HT
4903   int rh=get_reg(branch_regs[i].regmap,RHASH);
4904   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4905   if(rs1[i]==31) {
4906     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4907     do_preload_rhtbl(ht);
4908     do_rhash(rs,rh);
4909   }
4910   #endif
4911   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4912   #ifdef DESTRUCTIVE_WRITEBACK
4913   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4914     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4915       emit_loadreg(rs1[i],rs);
4916     }
4917   }
4918   #endif
4919   #ifdef REG_PREFETCH
4920   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4921   #endif
4922   #ifdef USE_MINI_HT
4923   if(rs1[i]==31) {
4924     do_miniht_load(ht,rh);
4925   }
4926   #endif
4927   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4928   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4929   //assert(adj==0);
4930   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4931   add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4932   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4933     // special case for RFE
4934     emit_jmp(0);
4935   else
4936     emit_jns(0);
4937   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4938   #ifdef USE_MINI_HT
4939   if(rs1[i]==31) {
4940     do_miniht_jump(rs,rh,ht);
4941   }
4942   else
4943   #endif
4944   {
4945     //if(rs!=EAX) emit_mov(rs,EAX);
4946     //emit_jmp(jump_vaddr_eax);
4947     emit_jmp(jump_vaddr_reg[rs]);
4948   }
4949   /* Check hash table
4950   temp=!rs;
4951   emit_mov(rs,temp);
4952   emit_shrimm(rs,16,rs);
4953   emit_xor(temp,rs,rs);
4954   emit_movzwl_reg(rs,rs);
4955   emit_shlimm(rs,4,rs);
4956   emit_cmpmem_indexed((int)hash_table,rs,temp);
4957   emit_jne((int)out+14);
4958   emit_readword_indexed((int)hash_table+4,rs,rs);
4959   emit_jmpreg(rs);
4960   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4961   emit_addimm_no_flags(8,rs);
4962   emit_jeq((int)out-17);
4963   // No hit on hash table, call compiler
4964   emit_pushreg(temp);
4965 //DEBUG >
4966 #ifdef DEBUG_CYCLE_COUNT
4967   emit_readword((int)&last_count,ECX);
4968   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4969   emit_readword((int)&next_interupt,ECX);
4970   emit_writeword(HOST_CCREG,(int)&Count);
4971   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4972   emit_writeword(ECX,(int)&last_count);
4973 #endif
4974 //DEBUG <
4975   emit_storereg(CCREG,HOST_CCREG);
4976   emit_call((int)get_addr);
4977   emit_loadreg(CCREG,HOST_CCREG);
4978   emit_addimm(ESP,4,ESP);
4979   emit_jmpreg(EAX);*/
4980   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4981   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4982   #endif
4983 }
4984
4985 void cjump_assemble(int i,struct regstat *i_regs)
4986 {
4987   signed char *i_regmap=i_regs->regmap;
4988   int cc;
4989   int match;
4990   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4991   assem_debug("match=%d\n",match);
4992   int s1h,s1l,s2h,s2l;
4993   int prev_cop1_usable=cop1_usable;
4994   int unconditional=0,nop=0;
4995   int only32=0;
4996   int invert=0;
4997   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4998   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4999   if(!match) invert=1;
5000   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5001   if(i>(ba[i]-start)>>2) invert=1;
5002   #endif
5003
5004   if(ooo[i]) {
5005     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5006     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5007     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5008     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5009   }
5010   else {
5011     s1l=get_reg(i_regmap,rs1[i]);
5012     s1h=get_reg(i_regmap,rs1[i]|64);
5013     s2l=get_reg(i_regmap,rs2[i]);
5014     s2h=get_reg(i_regmap,rs2[i]|64);
5015   }
5016   if(rs1[i]==0&&rs2[i]==0)
5017   {
5018     if(opcode[i]&1) nop=1;
5019     else unconditional=1;
5020     //assert(opcode[i]!=5);
5021     //assert(opcode[i]!=7);
5022     //assert(opcode[i]!=0x15);
5023     //assert(opcode[i]!=0x17);
5024   }
5025   else if(rs1[i]==0)
5026   {
5027     s1l=s2l;s1h=s2h;
5028     s2l=s2h=-1;
5029     only32=(regs[i].was32>>rs2[i])&1;
5030   }
5031   else if(rs2[i]==0)
5032   {
5033     s2l=s2h=-1;
5034     only32=(regs[i].was32>>rs1[i])&1;
5035   }
5036   else {
5037     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5038   }
5039
5040   if(ooo[i]) {
5041     // Out of order execution (delay slot first)
5042     //printf("OOOE\n");
5043     address_generation(i+1,i_regs,regs[i].regmap_entry);
5044     ds_assemble(i+1,i_regs);
5045     int adj;
5046     uint64_t bc_unneeded=branch_regs[i].u;
5047     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5048     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5049     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5050     bc_unneeded|=1;
5051     bc_unneeded_upper|=1;
5052     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5053                   bc_unneeded,bc_unneeded_upper);
5054     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5055     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5056     cc=get_reg(branch_regs[i].regmap,CCREG);
5057     assert(cc==HOST_CCREG);
5058     if(unconditional)
5059       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5060     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5061     //assem_debug("cycle count (adj)\n");
5062     if(unconditional) {
5063       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5064       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5065         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5066         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5067         if(internal)
5068           assem_debug("branch: internal\n");
5069         else
5070           assem_debug("branch: external\n");
5071         if(internal&&is_ds[(ba[i]-start)>>2]) {
5072           ds_assemble_entry(i);
5073         }
5074         else {
5075           add_to_linker((int)out,ba[i],internal);
5076           emit_jmp(0);
5077         }
5078         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5079         if(((u_int)out)&7) emit_addnop(0);
5080         #endif
5081       }
5082     }
5083     else if(nop) {
5084       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5085       void *jaddr=out;
5086       emit_jns(0);
5087       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5088     }
5089     else {
5090       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5091       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5092       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5093       if(!only32)
5094       {
5095         assert(s1h>=0);
5096         if(opcode[i]==4) // BEQ
5097         {
5098           if(s2h>=0) emit_cmp(s1h,s2h);
5099           else emit_test(s1h,s1h);
5100           nottaken1=out;
5101           emit_jne(1);
5102         }
5103         if(opcode[i]==5) // BNE
5104         {
5105           if(s2h>=0) emit_cmp(s1h,s2h);
5106           else emit_test(s1h,s1h);
5107           if(invert) taken=out;
5108           else add_to_linker((int)out,ba[i],internal);
5109           emit_jne(0);
5110         }
5111         if(opcode[i]==6) // BLEZ
5112         {
5113           emit_test(s1h,s1h);
5114           if(invert) taken=out;
5115           else add_to_linker((int)out,ba[i],internal);
5116           emit_js(0);
5117           nottaken1=out;
5118           emit_jne(1);
5119         }
5120         if(opcode[i]==7) // BGTZ
5121         {
5122           emit_test(s1h,s1h);
5123           nottaken1=out;
5124           emit_js(1);
5125           if(invert) taken=out;
5126           else add_to_linker((int)out,ba[i],internal);
5127           emit_jne(0);
5128         }
5129       } // if(!only32)
5130
5131       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5132       assert(s1l>=0);
5133       if(opcode[i]==4) // BEQ
5134       {
5135         if(s2l>=0) emit_cmp(s1l,s2l);
5136         else emit_test(s1l,s1l);
5137         if(invert){
5138           nottaken=out;
5139           emit_jne(1);
5140         }else{
5141           add_to_linker((int)out,ba[i],internal);
5142           emit_jeq(0);
5143         }
5144       }
5145       if(opcode[i]==5) // BNE
5146       {
5147         if(s2l>=0) emit_cmp(s1l,s2l);
5148         else emit_test(s1l,s1l);
5149         if(invert){
5150           nottaken=out;
5151           emit_jeq(1);
5152         }else{
5153           add_to_linker((int)out,ba[i],internal);
5154           emit_jne(0);
5155         }
5156       }
5157       if(opcode[i]==6) // BLEZ
5158       {
5159         emit_cmpimm(s1l,1);
5160         if(invert){
5161           nottaken=out;
5162           emit_jge(1);
5163         }else{
5164           add_to_linker((int)out,ba[i],internal);
5165           emit_jl(0);
5166         }
5167       }
5168       if(opcode[i]==7) // BGTZ
5169       {
5170         emit_cmpimm(s1l,1);
5171         if(invert){
5172           nottaken=out;
5173           emit_jl(1);
5174         }else{
5175           add_to_linker((int)out,ba[i],internal);
5176           emit_jge(0);
5177         }
5178       }
5179       if(invert) {
5180         if(taken) set_jump_target(taken, out);
5181         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5182         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5183           if(adj) {
5184             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5185             add_to_linker((int)out,ba[i],internal);
5186           }else{
5187             emit_addnop(13);
5188             add_to_linker((int)out,ba[i],internal*2);
5189           }
5190           emit_jmp(0);
5191         }else
5192         #endif
5193         {
5194           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5195           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5196           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5197           if(internal)
5198             assem_debug("branch: internal\n");
5199           else
5200             assem_debug("branch: external\n");
5201           if(internal&&is_ds[(ba[i]-start)>>2]) {
5202             ds_assemble_entry(i);
5203           }
5204           else {
5205             add_to_linker((int)out,ba[i],internal);
5206             emit_jmp(0);
5207           }
5208         }
5209         set_jump_target(nottaken, out);
5210       }
5211
5212       if(nottaken1) set_jump_target(nottaken1, out);
5213       if(adj) {
5214         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5215       }
5216     } // (!unconditional)
5217   } // if(ooo)
5218   else
5219   {
5220     // In-order execution (branch first)
5221     //if(likely[i]) printf("IOL\n");
5222     //else
5223     //printf("IOE\n");
5224     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5225     if(!unconditional&&!nop) {
5226       if(!only32)
5227       {
5228         assert(s1h>=0);
5229         if((opcode[i]&0x2f)==4) // BEQ
5230         {
5231           if(s2h>=0) emit_cmp(s1h,s2h);
5232           else emit_test(s1h,s1h);
5233           nottaken1=out;
5234           emit_jne(2);
5235         }
5236         if((opcode[i]&0x2f)==5) // BNE
5237         {
5238           if(s2h>=0) emit_cmp(s1h,s2h);
5239           else emit_test(s1h,s1h);
5240           taken=out;
5241           emit_jne(1);
5242         }
5243         if((opcode[i]&0x2f)==6) // BLEZ
5244         {
5245           emit_test(s1h,s1h);
5246           taken=out;
5247           emit_js(1);
5248           nottaken1=out;
5249           emit_jne(2);
5250         }
5251         if((opcode[i]&0x2f)==7) // BGTZ
5252         {
5253           emit_test(s1h,s1h);
5254           nottaken1=out;
5255           emit_js(2);
5256           taken=out;
5257           emit_jne(1);
5258         }
5259       } // if(!only32)
5260
5261       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5262       assert(s1l>=0);
5263       if((opcode[i]&0x2f)==4) // BEQ
5264       {
5265         if(s2l>=0) emit_cmp(s1l,s2l);
5266         else emit_test(s1l,s1l);
5267         nottaken=out;
5268         emit_jne(2);
5269       }
5270       if((opcode[i]&0x2f)==5) // BNE
5271       {
5272         if(s2l>=0) emit_cmp(s1l,s2l);
5273         else emit_test(s1l,s1l);
5274         nottaken=out;
5275         emit_jeq(2);
5276       }
5277       if((opcode[i]&0x2f)==6) // BLEZ
5278       {
5279         emit_cmpimm(s1l,1);
5280         nottaken=out;
5281         emit_jge(2);
5282       }
5283       if((opcode[i]&0x2f)==7) // BGTZ
5284       {
5285         emit_cmpimm(s1l,1);
5286         nottaken=out;
5287         emit_jl(2);
5288       }
5289     } // if(!unconditional)
5290     int adj;
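    // Work out which registers the delay slot still needs so the writeback
    // below doesn't invalidate its sources.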
5291     uint64_t ds_unneeded=branch_regs[i].u;
5292     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5293     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5294     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5295     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5296     ds_unneeded|=1;
5297     ds_unneeded_upper|=1;
5298     // branch taken
5299     if(!nop) {
5300       if(taken) set_jump_target(taken, out);
5301       assem_debug("1:\n");
5302       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5303                     ds_unneeded,ds_unneeded_upper);
5304       // load regs
5305       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5306       address_generation(i+1,&branch_regs[i],0);
5307       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5308       ds_assemble(i+1,&branch_regs[i]);
5309       cc=get_reg(branch_regs[i].regmap,CCREG);
5310       if(cc==-1) {
5311         emit_loadreg(CCREG,cc=HOST_CCREG);
5312         // CHECK: Is the following instruction (fall thru) allocated ok?
5313       }
5314       assert(cc==HOST_CCREG);
5315       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5316       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5317       assem_debug("cycle count (adj)\n");
5318       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5319       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5320       if(internal)
5321         assem_debug("branch: internal\n");
5322       else
5323         assem_debug("branch: external\n");
5324       if(internal&&is_ds[(ba[i]-start)>>2]) {
5325         ds_assemble_entry(i);
5326       }
5327       else {
5328         add_to_linker((int)out,ba[i],internal);
5329         emit_jmp(0);
5330       }
5331     }
5332     // branch not taken
5333     cop1_usable=prev_cop1_usable;
5334     if(!unconditional) {
5335       if(nottaken1) set_jump_target(nottaken1, out);
5336       set_jump_target(nottaken, out);
5337       assem_debug("2:\n");
5338       if(!likely[i]) {
5339         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5340                       ds_unneeded,ds_unneeded_upper);
5341         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5342         address_generation(i+1,&branch_regs[i],0);
5343         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5344         ds_assemble(i+1,&branch_regs[i]);
5345       }
5346       cc=get_reg(branch_regs[i].regmap,CCREG);
5347       if(cc==-1&&!likely[i]) {
5348         // Cycle count isn't in a register, temporarily load it then write it out
5349         emit_loadreg(CCREG,HOST_CCREG);
5350         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5351         void *jaddr=out;
5352         emit_jns(0);
5353         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5354         emit_storereg(CCREG,HOST_CCREG);
5355       }
5356       else{
5357         cc=get_reg(i_regmap,CCREG);
5358         assert(cc==HOST_CCREG);
5359         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5360         void *jaddr=out;
5361         emit_jns(0);
5362         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5363       }
5364     }
5365   }
5366 }
5367
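// Assemble a REGIMM branch on the sign of a single GPR (BLTZ/BGEZ and
// their AL/likely variants).  For the linking forms the return address is
// written to $31 even when the branch is not taken, as MIPS requires.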
5368 void sjump_assemble(int i,struct regstat *i_regs)
5369 {
5370   signed char *i_regmap=i_regs->regmap;
5371   int cc;
5372   int match;
5373   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5374   assem_debug("smatch=%d\n",match);
5375   int s1h,s1l;
5376   int prev_cop1_usable=cop1_usable;
5377   int unconditional=0,nevertaken=0;
5378   int only32=0;
5379   int invert=0;
5380   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5381   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5382   if(!match) invert=1;
5383   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5384   if(i>(ba[i]-start)>>2) invert=1;
5385   #endif
5386
5387   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5388   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5389
5390   if(ooo[i]) {
5391     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5392     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5393   }
5394   else {
5395     s1l=get_reg(i_regmap,rs1[i]);
5396     s1h=get_reg(i_regmap,rs1[i]|64);
5397   }
5398   if(rs1[i]==0)
5399   {
5400     if(opcode2[i]&1) unconditional=1;
5401     else nevertaken=1;
5402     // These are never taken (r0 is never less than zero)
5403     //assert(opcode2[i]!=0);
5404     //assert(opcode2[i]!=2);
5405     //assert(opcode2[i]!=0x10);
5406     //assert(opcode2[i]!=0x12);
5407   }
5408   else {
5409     only32=(regs[i].was32>>rs1[i])&1;
5410   }
5411
5412   if(ooo[i]) {
5413     // Out of order execution (delay slot first)
5414     //printf("OOOE\n");
5415     address_generation(i+1,i_regs,regs[i].regmap_entry);
5416     ds_assemble(i+1,i_regs);
5417     int adj;
5418     uint64_t bc_unneeded=branch_regs[i].u;
5419     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5420     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5421     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5422     bc_unneeded|=1;
5423     bc_unneeded_upper|=1;
5424     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5425                   bc_unneeded,bc_unneeded_upper);
5426     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5427     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5428     if(rt1[i]==31) {
5429       int rt,return_address;
5430       rt=get_reg(branch_regs[i].regmap,31);
5431       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5432       if(rt>=0) {
5433         // Save the PC even if the branch is not taken
5434         return_address=start+i*4+8;
5435         emit_movimm(return_address,rt); // PC into link register
5436         #ifdef IMM_PREFETCH
5437         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
5438         #endif
5439       }
5440     }
5441     cc=get_reg(branch_regs[i].regmap,CCREG);
5442     assert(cc==HOST_CCREG);
5443     if(unconditional)
5444       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5445     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5446     assem_debug("cycle count (adj)\n");
5447     if(unconditional) {
5448       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5449       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5450         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5451         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5452         if(internal)
5453           assem_debug("branch: internal\n");
5454         else
5455           assem_debug("branch: external\n");
5456         if(internal&&is_ds[(ba[i]-start)>>2]) {
5457           ds_assemble_entry(i);
5458         }
5459         else {
5460           add_to_linker((int)out,ba[i],internal);
5461           emit_jmp(0);
5462         }
5463         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5464         if(((u_int)out)&7) emit_addnop(0);
5465         #endif
5466       }
5467     }
5468     else if(nevertaken) {
5469       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5470       void *jaddr=out;
5471       emit_jns(0);
5472       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5473     }
5474     else {
5475       void *nottaken = NULL;
5476       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5477       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5478       if(!only32)
5479       {
5480         assert(s1h>=0);
5481         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5482         {
5483           emit_test(s1h,s1h);
5484           if(invert){
5485             nottaken=out;
5486             emit_jns(1);
5487           }else{
5488             add_to_linker((int)out,ba[i],internal);
5489             emit_js(0);
5490           }
5491         }
5492         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5493         {
5494           emit_test(s1h,s1h);
5495           if(invert){
5496             nottaken=out;
5497             emit_js(1);
5498           }else{
5499             add_to_linker((int)out,ba[i],internal);
5500             emit_jns(0);
5501           }
5502         }
5503       } // if(!only32)
5504       else
5505       {
5506         assert(s1l>=0);
5507         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5508         {
5509           emit_test(s1l,s1l);
5510           if(invert){
5511             nottaken=out;
5512             emit_jns(1);
5513           }else{
5514             add_to_linker((int)out,ba[i],internal);
5515             emit_js(0);
5516           }
5517         }
5518         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5519         {
5520           emit_test(s1l,s1l);
5521           if(invert){
5522             nottaken=out;
5523             emit_js(1);
5524           }else{
5525             add_to_linker((int)out,ba[i],internal);
5526             emit_jns(0);
5527           }
5528         }
5529       } // if(!only32)
5530
5531       if(invert) {
5532         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5533         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5534           if(adj) {
5535             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5536             add_to_linker((int)out,ba[i],internal);
5537           }else{
5538             emit_addnop(13);
5539             add_to_linker((int)out,ba[i],internal*2);
5540           }
5541           emit_jmp(0);
5542         }else
5543         #endif
5544         {
5545           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5546           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5547           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5548           if(internal)
5549             assem_debug("branch: internal\n");
5550           else
5551             assem_debug("branch: external\n");
5552           if(internal&&is_ds[(ba[i]-start)>>2]) {
5553             ds_assemble_entry(i);
5554           }
5555           else {
5556             add_to_linker((int)out,ba[i],internal);
5557             emit_jmp(0);
5558           }
5559         }
5560         set_jump_target(nottaken, out);
5561       }
5562
5563       if(adj) {
5564         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5565       }
5566     } // (!unconditional)
5567   } // if(ooo)
5568   else
5569   {
5570     // In-order execution (branch first)
5571     //printf("IOE\n");
5572     void *nottaken = NULL;
5573     if(rt1[i]==31) {
5574       int rt,return_address;
5575       rt=get_reg(branch_regs[i].regmap,31);
5576       if(rt>=0) {
5577         // Save the PC even if the branch is not taken
5578         return_address=start+i*4+8;
5579         emit_movimm(return_address,rt); // PC into link register
5580         #ifdef IMM_PREFETCH
5581         emit_prefetch(hash_table_get(return_address));
5582         #endif
5583       }
5584     }
5585     if(!unconditional) {
5586       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5587       if(!only32)
5588       {
5589         assert(s1h>=0);
5590         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5591         {
5592           emit_test(s1h,s1h);
5593           nottaken=out;
5594           emit_jns(1);
5595         }
5596         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5597         {
5598           emit_test(s1h,s1h);
5599           nottaken=out;
5600           emit_js(1);
5601         }
5602       } // if(!only32)
5603       else
5604       {
5605         assert(s1l>=0);
5606         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5607         {
5608           emit_test(s1l,s1l);
5609           nottaken=out;
5610           emit_jns(1);
5611         }
5612         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5613         {
5614           emit_test(s1l,s1l);
5615           nottaken=out;
5616           emit_js(1);
5617         }
5618       }
5619     } // if(!unconditional)
5620     int adj;
5621     uint64_t ds_unneeded=branch_regs[i].u;
5622     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5623     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5624     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5625     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5626     ds_unneeded|=1;
5627     ds_unneeded_upper|=1;
5628     // branch taken
5629     if(!nevertaken) {
5630       //assem_debug("1:\n");
5631       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5632                     ds_unneeded,ds_unneeded_upper);
5633       // load regs
5634       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5635       address_generation(i+1,&branch_regs[i],0);
5636       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5637       ds_assemble(i+1,&branch_regs[i]);
5638       cc=get_reg(branch_regs[i].regmap,CCREG);
5639       if(cc==-1) {
5640         emit_loadreg(CCREG,cc=HOST_CCREG);
5641         // CHECK: Is the following instruction (fall thru) allocated ok?
5642       }
5643       assert(cc==HOST_CCREG);
5644       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5645       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5646       assem_debug("cycle count (adj)\n");
5647       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5648       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5649       if(internal)
5650         assem_debug("branch: internal\n");
5651       else
5652         assem_debug("branch: external\n");
5653       if(internal&&is_ds[(ba[i]-start)>>2]) {
5654         ds_assemble_entry(i);
5655       }
5656       else {
5657         add_to_linker((int)out,ba[i],internal);
5658         emit_jmp(0);
5659       }
5660     }
5661     // branch not taken
5662     cop1_usable=prev_cop1_usable;
5663     if(!unconditional) {
5664       set_jump_target(nottaken, out);
5665       assem_debug("1:\n");
5666       if(!likely[i]) {
5667         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5668                       ds_unneeded,ds_unneeded_upper);
5669         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5670         address_generation(i+1,&branch_regs[i],0);
5671         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5672         ds_assemble(i+1,&branch_regs[i]);
5673       }
5674       cc=get_reg(branch_regs[i].regmap,CCREG);
5675       if(cc==-1&&!likely[i]) {
5676         // Cycle count isn't in a register, temporarily load it then write it out
5677         emit_loadreg(CCREG,HOST_CCREG);
5678         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5679         void *jaddr=out;
5680         emit_jns(0);
5681         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5682         emit_storereg(CCREG,HOST_CCREG);
5683       }
5684       else{
5685         cc=get_reg(i_regmap,CCREG);
5686         assert(cc==HOST_CCREG);
5687         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5688         void *jaddr=out;
5689         emit_jns(0);
5690         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5691       }
5692     }
5693   }
5694 }
5695
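// Assemble a COP1 condition branch (BC1T/BC1F): test the condition bit in
// FSREG and branch on the result.  If cop1 hasn't been verified usable in
// this block yet, a coprocessor-unusable check (FP_STUB) is emitted first.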
5696 void fjump_assemble(int i,struct regstat *i_regs)
5697 {
5698   signed char *i_regmap=i_regs->regmap;
5699   int cc;
5700   int match;
5701   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5702   assem_debug("fmatch=%d\n",match);
5703   int fs,cs;
5704   void *eaddr;
5705   int invert=0;
5706   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5707   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5708   if(!match) invert=1;
5709   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5710   if(i>(ba[i]-start)>>2) invert=1;
5711   #endif
5712
5713   if(ooo[i]) {
5714     fs=get_reg(branch_regs[i].regmap,FSREG);
5715     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5716   }
5717   else {
5718     fs=get_reg(i_regmap,FSREG);
5719   }
5720
5721   // Check cop1 unusable
5722   if(!cop1_usable) {
5723     cs=get_reg(i_regmap,CSREG);
5724     assert(cs>=0);
5725     emit_testimm(cs,0x20000000);
5726     eaddr=out;
5727     emit_jeq(0);
5728     add_stub_r(FP_STUB,eaddr,out,i,cs,i_regs,0,0);
5729     cop1_usable=1;
5730   }
5731
5732   if(ooo[i]) {
5733     // Out of order execution (delay slot first)
5734     //printf("OOOE\n");
5735     ds_assemble(i+1,i_regs);
5736     int adj;
5737     uint64_t bc_unneeded=branch_regs[i].u;
5738     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5739     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5740     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5741     bc_unneeded|=1;
5742     bc_unneeded_upper|=1;
5743     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5744                   bc_unneeded,bc_unneeded_upper);
5745     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5746     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5747     cc=get_reg(branch_regs[i].regmap,CCREG);
5748     assert(cc==HOST_CCREG);
5749     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5750     assem_debug("cycle count (adj)\n");
5751     if(1) {
5752       void *nottaken = NULL;
5753       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5754       if(1) {
5755         assert(fs>=0);
5756         emit_testimm(fs,0x800000);
5757         if(source[i]&0x10000) // BC1T
5758         {
5759           if(invert){
5760             nottaken=out;
5761             emit_jeq(1);
5762           }else{
5763             add_to_linker((int)out,ba[i],internal);
5764             emit_jne(0);
5765           }
5766         }
5767         else // BC1F
5768         {
5769           if(invert){
5770             nottaken=out;
5771             emit_jne(1);
5772           }else{
5773             add_to_linker((int)out,ba[i],internal);
5774             emit_jeq(0);
5775           }
5776         }
5777       } // if(!only32)
5778
5779       if(invert) {
5780         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5781         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5782         else if(match) emit_addnop(13);
5783         #endif
5784         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5785         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5786         if(internal)
5787           assem_debug("branch: internal\n");
5788         else
5789           assem_debug("branch: external\n");
5790         if(internal&&is_ds[(ba[i]-start)>>2]) {
5791           ds_assemble_entry(i);
5792         }
5793         else {
5794           add_to_linker((int)out,ba[i],internal);
5795           emit_jmp(0);
5796         }
5797         set_jump_target(nottaken, out);
5798       }
5799
5800       if(adj) {
5801         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5802       }
5803     } // (!unconditional)
5804   } // if(ooo)
5805   else
5806   {
5807     // In-order execution (branch first)
5808     //printf("IOE\n");
5809     void *nottaken = NULL;
5810     if(1) {
5811       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5812       if(1) {
5813         assert(fs>=0);
5814         emit_testimm(fs,0x800000);
5815         if(source[i]&0x10000) // BC1T
5816         {
5817           nottaken=out;
5818           emit_jeq(1);
5819         }
5820         else // BC1F
5821         {
5822           nottaken=out;
5823           emit_jne(1);
5824         }
5825       }
5826     } // if(!unconditional)
5827     int adj;
5828     uint64_t ds_unneeded=branch_regs[i].u;
5829     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5830     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5831     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5832     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5833     ds_unneeded|=1;
5834     ds_unneeded_upper|=1;
5835     // branch taken
5836     //assem_debug("1:\n");
5837     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5838                   ds_unneeded,ds_unneeded_upper);
5839     // load regs
5840     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5841     address_generation(i+1,&branch_regs[i],0);
5842     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5843     ds_assemble(i+1,&branch_regs[i]);
5844     cc=get_reg(branch_regs[i].regmap,CCREG);
5845     if(cc==-1) {
5846       emit_loadreg(CCREG,cc=HOST_CCREG);
5847       // CHECK: Is the following instruction (fall thru) allocated ok?
5848     }
5849     assert(cc==HOST_CCREG);
5850     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5851     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5852     assem_debug("cycle count (adj)\n");
5853     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5854     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5855     if(internal)
5856       assem_debug("branch: internal\n");
5857     else
5858       assem_debug("branch: external\n");
5859     if(internal&&is_ds[(ba[i]-start)>>2]) {
5860       ds_assemble_entry(i);
5861     }
5862     else {
5863       add_to_linker((int)out,ba[i],internal);
5864       emit_jmp(0);
5865     }
5866
5867     // branch not taken
5868     if(1) { // <- FIXME (don't need this)
5869       set_jump_target(nottaken, out);
5870       assem_debug("1:\n");
5871       if(!likely[i]) {
5872         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5873                       ds_unneeded,ds_unneeded_upper);
5874         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5875         address_generation(i+1,&branch_regs[i],0);
5876         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5877         ds_assemble(i+1,&branch_regs[i]);
5878       }
5879       cc=get_reg(branch_regs[i].regmap,CCREG);
5880       if(cc==-1&&!likely[i]) {
5881         // Cycle count isn't in a register, temporarily load it then write it out
5882         emit_loadreg(CCREG,HOST_CCREG);
5883         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5884         void *jaddr=out;
5885         emit_jns(0);
5886         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5887         emit_storereg(CCREG,HOST_CCREG);
5888       }
5889       else{
5890         cc=get_reg(i_regmap,CCREG);
5891         assert(cc==HOST_CCREG);
5892         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5893         void *jaddr=out;
5894         emit_jns(0);
5895         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5896       }
5897     }
5898   }
5899 }
5900
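// Assemble a branch whose delay slot lies on the next page, i.e. in the
// next block.  The selected target address is left in HOST_BTREG, and the
// code then branches to the delay-slot entry of the following page
// (assembled by pagespan_ds below), which executes the delay slot and
// continues to the stashed target.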
5901 static void pagespan_assemble(int i,struct regstat *i_regs)
5902 {
5903   int s1l=get_reg(i_regs->regmap,rs1[i]);
5904   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5905   int s2l=get_reg(i_regs->regmap,rs2[i]);
5906   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5907   void *taken = NULL;
5908   void *nottaken = NULL;
5909   int unconditional=0;
5910   if(rs1[i]==0)
5911   {
5912     s1l=s2l;s1h=s2h;
5913     s2l=s2h=-1;
5914   }
5915   else if(rs2[i]==0)
5916   {
5917     s2l=s2h=-1;
5918   }
5919   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5920     s1h=s2h=-1;
5921   }
5922   int hr=0;
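  // Pick host registers to build the target address in: addr receives the
  // selected target, alt the not-taken address, and BLEZ/BGTZ need a third
  // scratch register (ntaddr).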
5923   int addr=-1,alt=-1,ntaddr=-1;
5924   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5925   else {
5926     while(hr<HOST_REGS)
5927     {
5928       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5929          (i_regs->regmap[hr]&63)!=rs1[i] &&
5930          (i_regs->regmap[hr]&63)!=rs2[i] )
5931       {
5932         addr=hr++;break;
5933       }
5934       hr++;
5935     }
5936   }
5937   while(hr<HOST_REGS)
5938   {
5939     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5940        (i_regs->regmap[hr]&63)!=rs1[i] &&
5941        (i_regs->regmap[hr]&63)!=rs2[i] )
5942     {
5943       alt=hr++;break;
5944     }
5945     hr++;
5946   }
5947   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5948   {
5949     while(hr<HOST_REGS)
5950     {
5951       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5952          (i_regs->regmap[hr]&63)!=rs1[i] &&
5953          (i_regs->regmap[hr]&63)!=rs2[i] )
5954       {
5955         ntaddr=hr;break;
5956       }
5957       hr++;
5958     }
5959   }
5960   assert(hr<HOST_REGS);
5961   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5962     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5963   }
5964   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5965   if(opcode[i]==2) // J
5966   {
5967     unconditional=1;
5968   }
5969   if(opcode[i]==3) // JAL
5970   {
5971     // TODO: mini_ht
5972     int rt=get_reg(i_regs->regmap,31);
5973     emit_movimm(start+i*4+8,rt);
5974     unconditional=1;
5975   }
5976   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5977   {
5978     emit_mov(s1l,addr);
5979     if(opcode2[i]==9) // JALR
5980     {
5981       int rt=get_reg(i_regs->regmap,rt1[i]);
5982       emit_movimm(start+i*4+8,rt);
5983     }
5984   }
5985   if((opcode[i]&0x3f)==4) // BEQ
5986   {
5987     if(rs1[i]==rs2[i])
5988     {
5989       unconditional=1;
5990     }
5991     else
5992     #ifdef HAVE_CMOV_IMM
5993     if(s1h<0) {
5994       if(s2l>=0) emit_cmp(s1l,s2l);
5995       else emit_test(s1l,s1l);
5996       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5997     }
5998     else
5999     #endif
6000     {
6001       assert(s1l>=0);
6002       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6003       if(s1h>=0) {
6004         if(s2h>=0) emit_cmp(s1h,s2h);
6005         else emit_test(s1h,s1h);
6006         emit_cmovne_reg(alt,addr);
6007       }
6008       if(s2l>=0) emit_cmp(s1l,s2l);
6009       else emit_test(s1l,s1l);
6010       emit_cmovne_reg(alt,addr);
6011     }
6012   }
6013   if((opcode[i]&0x3f)==5) // BNE
6014   {
6015     #ifdef HAVE_CMOV_IMM
6016     if(s1h<0) {
6017       if(s2l>=0) emit_cmp(s1l,s2l);
6018       else emit_test(s1l,s1l);
6019       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6020     }
6021     else
6022     #endif
6023     {
6024       assert(s1l>=0);
6025       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6026       if(s1h>=0) {
6027         if(s2h>=0) emit_cmp(s1h,s2h);
6028         else emit_test(s1h,s1h);
6029         emit_cmovne_reg(alt,addr);
6030       }
6031       if(s2l>=0) emit_cmp(s1l,s2l);
6032       else emit_test(s1l,s1l);
6033       emit_cmovne_reg(alt,addr);
6034     }
6035   }
6036   if((opcode[i]&0x3f)==0x14) // BEQL
6037   {
6038     if(s1h>=0) {
6039       if(s2h>=0) emit_cmp(s1h,s2h);
6040       else emit_test(s1h,s1h);
6041       nottaken=out;
6042       emit_jne(0);
6043     }
6044     if(s2l>=0) emit_cmp(s1l,s2l);
6045     else emit_test(s1l,s1l);
6046     if(nottaken) set_jump_target(nottaken, out);
6047     nottaken=out;
6048     emit_jne(0);
6049   }
6050   if((opcode[i]&0x3f)==0x15) // BNEL
6051   {
6052     if(s1h>=0) {
6053       if(s2h>=0) emit_cmp(s1h,s2h);
6054       else emit_test(s1h,s1h);
6055       taken=out;
6056       emit_jne(0);
6057     }
6058     if(s2l>=0) emit_cmp(s1l,s2l);
6059     else emit_test(s1l,s1l);
6060     nottaken=out;
6061     emit_jeq(0);
6062     if(taken) set_jump_target(taken, out);
6063   }
6064   if((opcode[i]&0x3f)==6) // BLEZ
6065   {
6066     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6067     emit_cmpimm(s1l,1);
6068     if(s1h>=0) emit_mov(addr,ntaddr);
6069     emit_cmovl_reg(alt,addr);
6070     if(s1h>=0) {
6071       emit_test(s1h,s1h);
6072       emit_cmovne_reg(ntaddr,addr);
6073       emit_cmovs_reg(alt,addr);
6074     }
6075   }
6076   if((opcode[i]&0x3f)==7) // BGTZ
6077   {
6078     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6079     emit_cmpimm(s1l,1);
6080     if(s1h>=0) emit_mov(addr,alt);
6081     emit_cmovl_reg(ntaddr,addr);
6082     if(s1h>=0) {
6083       emit_test(s1h,s1h);
6084       emit_cmovne_reg(alt,addr);
6085       emit_cmovs_reg(ntaddr,addr);
6086     }
6087   }
6088   if((opcode[i]&0x3f)==0x16) // BLEZL
6089   {
6090     assert((opcode[i]&0x3f)!=0x16);
6091   }
6092   if((opcode[i]&0x3f)==0x17) // BGTZL
6093   {
6094     assert((opcode[i]&0x3f)!=0x17);
6095   }
6096   assert(opcode[i]!=1); // BLTZ/BGEZ
6097
6098   //FIXME: Check CSREG
6099   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6100     if((source[i]&0x30000)==0) // BC1F
6101     {
6102       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6103       emit_testimm(s1l,0x800000);
6104       emit_cmovne_reg(alt,addr);
6105     }
6106     if((source[i]&0x30000)==0x10000) // BC1T
6107     {
6108       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6109       emit_testimm(s1l,0x800000);
6110       emit_cmovne_reg(alt,addr);
6111     }
6112     if((source[i]&0x30000)==0x20000) // BC1FL
6113     {
6114       emit_testimm(s1l,0x800000);
6115       nottaken=out;
6116       emit_jne(0);
6117     }
6118     if((source[i]&0x30000)==0x30000) // BC1TL
6119     {
6120       emit_testimm(s1l,0x800000);
6121       nottaken=out;
6122       emit_jeq(0);
6123     }
6124   }
6125
6126   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6127   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6128   if(likely[i]||unconditional)
6129   {
6130     emit_movimm(ba[i],HOST_BTREG);
6131   }
6132   else if(addr!=HOST_BTREG)
6133   {
6134     emit_mov(addr,HOST_BTREG);
6135   }
6136   void *branch_addr=out;
6137   emit_jmp(0);
6138   int target_addr=start+i*4+5;
6139   void *stub=out;
6140   void *compiled_target_addr=check_addr(target_addr);
6141   emit_extjump_ds((int)branch_addr,target_addr);
6142   if(compiled_target_addr) {
6143     set_jump_target(branch_addr, compiled_target_addr);
6144     add_link(target_addr,stub);
6145   }
6146   else set_jump_target(branch_addr, stub);
6147   if(likely[i]) {
6148     // Not-taken path
6149     set_jump_target(nottaken, out);
6150     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6151     void *branch_addr=out;
6152     emit_jmp(0);
6153     int target_addr=start+i*4+8;
6154     void *stub=out;
6155     void *compiled_target_addr=check_addr(target_addr);
6156     emit_extjump_ds((int)branch_addr,target_addr);
6157     if(compiled_target_addr) {
6158       set_jump_target(branch_addr, compiled_target_addr);
6159       add_link(target_addr,stub);
6160     }
6161     else set_jump_target(branch_addr, stub);
6162   }
6163 }
6164
6165 // Assemble the delay slot for the above
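// This is the entry point used for blocks reached through pagespan_assemble:
// it executes instruction 0 (the delay slot that spilled onto this page),
// then jumps to the branch target stashed in BTREG/branch_target, or falls
// through into the block when that target is simply start+4.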
6166 static void pagespan_ds()
6167 {
6168   assem_debug("initial delay slot:\n");
6169   u_int vaddr=start+1;
6170   u_int page=get_page(vaddr);
6171   u_int vpage=get_vpage(vaddr);
6172   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6173   do_dirty_stub_ds();
6174   ll_add(jump_in+page,vaddr,(void *)out);
6175   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6176   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6177     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6178   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6179     emit_writeword(HOST_BTREG,(int)&branch_target);
6180   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6181   address_generation(0,&regs[0],regs[0].regmap_entry);
6182   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6183     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6184   cop1_usable=0;
6185   is_delayslot=0;
6186   switch(itype[0]) {
6187     case ALU:
6188       alu_assemble(0,&regs[0]);break;
6189     case IMM16:
6190       imm16_assemble(0,&regs[0]);break;
6191     case SHIFT:
6192       shift_assemble(0,&regs[0]);break;
6193     case SHIFTIMM:
6194       shiftimm_assemble(0,&regs[0]);break;
6195     case LOAD:
6196       load_assemble(0,&regs[0]);break;
6197     case LOADLR:
6198       loadlr_assemble(0,&regs[0]);break;
6199     case STORE:
6200       store_assemble(0,&regs[0]);break;
6201     case STORELR:
6202       storelr_assemble(0,&regs[0]);break;
6203     case COP0:
6204       cop0_assemble(0,&regs[0]);break;
6205     case COP1:
6206       cop1_assemble(0,&regs[0]);break;
6207     case C1LS:
6208       c1ls_assemble(0,&regs[0]);break;
6209     case COP2:
6210       cop2_assemble(0,&regs[0]);break;
6211     case C2LS:
6212       c2ls_assemble(0,&regs[0]);break;
6213     case C2OP:
6214       c2op_assemble(0,&regs[0]);break;
6215     case FCONV:
6216       fconv_assemble(0,&regs[0]);break;
6217     case FLOAT:
6218       float_assemble(0,&regs[0]);break;
6219     case FCOMP:
6220       fcomp_assemble(0,&regs[0]);break;
6221     case MULTDIV:
6222       multdiv_assemble(0,&regs[0]);break;
6223     case MOV:
6224       mov_assemble(0,&regs[0]);break;
6225     case SYSCALL:
6226     case HLECALL:
6227     case INTCALL:
6228     case SPAN:
6229     case UJUMP:
6230     case RJUMP:
6231     case CJUMP:
6232     case SJUMP:
6233     case FJUMP:
6234       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6235   }
6236   int btaddr=get_reg(regs[0].regmap,BTREG);
6237   if(btaddr<0) {
6238     btaddr=get_reg(regs[0].regmap,-1);
6239     emit_readword((int)&branch_target,btaddr);
6240   }
6241   assert(btaddr!=HOST_CCREG);
6242   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6243 #ifdef HOST_IMM8
6244   emit_movimm(start+4,HOST_TEMPREG);
6245   emit_cmp(btaddr,HOST_TEMPREG);
6246 #else
6247   emit_cmpimm(btaddr,start+4);
6248 #endif
6249   void *branch = out;
6250   emit_jeq(0);
6251   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6252   emit_jmp(jump_vaddr_reg[btaddr]);
6253   set_jump_target(branch, out);
6254   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6255   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6256 }
6257
6258 // Basic liveness analysis for MIPS registers
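// unneeded_reg[i] is a bitmask over MIPS registers: bit r set means the
// value register r holds at instruction i is never read again before being
// overwritten, so it does not have to be written back.  Bit 0 (r0) is
// always set.  The *_upper arrays track the upper 32 bits separately, and
// r limits the recursion depth when following backward branches.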
6259 void unneeded_registers(int istart,int iend,int r)
6260 {
6261   int i;
6262   uint64_t u,uu,gte_u,b,bu,gte_bu;
6263   uint64_t temp_u,temp_uu,temp_gte_u=0;
6264   uint64_t tdep;
6265   uint64_t gte_u_unknown=0;
6266   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6267     gte_u_unknown=~0ll;
6268   if(iend==slen-1) {
6269     u=1;uu=1;
6270     gte_u=gte_u_unknown;
6271   }else{
6272     u=unneeded_reg[iend+1];
6273     uu=unneeded_reg_upper[iend+1];
6274     u=1;uu=1;
6275     gte_u=gte_unneeded[iend+1];
6276   }
6277
6278   for (i=iend;i>=istart;i--)
6279   {
6280     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6281     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6282     {
6283       // If subroutine call, flag return address as a possible branch target
6284       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6285
6286       if(ba[i]<start || ba[i]>=(start+slen*4))
6287       {
6288         // Branch out of this block, flush all regs
6289         u=1;
6290         uu=1;
6291         gte_u=gte_u_unknown;
6292         /* Hexagon hack
6293         if(itype[i]==UJUMP&&rt1[i]==31)
6294         {
6295           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6296         }
6297         if(itype[i]==RJUMP&&rs1[i]==31)
6298         {
6299           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6300         }
6301         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6302           if(itype[i]==UJUMP&&rt1[i]==31)
6303           {
6304             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6305             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6306           }
6307           if(itype[i]==RJUMP&&rs1[i]==31)
6308           {
6309             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6310             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6311           }
6312         }*/
6313         branch_unneeded_reg[i]=u;
6314         branch_unneeded_reg_upper[i]=uu;
6315         // Merge in delay slot
6316         tdep=(~uu>>rt1[i+1])&1;
6317         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6318         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6319         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6320         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6321         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6322         u|=1;uu|=1;
6323         gte_u|=gte_rt[i+1];
6324         gte_u&=~gte_rs[i+1];
6325         // If branch is "likely" (and conditional)
6326         // then we skip the delay slot on the fall-thru path
6327         if(likely[i]) {
6328           if(i<slen-1) {
6329             u&=unneeded_reg[i+2];
6330             uu&=unneeded_reg_upper[i+2];
6331             gte_u&=gte_unneeded[i+2];
6332           }
6333           else
6334           {
6335             u=1;
6336             uu=1;
6337             gte_u=gte_u_unknown;
6338           }
6339         }
6340       }
6341       else
6342       {
6343         // Internal branch, flag target
6344         bt[(ba[i]-start)>>2]=1;
6345         if(ba[i]<=start+i*4) {
6346           // Backward branch
6347           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6348           {
6349             // Unconditional branch
6350             temp_u=1;temp_uu=1;
6351             temp_gte_u=0;
6352           } else {
6353             // Conditional branch (not taken case)
6354             temp_u=unneeded_reg[i+2];
6355             temp_uu=unneeded_reg_upper[i+2];
6356             temp_gte_u&=gte_unneeded[i+2];
6357           }
6358           // Merge in delay slot
6359           tdep=(~temp_uu>>rt1[i+1])&1;
6360           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6361           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6362           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6363           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6364           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6365           temp_u|=1;temp_uu|=1;
6366           temp_gte_u|=gte_rt[i+1];
6367           temp_gte_u&=~gte_rs[i+1];
6368           // If branch is "likely" (and conditional)
6369           // then we skip the delay slot on the fall-thru path
6370           if(likely[i]) {
6371             if(i<slen-1) {
6372               temp_u&=unneeded_reg[i+2];
6373               temp_uu&=unneeded_reg_upper[i+2];
6374               temp_gte_u&=gte_unneeded[i+2];
6375             }
6376             else
6377             {
6378               temp_u=1;
6379               temp_uu=1;
6380               temp_gte_u=gte_u_unknown;
6381             }
6382           }
6383           tdep=(~temp_uu>>rt1[i])&1;
6384           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6385           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6386           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6387           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6388           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6389           temp_u|=1;temp_uu|=1;
6390           temp_gte_u|=gte_rt[i];
6391           temp_gte_u&=~gte_rs[i];
6392           unneeded_reg[i]=temp_u;
6393           unneeded_reg_upper[i]=temp_uu;
6394           gte_unneeded[i]=temp_gte_u;
6395           // Only go three levels deep.  This recursion can take an
6396           // excessive amount of time if there are a lot of nested loops.
6397           if(r<2) {
6398             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6399           }else{
6400             unneeded_reg[(ba[i]-start)>>2]=1;
6401             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6402             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6403           }
6404         } /*else*/ if(1) {
6405           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6406           {
6407             // Unconditional branch
6408             u=unneeded_reg[(ba[i]-start)>>2];
6409             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6410             gte_u=gte_unneeded[(ba[i]-start)>>2];
6411             branch_unneeded_reg[i]=u;
6412             branch_unneeded_reg_upper[i]=uu;
6413         //u=1;
6414         //uu=1;
6415         //branch_unneeded_reg[i]=u;
6416         //branch_unneeded_reg_upper[i]=uu;
6417             // Merge in delay slot
6418             tdep=(~uu>>rt1[i+1])&1;
6419             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6420             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6421             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6422             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6423             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6424             u|=1;uu|=1;
6425             gte_u|=gte_rt[i+1];
6426             gte_u&=~gte_rs[i+1];
6427           } else {
6428             // Conditional branch
6429             b=unneeded_reg[(ba[i]-start)>>2];
6430             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6431             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6432             branch_unneeded_reg[i]=b;
6433             branch_unneeded_reg_upper[i]=bu;
6434         //b=1;
6435         //bu=1;
6436         //branch_unneeded_reg[i]=b;
6437         //branch_unneeded_reg_upper[i]=bu;
6438             // Branch delay slot
6439             tdep=(~uu>>rt1[i+1])&1;
6440             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6441             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6442             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6443             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6444             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6445             b|=1;bu|=1;
6446             gte_bu|=gte_rt[i+1];
6447             gte_bu&=~gte_rs[i+1];
6448             // If branch is "likely" then we skip the
6449             // delay slot on the fall-thru path
6450             if(likely[i]) {
6451               u=b;
6452               uu=bu;
6453               gte_u=gte_bu;
6454               if(i<slen-1) {
6455                 u&=unneeded_reg[i+2];
6456                 uu&=unneeded_reg_upper[i+2];
6457                 gte_u&=gte_unneeded[i+2];
6458         //u=1;
6459         //uu=1;
6460               }
6461             } else {
6462               u&=b;
6463               uu&=bu;
6464               gte_u&=gte_bu;
6465         //u=1;
6466         //uu=1;
6467             }
6468             if(i<slen-1) {
6469               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6470               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6471         //branch_unneeded_reg[i]=1;
6472         //branch_unneeded_reg_upper[i]=1;
6473             } else {
6474               branch_unneeded_reg[i]=1;
6475               branch_unneeded_reg_upper[i]=1;
6476             }
6477           }
6478         }
6479       }
6480     }
6481     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6482     {
6483       // SYSCALL instruction (software interrupt)
6484       u=1;
6485       uu=1;
6486     }
6487     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6488     {
6489       // ERET instruction (return from interrupt)
6490       u=1;
6491       uu=1;
6492     }
6493     //u=uu=1; // DEBUG
6494     tdep=(~uu>>rt1[i])&1;
6495     // Written registers are unneeded
6496     u|=1LL<<rt1[i];
6497     u|=1LL<<rt2[i];
6498     uu|=1LL<<rt1[i];
6499     uu|=1LL<<rt2[i];
6500     gte_u|=gte_rt[i];
6501     // Accessed registers are needed
6502     u&=~(1LL<<rs1[i]);
6503     u&=~(1LL<<rs2[i]);
6504     uu&=~(1LL<<us1[i]);
6505     uu&=~(1LL<<us2[i]);
6506     gte_u&=~gte_rs[i];
6507     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6508       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6509     // Source-target dependencies
6510     uu&=~(tdep<<dep1[i]);
6511     uu&=~(tdep<<dep2[i]);
6512     // R0 is always unneeded
6513     u|=1;uu|=1;
6514     // Save it
6515     unneeded_reg[i]=u;
6516     unneeded_reg_upper[i]=uu;
6517     gte_unneeded[i]=gte_u;
6518     /*
6519     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6520     printf("U:");
6521     int r;
6522     for(r=1;r<=CCREG;r++) {
6523       if((unneeded_reg[i]>>r)&1) {
6524         if(r==HIREG) printf(" HI");
6525         else if(r==LOREG) printf(" LO");
6526         else printf(" r%d",r);
6527       }
6528     }
6529     printf(" UU:");
6530     for(r=1;r<=CCREG;r++) {
6531       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6532         if(r==HIREG) printf(" HI");
6533         else if(r==LOREG) printf(" LO");
6534         else printf(" r%d",r);
6535       }
6536     }
6537     printf("\n");*/
6538   }
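  // PSX (R3000A) registers are 32-bit only, so mark all upper halves as
  // unneeded regardless of what the pass above computed.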
6539   for (i=iend;i>=istart;i--)
6540   {
6541     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6542   }
6543 }
6544
6545 // Write back dirty registers as soon as we will no longer modify them,
6546 // so that we don't end up with lots of writes at the branches.
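// will_dirty/wont_dirty are bitmasks over host registers, propagated
// backwards from the end of the block; the wr pass then trims the dirty
// bits so each register can be flushed where it is last modified instead
// of having every writeback pile up at the branches.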
6547 void clean_registers(int istart,int iend,int wr)
6548 {
6549   int i;
6550   int r;
6551   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6552   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6553   if(iend==slen-1) {
6554     will_dirty_i=will_dirty_next=0;
6555     wont_dirty_i=wont_dirty_next=0;
6556   }else{
6557     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6558     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6559   }
6560   for (i=iend;i>=istart;i--)
6561   {
6562     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6563     {
6564       if(ba[i]<start || ba[i]>=(start+slen*4))
6565       {
6566         // Branch out of this block, flush all regs
6567         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6568         {
6569           // Unconditional branch
6570           will_dirty_i=0;
6571           wont_dirty_i=0;
6572           // Merge in delay slot (will dirty)
6573           for(r=0;r<HOST_REGS;r++) {
6574             if(r!=EXCLUDE_REG) {
6575               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6576               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6577               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6578               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6579               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6580               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6581               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6582               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6583               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6584               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6585               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6586               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6587               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6588               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6589             }
6590           }
6591         }
6592         else
6593         {
6594           // Conditional branch
6595           will_dirty_i=0;
6596           wont_dirty_i=wont_dirty_next;
6597           // Merge in delay slot (will dirty)
6598           for(r=0;r<HOST_REGS;r++) {
6599             if(r!=EXCLUDE_REG) {
6600               if(!likely[i]) {
6601                 // Might not dirty if likely branch is not taken
6602                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6603                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6604                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6605                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6606                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6607                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6608                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6609                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6610                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6611                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6612                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6613                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6614                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6615                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6616               }
6617             }
6618           }
6619         }
6620         // Merge in delay slot (wont dirty)
6621         for(r=0;r<HOST_REGS;r++) {
6622           if(r!=EXCLUDE_REG) {
6623             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6624             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6625             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6626             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6627             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6628             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6629             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6630             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6631             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6632             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6633           }
6634         }
6635         if(wr) {
6636           #ifndef DESTRUCTIVE_WRITEBACK
6637           branch_regs[i].dirty&=wont_dirty_i;
6638           #endif
6639           branch_regs[i].dirty|=will_dirty_i;
6640         }
6641       }
6642       else
6643       {
6644         // Internal branch
6645         if(ba[i]<=start+i*4) {
6646           // Backward branch
6647           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6648           {
6649             // Unconditional branch
6650             temp_will_dirty=0;
6651             temp_wont_dirty=0;
6652             // Merge in delay slot (will dirty)
6653             for(r=0;r<HOST_REGS;r++) {
6654               if(r!=EXCLUDE_REG) {
6655                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6656                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6657                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6658                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6659                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6660                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6661                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6662                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6663                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6664                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6665                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6666                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6667                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6668                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6669               }
6670             }
6671           } else {
6672             // Conditional branch (not taken case)
6673             temp_will_dirty=will_dirty_next;
6674             temp_wont_dirty=wont_dirty_next;
6675             // Merge in delay slot (will dirty)
6676             for(r=0;r<HOST_REGS;r++) {
6677               if(r!=EXCLUDE_REG) {
6678                 if(!likely[i]) {
6679                   // Will not dirty if likely branch is not taken
6680                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6681                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6682                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6683                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6684                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6685                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6686                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6687                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6688                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6689                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6690                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6691                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6692                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6693                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6694                 }
6695               }
6696             }
6697           }
6698           // Merge in delay slot (won't dirty)
6699           for(r=0;r<HOST_REGS;r++) {
6700             if(r!=EXCLUDE_REG) {
6701               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6702               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6703               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6704               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6705               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6706               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6707               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6708               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6709               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6710               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6711             }
6712           }
6713           // Deal with changed mappings
6714           if(i<iend) {
6715             for(r=0;r<HOST_REGS;r++) {
6716               if(r!=EXCLUDE_REG) {
6717                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6718                   temp_will_dirty&=~(1<<r);
6719                   temp_wont_dirty&=~(1<<r);
6720                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6721                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6722                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6723                   } else {
6724                     temp_will_dirty|=1<<r;
6725                     temp_wont_dirty|=1<<r;
6726                   }
6727                 }
6728               }
6729             }
6730           }
6731           if(wr) {
6732             will_dirty[i]=temp_will_dirty;
6733             wont_dirty[i]=temp_wont_dirty;
6734             clean_registers((ba[i]-start)>>2,i-1,0);
6735           }else{
6736             // Limit recursion.  It can take an excessive amount
6737             // of time if there are a lot of nested loops.
6738             will_dirty[(ba[i]-start)>>2]=0;
6739             wont_dirty[(ba[i]-start)>>2]=-1;
6740           }
6741         }
6742         /*else*/ if(1)
6743         {
6744           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6745           {
6746             // Unconditional branch
6747             will_dirty_i=0;
6748             wont_dirty_i=0;
6749           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6750             for(r=0;r<HOST_REGS;r++) {
6751               if(r!=EXCLUDE_REG) {
6752                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6753                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6754                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6755                 }
6756                 if(branch_regs[i].regmap[r]>=0) {
6757                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6758                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6759                 }
6760               }
6761             }
6762           //}
6763             // Merge in delay slot
6764             for(r=0;r<HOST_REGS;r++) {
6765               if(r!=EXCLUDE_REG) {
6766                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6767                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6768                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6769                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6770                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6771                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6772                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6773                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6774                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6775                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6776                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6777                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6778                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6779                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6780               }
6781             }
6782           } else {
6783             // Conditional branch
6784             will_dirty_i=will_dirty_next;
6785             wont_dirty_i=wont_dirty_next;
6786           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6787             for(r=0;r<HOST_REGS;r++) {
6788               if(r!=EXCLUDE_REG) {
6789                 signed char target_reg=branch_regs[i].regmap[r];
6790                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6791                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6792                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6793                 }
6794                 else if(target_reg>=0) {
6795                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6796                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6797                 }
6798                 // Treat delay slot as part of branch too
6799                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6800                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6801                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6802                 }
6803                 else
6804                 {
6805                   will_dirty[i+1]&=~(1<<r);
6806                 }*/
6807               }
6808             }
6809           //}
6810             // Merge in delay slot
6811             for(r=0;r<HOST_REGS;r++) {
6812               if(r!=EXCLUDE_REG) {
6813                 if(!likely[i]) {
6814                   // Might not dirty if likely branch is not taken
6815                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6816                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6817                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6818                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6819                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6820                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6821                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6822                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6823                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6824                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6825                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6826                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6827                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6828                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6829                 }
6830               }
6831             }
6832           }
6833           // Merge in delay slot (won't dirty)
6834           for(r=0;r<HOST_REGS;r++) {
6835             if(r!=EXCLUDE_REG) {
6836               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6837               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6838               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6839               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6840               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6841               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6842               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6843               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6844               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6845               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6846             }
6847           }
6848           if(wr) {
6849             #ifndef DESTRUCTIVE_WRITEBACK
6850             branch_regs[i].dirty&=wont_dirty_i;
6851             #endif
6852             branch_regs[i].dirty|=will_dirty_i;
6853           }
6854         }
6855       }
6856     }
6857     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6858     {
6859       // SYSCALL instruction (software interrupt)
6860       will_dirty_i=0;
6861       wont_dirty_i=0;
6862     }
6863     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6864     {
6865       // ERET instruction (return from interrupt)
6866       will_dirty_i=0;
6867       wont_dirty_i=0;
6868     }
6869     will_dirty_next=will_dirty_i;
6870     wont_dirty_next=wont_dirty_i;
6871     for(r=0;r<HOST_REGS;r++) {
6872       if(r!=EXCLUDE_REG) {
6873         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6874         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6875         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6876         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6877         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6878         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6879         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6880         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6881         if(i>istart) {
6882           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6883           {
6884             // Don't store a register immediately after writing it,
6885             // as that may prevent dual-issue.
6886             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6887             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6888           }
6889         }
6890       }
6891     }
6892     // Save it
6893     will_dirty[i]=will_dirty_i;
6894     wont_dirty[i]=wont_dirty_i;
6895     // Mark registers that won't be dirtied as not dirty
6896     if(wr) {
6897       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6898       for(r=0;r<HOST_REGS;r++) {
6899         if((will_dirty_i>>r)&1) {
6900           printf(" r%d",r);
6901         }
6902       }
6903       printf("\n");*/
6904
6905       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6906         regs[i].dirty|=will_dirty_i;
6907         #ifndef DESTRUCTIVE_WRITEBACK
6908         regs[i].dirty&=wont_dirty_i;
6909         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6910         {
6911           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6912             for(r=0;r<HOST_REGS;r++) {
6913               if(r!=EXCLUDE_REG) {
6914                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6915                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6916                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6917               }
6918             }
6919           }
6920         }
6921         else
6922         {
6923           if(i<iend) {
6924             for(r=0;r<HOST_REGS;r++) {
6925               if(r!=EXCLUDE_REG) {
6926                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6927                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6928                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6929               }
6930             }
6931           }
6932         }
6933         #endif
6934       //}
6935     }
6936     // Deal with changed mappings
6937     temp_will_dirty=will_dirty_i;
6938     temp_wont_dirty=wont_dirty_i;
6939     for(r=0;r<HOST_REGS;r++) {
6940       if(r!=EXCLUDE_REG) {
6941         int nr;
6942         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6943           if(wr) {
6944             #ifndef DESTRUCTIVE_WRITEBACK
6945             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6946             #endif
6947             regs[i].wasdirty|=will_dirty_i&(1<<r);
6948           }
6949         }
6950         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6951           // Register moved to a different register
6952           will_dirty_i&=~(1<<r);
6953           wont_dirty_i&=~(1<<r);
6954           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6955           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6956           if(wr) {
6957             #ifndef DESTRUCTIVE_WRITEBACK
6958             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6959             #endif
6960             regs[i].wasdirty|=will_dirty_i&(1<<r);
6961           }
6962         }
6963         else {
6964           will_dirty_i&=~(1<<r);
6965           wont_dirty_i&=~(1<<r);
6966           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6967             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6968             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6969           } else {
6970             wont_dirty_i|=1<<r;
6971             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6972           }
6973         }
6974       }
6975     }
6976   }
6977 }
6978
6979 #ifdef DISASM
6980   /* disassembly */
6981 void disassemble_inst(int i)
6982 {
6983     if (bt[i]) printf("*"); else printf(" ");
6984     switch(itype[i]) {
6985       case UJUMP:
6986         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6987       case CJUMP:
6988         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6989       case SJUMP:
6990         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6991       case FJUMP:
6992         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6993       case RJUMP:
6994         if (opcode[i]==0x9&&rt1[i]!=31)
6995           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6996         else
6997           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6998         break;
6999       case SPAN:
7000         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7001       case IMM16:
7002         if(opcode[i]==0xf) //LUI
7003           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7004         else
7005           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7006         break;
7007       case LOAD:
7008       case LOADLR:
7009         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7010         break;
7011       case STORE:
7012       case STORELR:
7013         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7014         break;
7015       case ALU:
7016       case SHIFT:
7017         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7018         break;
7019       case MULTDIV:
7020         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7021         break;
7022       case SHIFTIMM:
7023         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7024         break;
7025       case MOV:
7026         if((opcode2[i]&0x1d)==0x10)
7027           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7028         else if((opcode2[i]&0x1d)==0x11)
7029           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7030         else
7031           printf (" %x: %s\n",start+i*4,insn[i]);
7032         break;
7033       case COP0:
7034         if(opcode2[i]==0)
7035           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7036         else if(opcode2[i]==4)
7037           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7038         else printf (" %x: %s\n",start+i*4,insn[i]);
7039         break;
7040       case COP1:
7041         if(opcode2[i]<3)
7042           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7043         else if(opcode2[i]>3)
7044           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7045         else printf (" %x: %s\n",start+i*4,insn[i]);
7046         break;
7047       case COP2:
7048         if(opcode2[i]<3)
7049           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7050         else if(opcode2[i]>3)
7051           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7052         else printf (" %x: %s\n",start+i*4,insn[i]);
7053         break;
7054       case C1LS:
7055         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7056         break;
7057       case C2LS:
7058         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7059         break;
7060       case INTCALL:
7061         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7062         break;
7063       default:
7064         //printf (" %s %8x\n",insn[i],source[i]);
7065         printf (" %x: %s\n",start+i*4,insn[i]);
7066     }
7067 }
7068 #else
7069 static void disassemble_inst(int i) {}
7070 #endif // DISASM
7071
7072 #define DRC_TEST_VAL 0x74657374
7073
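// Quick self-test of the translation cache: emit a stub that loads
// DRC_TEST_VAL (the ASCII bytes "test") into the result register and jumps
// back through host register 14 (the link register when targeting ARM),
// then call it as a plain function.  Getting the value back confirms the
// emitted code is reachable and executable.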
7074 static int new_dynarec_test(void)
7075 {
7076   int (*testfunc)(void) = (void *)out;
7077   void *beginning;
7078   int ret;
7079
7080   beginning = start_block();
7081   emit_movimm(DRC_TEST_VAL,0); // test
7082   emit_jmpreg(14);
7083   literal_pool(0);
7084   end_block(beginning);
7085   SysPrintf("testing if we can run recompiled code..\n");
7086   ret = testfunc();
7087   if (ret == DRC_TEST_VAL)
7088     SysPrintf("test passed.\n");
7089   else
7090     SysPrintf("test failed: %08x\n", ret);
7091   out=(u_char *)BASE_ADDR;
7092   return ret == DRC_TEST_VAL;
7093 }
7094
7095 // clear the state completely, instead of just marking
7096 // things invalid like invalidate_all_pages() does
7097 void new_dynarec_clear_full()
7098 {
7099   int n;
7100   out=(u_char *)BASE_ADDR;
7101   memset(invalid_code,1,sizeof(invalid_code));
7102   memset(hash_table,0xff,sizeof(hash_table));
7103   memset(mini_ht,-1,sizeof(mini_ht));
7104   memset(restore_candidate,0,sizeof(restore_candidate));
7105   memset(shadow,0,sizeof(shadow));
7106   copy=shadow;
7107   expirep=16384; // Expiry pointer, +2 blocks
7108   pending_exception=0;
7109   literalcount=0;
7110   stop_after_jal=0;
7111   inv_code_start=inv_code_end=~0;
7112   // block lookup tables (jump_in/jump_out/jump_dirty)
7113   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7114   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7115   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7116 }
7117
7118 void new_dynarec_init()
7119 {
7120   SysPrintf("Init new dynarec\n");
7121
7122   // allocate/prepare a buffer for translation cache
7123   // see assem_arm.h for some explanation
7124 #if   defined(BASE_ADDR_FIXED)
7125   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7126             PROT_READ | PROT_WRITE | PROT_EXEC,
7127             MAP_PRIVATE | MAP_ANONYMOUS,
7128             -1, 0) != translation_cache) {
7129     SysPrintf("mmap() failed: %s\n", strerror(errno));
7130     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7131     abort();
7132   }
7133 #elif defined(BASE_ADDR_DYNAMIC)
7134   #ifdef VITA
7135   sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7136   if (sceBlock < 0)
7137     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7138   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7139   if (ret < 0)
7140     SysPrintf("sceKernelGetMemBlockBase failed\n");
7141   #else
7142   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7143             PROT_READ | PROT_WRITE | PROT_EXEC,
7144             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7145   if (translation_cache == MAP_FAILED) {
7146     SysPrintf("mmap() failed: %s\n", strerror(errno));
7147     abort();
7148   }
7149   #endif
7150 #else
7151   #ifndef NO_WRITE_EXEC
7152   // not all systems allow execute in data segment by default
7153   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7154     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7155   #endif
7156 #endif
7157   out=(u_char *)BASE_ADDR;
7158   cycle_multiplier=200;
7159   new_dynarec_clear_full();
7160 #ifdef HOST_IMM8
7161   // Copy this into local area so we don't have to put it in every literal pool
7162   invc_ptr=invalid_code;
7163 #endif
7164   arch_init();
7165   new_dynarec_test();
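  // If guest RAM could not be mapped at its canonical 0x80000000 base,
  // ram_offset holds the delta between the host rdram buffer and that
  // address so emitted memory accesses can be rebased accordingly.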
7166 #ifndef RAM_FIXED
7167   ram_offset=(u_int)rdram-0x80000000;
7168 #endif
7169   if (ram_offset!=0)
7170     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7171 }
7172
7173 void new_dynarec_cleanup()
7174 {
7175   int n;
7176 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7177   #ifdef VITA
7178   sceKernelFreeMemBlock(sceBlock);
7179   sceBlock = -1;
7180   #else
7181   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7182     SysPrintf("munmap() failed\n");
7183   #endif
7184 #endif
7185   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7186   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7187   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7188   #ifdef ROM_COPY
7189   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7190   #endif
7191 }
7192
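// Translate a guest code address to a host pointer and report in *limit the
// first address past the contiguous region it belongs to.  Handles the 2MB
// RAM mirrors (0x00000000/0x80000000/0xa0000000) and, when the HLE BIOS is
// not used, the 512KB BIOS ROM at 0xbfc00000; anything else yields NULL.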
7193 static u_int *get_source_start(u_int addr, u_int *limit)
7194 {
7195   if (addr < 0x00200000 ||
7196     (0xa0000000 <= addr && addr < 0xa0200000)) {
7197     // used for BIOS calls mostly?
7198     *limit = (addr&0xa0000000)|0x00200000;
7199     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7200   }
7201   else if (!Config.HLE && (
7202     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7203     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7204     // BIOS
7205     *limit = (addr & 0xfff00000) | 0x80000;
7206     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7207   }
7208   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7209     *limit = (addr & 0x80600000) + 0x00200000;
7210     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7211   }
7212   return NULL;
7213 }
7214
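// Heuristic for finding the end of a function: scan forward (at most 0x1000
// bytes) for a "jr $ra" (0x03e00008) and return the address just past its
// delay slot; otherwise return how far the scan got.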
7215 static u_int scan_for_ret(u_int addr)
7216 {
7217   u_int limit = 0;
7218   u_int *mem;
7219
7220   mem = get_source_start(addr, &limit);
7221   if (mem == NULL)
7222     return addr;
7223
7224   if (limit > addr + 0x1000)
7225     limit = addr + 0x1000;
7226   for (; addr < limit; addr += 4, mem++) {
7227     if (*mem == 0x03e00008) // jr $ra
7228       return addr + 8;
7229   }
7230   return addr;
7231 }
7232
7233 struct savestate_block {
7234   uint32_t addr;
7235   uint32_t regflags;
7236 };
7237
7238 static int addr_cmp(const void *p1_, const void *p2_)
7239 {
7240   const struct savestate_block *p1 = p1_, *p2 = p2_;
7241   return p1->addr - p2->addr;
7242 }
7243
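// Collect the entry points of currently compiled blocks (plus their register
// speculation flags) so a savestate can have them precompiled on load: per
// jump_in bucket the entries are sorted by address, entries falling inside an
// already-covered function (per scan_for_ret) are skipped, and the rest is
// packed into 'save'.  Returns the number of bytes written.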
7244 int new_dynarec_save_blocks(void *save, int size)
7245 {
7246   struct savestate_block *blocks = save;
7247   int maxcount = size / sizeof(blocks[0]);
7248   struct savestate_block tmp_blocks[1024];
7249   struct ll_entry *head;
7250   int p, s, d, o, bcnt;
7251   u_int addr;
7252
7253   o = 0;
7254   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
7255     bcnt = 0;
7256     for (head = jump_in[p]; head != NULL; head = head->next) {
7257       tmp_blocks[bcnt].addr = head->vaddr;
7258       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7259       bcnt++;
7260     }
7261     if (bcnt < 1)
7262       continue;
7263     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7264
7265     addr = tmp_blocks[0].addr;
7266     for (s = d = 0; s < bcnt; s++) {
7267       if (tmp_blocks[s].addr < addr)
7268         continue;
7269       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7270         tmp_blocks[d++] = tmp_blocks[s];
7271       addr = scan_for_ret(tmp_blocks[s].addr);
7272     }
7273
7274     if (o + d > maxcount)
7275       d = maxcount - o;
7276     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7277     o += d;
7278   }
7279
7280   return o * sizeof(blocks[0]);
7281 }
7282
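// Recompile the blocks recorded by new_dynarec_save_blocks().  regflags marks
// GPRs that pointed into the 0x1f80xxxx scratchpad/IO range when the block
// was first compiled (see state_rflags in new_recompile_block), so those
// registers are temporarily set the same way to reproduce the speculation.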
7283 void new_dynarec_load_blocks(const void *save, int size)
7284 {
7285   const struct savestate_block *blocks = save;
7286   int count = size / sizeof(blocks[0]);
7287   u_int regs_save[32];
7288   uint32_t f;
7289   int i, b;
7290
7291   get_addr(psxRegs.pc);
7292
7293   // change GPRs so that block speculation at least partially works
7294   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7295   for (i = 1; i < 32; i++)
7296     psxRegs.GPR.r[i] = 0x80000000;
7297
7298   for (b = 0; b < count; b++) {
7299     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7300       if (f & 1)
7301         psxRegs.GPR.r[i] = 0x1f800000;
7302     }
7303
7304     get_addr(blocks[b].addr);
7305
7306     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7307       if (f & 1)
7308         psxRegs.GPR.r[i] = 0x80000000;
7309     }
7310   }
7311
7312   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7313 }
7314
7315 int new_recompile_block(int addr)
7316 {
7317   u_int pagelimit = 0;
7318   u_int state_rflags = 0;
7319   int i;
7320
7321   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7322   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7323   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7324   //if(debug)
7325   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7326   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7327   /*if(Count>=312978186) {
7328     rlist();
7329   }*/
7330   //rlist();
7331
7332   // this is just for speculation: note which GPRs currently point into the 0x1f80xxxx scratchpad/IO range
7333   for (i = 1; i < 32; i++) {
7334     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7335       state_rflags |= 1 << i;
7336   }
7337
7338   start = (u_int)addr&~3;
7339   //assert(((u_int)addr&1)==0);
7340   new_dynarec_did_compile=1;
7341   if (Config.HLE && start == 0x80001000) // hlecall
7342   {
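    // With the HLE BIOS, 0x80001000 acts as the HLE call vector: rather than
    // recompiling anything, emit a stub that stores this PC into pcaddr and
    // exits through new_dyna_leave, leaving the actual call to the C side.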
7343     // XXX: is this enough? Maybe check hleSoftCall?
7344     void *beginning=start_block();
7345     u_int page=get_page(start);
7346
7347     invalid_code[start>>12]=0;
7348     emit_movimm(start,0);
7349     emit_writeword(0,(int)&pcaddr);
7350     emit_jmp(new_dyna_leave);
7351     literal_pool(0);
7352     end_block(beginning);
7353     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7354     return 0;
7355   }
7356
7357   source = get_source_start(start, &pagelimit);
7358   if (source == NULL) {
7359     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7360     exit(1);
7361   }
7362
7363   /* Pass 1: disassemble */
7364   /* Pass 2: register dependencies, branch targets */
7365   /* Pass 3: register allocation */
7366   /* Pass 4: branch dependencies */
7367   /* Pass 5: pre-alloc */
7368   /* Pass 6: optimize clean/dirty state */
7369   /* Pass 7: flag 32-bit registers */
7370   /* Pass 8: assembly */
7371   /* Pass 9: linker */
7372   /* Pass 10: garbage collection / free memory */
7373
7374   int j;
7375   int done=0;
7376   unsigned int type,op,op2;
7377
7378   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7379
7380   /* Pass 1 disassembly */
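  /* Field layout assumed below is standard MIPS-I encoding:
     op = bits 31-26, rs = 25-21, rt = 20-16, rd = 15-11,
     shamt = 10-6, funct = 5-0, imm = 15-0, jump target = 25-0. */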
7381
7382   for(i=0;!done;i++) {
7383     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7384     minimum_free_regs[i]=0;
7385     opcode[i]=op=source[i]>>26;
7386     switch(op)
7387     {
7388       case 0x00: strcpy(insn[i],"special"); type=NI;
7389         op2=source[i]&0x3f;
7390         switch(op2)
7391         {
7392           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7393           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7394           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7395           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7396           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7397           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7398           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7399           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7400           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7401           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7402           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7403           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7404           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7405           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7406           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7407           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7408           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7409           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7410           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7411           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7412           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7413           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7414           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7415           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7416           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7417           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7418           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7419           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7420           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7421           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7422           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7423           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7424           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7425           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7426           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7427 #if 0
7428           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7429           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7430           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7431           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7432           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7433           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7434           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7435           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7436           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7437           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7438           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7439           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7440           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7441           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7442           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7443           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7444           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7445 #endif
7446         }
7447         break;
7448       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7449         op2=(source[i]>>16)&0x1f;
7450         switch(op2)
7451         {
7452           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7453           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7454           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7455           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7456           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7457           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7458           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7459           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7460           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7461           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7462           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7463           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7464           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7465           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7466         }
7467         break;
7468       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7469       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7470       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7471       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7472       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7473       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7474       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7475       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7476       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7477       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7478       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7479       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7480       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7481       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7482       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7483         op2=(source[i]>>21)&0x1f;
7484         switch(op2)
7485         {
7486           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7487           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7488           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7489           switch(source[i]&0x3f)
7490           {
7491             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7492             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7493             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7494             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7495             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7496             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7497           }
7498         }
7499         break;
7500       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7501         op2=(source[i]>>21)&0x1f;
7502         switch(op2)
7503         {
7504           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7505           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7506           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7507           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7508           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7509           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7510           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7511           switch((source[i]>>16)&0x3)
7512           {
7513             case 0x00: strcpy(insn[i],"BC1F"); break;
7514             case 0x01: strcpy(insn[i],"BC1T"); break;
7515             case 0x02: strcpy(insn[i],"BC1FL"); break;
7516             case 0x03: strcpy(insn[i],"BC1TL"); break;
7517           }
7518           break;
7519           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7520           switch(source[i]&0x3f)
7521           {
7522             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7523             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7524             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7525             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7526             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7527             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7528             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7529             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7530             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7531             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7532             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7533             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7534             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7535             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7536             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7537             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7538             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7539             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7540             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7541             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7542             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7543             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7544             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7545             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7546             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7547             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7548             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7549             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7550             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7551             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7552             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7553             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7554             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7555             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7556             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7557           }
7558           break;
7559           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7560           switch(source[i]&0x3f)
7561           {
7562             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7563             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7564             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7565             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7566             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7567             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7568             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7569             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7570             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7571             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7572             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7573             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7574             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7575             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7576             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7577             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7578             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7579             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7580             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7581             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7582             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7583             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7584             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7585             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7586             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7587             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7588             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7589             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7590             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7591             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7592             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7593             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7594             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7595             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7596             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7597           }
7598           break;
7599           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7600           switch(source[i]&0x3f)
7601           {
7602             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7603             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7604           }
7605           break;
7606           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7607           switch(source[i]&0x3f)
7608           {
7609             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7610             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7611           }
7612           break;
7613         }
7614         break;
7615 #if 0
7616       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7617       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7618       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7619       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7620       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7621       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7622       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7623       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7624 #endif
7625       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7626       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7627       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7628       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7629       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7630       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7631       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7632 #if 0
7633       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7634 #endif
7635       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7636       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7637       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7638       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7639 #if 0
7640       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7641       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7642 #endif
7643       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7644       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7645       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7646       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7647 #if 0
7648       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7649       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7650       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7651 #endif
7652       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7653       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7654 #if 0
7655       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7656       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7657       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7658 #endif
7659       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7660         op2=(source[i]>>21)&0x1f;
7661         //if (op2 & 0x10) {
7662         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7663           if (gte_handlers[source[i]&0x3f]!=NULL) {
7664             if (gte_regnames[source[i]&0x3f]!=NULL)
7665               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7666             else
7667               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7668             type=C2OP;
7669           }
7670         }
7671         else switch(op2)
7672         {
7673           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7674           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7675           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7676           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7677         }
7678         break;
7679       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7680       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7681       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7682       default: strcpy(insn[i],"???"); type=NI;
7683         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7684         break;
7685     }
7686     itype[i]=type;
7687     opcode2[i]=op2;
7688     /* Get registers/immediates */
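    /* Decoded operands: rs1/rs2 are the MIPS registers read, rt1/rt2 the
       registers written, imm the immediate.  us1/us2 mark sources needed as
       full 64-bit values and dep1/dep2 record which sources the result's
       width depends on (both carried over from the 64-bit N64 dynarec and
       mostly unused here); gte_rs/gte_rt are bitmasks of GTE registers read
       and written. */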
7689     lt1[i]=0;
7690     us1[i]=0;
7691     us2[i]=0;
7692     dep1[i]=0;
7693     dep2[i]=0;
7694     gte_rs[i]=gte_rt[i]=0;
7695     switch(type) {
7696       case LOAD:
7697         rs1[i]=(source[i]>>21)&0x1f;
7698         rs2[i]=0;
7699         rt1[i]=(source[i]>>16)&0x1f;
7700         rt2[i]=0;
7701         imm[i]=(short)source[i];
7702         break;
7703       case STORE:
7704       case STORELR:
7705         rs1[i]=(source[i]>>21)&0x1f;
7706         rs2[i]=(source[i]>>16)&0x1f;
7707         rt1[i]=0;
7708         rt2[i]=0;
7709         imm[i]=(short)source[i];
7710         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7711         break;
7712       case LOADLR:
7713         // LWL/LWR only load part of the register,
7714         // therefore the target register must be treated as a source too
7715         rs1[i]=(source[i]>>21)&0x1f;
7716         rs2[i]=(source[i]>>16)&0x1f;
7717         rt1[i]=(source[i]>>16)&0x1f;
7718         rt2[i]=0;
7719         imm[i]=(short)source[i];
7720         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7721         if(op==0x26) dep1[i]=rt1[i]; // LWR
7722         break;
7723       case IMM16:
7724         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7725         else rs1[i]=(source[i]>>21)&0x1f;
7726         rs2[i]=0;
7727         rt1[i]=(source[i]>>16)&0x1f;
7728         rt2[i]=0;
7729         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7730           imm[i]=(unsigned short)source[i];
7731         }else{
7732           imm[i]=(short)source[i];
7733         }
7734         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7735         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7736         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7737         break;
7738       case UJUMP:
7739         rs1[i]=0;
7740         rs2[i]=0;
7741         rt1[i]=0;
7742         rt2[i]=0;
7743         // The JAL instruction writes to r31.
7744         if (op&1) {
7745           rt1[i]=31;
7746         }
7747         rs2[i]=CCREG;
7748         break;
7749       case RJUMP:
7750         rs1[i]=(source[i]>>21)&0x1f;
7751         rs2[i]=0;
7752         rt1[i]=0;
7753         rt2[i]=0;
7754         // The JALR instruction writes to rd.
7755         if (op2&1) {
7756           rt1[i]=(source[i]>>11)&0x1f;
7757         }
7758         rs2[i]=CCREG;
7759         break;
7760       case CJUMP:
7761         rs1[i]=(source[i]>>21)&0x1f;
7762         rs2[i]=(source[i]>>16)&0x1f;
7763         rt1[i]=0;
7764         rt2[i]=0;
7765         if(op&2) { // BGTZ/BLEZ
7766           rs2[i]=0;
7767         }
7768         us1[i]=rs1[i];
7769         us2[i]=rs2[i];
7770         likely[i]=op>>4;
7771         break;
7772       case SJUMP:
7773         rs1[i]=(source[i]>>21)&0x1f;
7774         rs2[i]=CCREG;
7775         rt1[i]=0;
7776         rt2[i]=0;
7777         us1[i]=rs1[i];
7778         if(op2&0x10) { // BxxAL
7779           rt1[i]=31;
7780           // NOTE: If the branch is not taken, r31 is still overwritten
7781         }
7782         likely[i]=(op2&2)>>1;
7783         break;
7784       case FJUMP:
7785         rs1[i]=FSREG;
7786         rs2[i]=CSREG;
7787         rt1[i]=0;
7788         rt2[i]=0;
7789         likely[i]=((source[i])>>17)&1;
7790         break;
7791       case ALU:
7792         rs1[i]=(source[i]>>21)&0x1f; // source
7793         rs2[i]=(source[i]>>16)&0x1f; // second operand
7794         rt1[i]=(source[i]>>11)&0x1f; // destination
7795         rt2[i]=0;
7796         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7797           us1[i]=rs1[i];us2[i]=rs2[i];
7798         }
7799         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7800           dep1[i]=rs1[i];dep2[i]=rs2[i];
7801         }
7802         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7803           dep1[i]=rs1[i];dep2[i]=rs2[i];
7804         }
7805         break;
7806       case MULTDIV:
7807         rs1[i]=(source[i]>>21)&0x1f; // source
7808         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
7809         rt1[i]=HIREG;
7810         rt2[i]=LOREG;
7811         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7812           us1[i]=rs1[i];us2[i]=rs2[i];
7813         }
7814         break;
7815       case MOV:
7816         rs1[i]=0;
7817         rs2[i]=0;
7818         rt1[i]=0;
7819         rt2[i]=0;
7820         if(op2==0x10) rs1[i]=HIREG; // MFHI
7821         if(op2==0x11) rt1[i]=HIREG; // MTHI
7822         if(op2==0x12) rs1[i]=LOREG; // MFLO
7823         if(op2==0x13) rt1[i]=LOREG; // MTLO
7824         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7825         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7826         dep1[i]=rs1[i];
7827         break;
7828       case SHIFT:
7829         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7830         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7831         rt1[i]=(source[i]>>11)&0x1f; // destination
7832         rt2[i]=0;
7833         // DSLLV/DSRLV/DSRAV are 64-bit
7834         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7835         break;
7836       case SHIFTIMM:
7837         rs1[i]=(source[i]>>16)&0x1f;
7838         rs2[i]=0;
7839         rt1[i]=(source[i]>>11)&0x1f;
7840         rt2[i]=0;
7841         imm[i]=(source[i]>>6)&0x1f;
7842         // DSxx32 instructions
7843         if(op2>=0x3c) imm[i]|=0x20;
7844         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7845         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7846         break;
7847       case COP0:
7848         rs1[i]=0;
7849         rs2[i]=0;
7850         rt1[i]=0;
7851         rt2[i]=0;
7852         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7853         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7854         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7855         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7856         break;
7857       case COP1:
7858         rs1[i]=0;
7859         rs2[i]=0;
7860         rt1[i]=0;
7861         rt2[i]=0;
7862         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7863         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7864         if(op2==5) us1[i]=rs1[i]; // DMTC1
7865         rs2[i]=CSREG;
7866         break;
7867       case COP2:
7868         rs1[i]=0;
7869         rs2[i]=0;
7870         rt1[i]=0;
7871         rt2[i]=0;
7872         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7873         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7874         rs2[i]=CSREG;
7875         int gr=(source[i]>>11)&0x1F;
7876         switch(op2)
7877         {
7878           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7879           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7880           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7881           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7882         }
7883         break;
7884       case C1LS:
7885         rs1[i]=(source[i]>>21)&0x1F;
7886         rs2[i]=CSREG;
7887         rt1[i]=0;
7888         rt2[i]=0;
7889         imm[i]=(short)source[i];
7890         break;
7891       case C2LS:
7892         rs1[i]=(source[i]>>21)&0x1F;
7893         rs2[i]=0;
7894         rt1[i]=0;
7895         rt2[i]=0;
7896         imm[i]=(short)source[i];
7897         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7898         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7899         break;
7900       case C2OP:
7901         rs1[i]=0;
7902         rs2[i]=0;
7903         rt1[i]=0;
7904         rt2[i]=0;
7905         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7906         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7907         gte_rt[i]|=1ll<<63; // every op changes flags
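        // Note on the MVMVA special case below (assuming the usual GTE data
        // register layout: regs 0-5 hold V0/V1/V2 as VXY/VZ pairs, regs 9-11
        // are IR1-IR3): the instruction's 'v' field selects which vector is
        // actually read, so the read mask keeps only that vector, e.g.
        // v==1 -> bits 2-3 (VXY1/VZ1), v==3 -> bits 9-11 (IR1-IR3).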
7908         if((source[i]&0x3f)==GTE_MVMVA) {
7909           int v = (source[i] >> 15) & 3;
7910           gte_rs[i]&=~0xe3fll;
7911           if(v==3) gte_rs[i]|=0xe00ll;
7912           else gte_rs[i]|=3ll<<(v*2);
7913         }
7914         break;
7915       case FLOAT:
7916       case FCONV:
7917         rs1[i]=0;
7918         rs2[i]=CSREG;
7919         rt1[i]=0;
7920         rt2[i]=0;
7921         break;
7922       case FCOMP:
7923         rs1[i]=FSREG;
7924         rs2[i]=CSREG;
7925         rt1[i]=FSREG;
7926         rt2[i]=0;
7927         break;
7928       case SYSCALL:
7929       case HLECALL:
7930       case INTCALL:
7931         rs1[i]=CCREG;
7932         rs2[i]=0;
7933         rt1[i]=0;
7934         rt2[i]=0;
7935         break;
7936       default:
7937         rs1[i]=0;
7938         rs2[i]=0;
7939         rt1[i]=0;
7940         rt2[i]=0;
7941     }
7942     /* Calculate branch target addresses */
7943     if(type==UJUMP)
7944       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7945     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7946       ba[i]=start+i*4+8; // Ignore never taken branch
7947     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7948       ba[i]=start+i*4+8; // Ignore never taken branch
7949     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7950       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7951     else ba[i]=-1;
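    // Worked example (illustrative values only): a J at 0x8001000c with
    // target field 0x012345 yields ba = (0x80010010 & 0xF0000000) |
    // (0x012345<<2) = 0x80048d14; a conditional branch with 16-bit offset
    // -3 (0xfffd) yields ba = PC+4 + (-3<<2) = PC-8.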
7952     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7953       int do_in_intrp=0;
7954       // branch in delay slot?
7955       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7956         // don't compile the first branch; call the interpreter if it's hit
7957         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7958         do_in_intrp=1;
7959       }
7960       // basic load delay detection
7961       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7962         int t=(ba[i-1]-start)/4;
7963         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7964           // jump target wants DS result - potential load delay effect
7965           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7966           do_in_intrp=1;
7967           bt[t+1]=1; // expected return from interpreter
7968         }
7969         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7970               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7971           // v0 overwrite like this is a sign of trouble, bail out
7972           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7973           do_in_intrp=1;
7974         }
7975       }
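      // Illustrative pattern caught above (hypothetical register names):
      //     beq   $t0, $zero, target
      //     lw    $v0, 0($a0)       # load in the delay slot
      //   target:
      //     addu  $t1, $v0, $t2     # target reads the just-loaded register
      // Code like this may rely on the MIPS load delay slot, so the branch
      // is turned into an INTCALL and left to the interpreter.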
7976       if(do_in_intrp) {
7977         rs1[i-1]=CCREG;
7978         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7979         ba[i-1]=-1;
7980         itype[i-1]=INTCALL;
7981         done=2;
7982         i--; // don't compile the DS
7983       }
7984     }
7985     /* Is this the end of the block? */
7986     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7987       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7988         done=2;
7989       }
7990       else {
7991         if(stop_after_jal) done=1;
7992         // Stop on BREAK
7993         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7994       }
7995       // Don't recompile stuff that's already compiled
7996       if(check_addr(start+i*4+4)) done=1;
7997       // Don't get too close to the limit
7998       if(i>MAXBLOCK/2) done=1;
7999     }
8000     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8001     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8002     if(done==2) {
8003       // Does the block continue due to a branch?
8004       for(j=i-1;j>=0;j--)
8005       {
8006         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8007         if(ba[j]==start+i*4+4) done=j=0;
8008         if(ba[j]==start+i*4+8) done=j=0;
8009       }
8010     }
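    // e.g. a JAL would normally end the block here (done==2), but if an
    // earlier conditional branch targets the instructions just past it,
    // the scan above clears 'done' and compilation continues through them.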
8011     //assert(i<MAXBLOCK-1);
8012     if(start+i*4==pagelimit-4) done=1;
8013     assert(start+i*4<pagelimit);
8014     if (i==MAXBLOCK-1) done=1;
8015     // Stop if we're compiling junk
8016     if(itype[i]==NI&&opcode[i]==0x11) {
8017       done=stop_after_jal=1;
8018       SysPrintf("Disabled speculative precompilation\n");
8019     }
8020   }
8021   slen=i;
8022   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8023     if(start+i*4==pagelimit) {
8024       itype[i-1]=SPAN;
8025     }
8026   }
8027   assert(slen>0);
8028
8029   /* Pass 2 - Register dependencies and branch targets */
8030
8031   unneeded_registers(0,slen-1,0);
8032
8033   /* Pass 3 - Register allocation */
8034
8035   struct regstat current; // Current register allocations/status
8036   current.is32=1;
8037   current.dirty=0;
8038   current.u=unneeded_reg[0];
8039   current.uu=unneeded_reg_upper[0];
8040   clear_all_regs(current.regmap);
8041   alloc_reg(&current,0,CCREG);
8042   dirty_reg(&current,CCREG);
8043   current.isconst=0;
8044   current.wasconst=0;
8045   current.waswritten=0;
8046   int ds=0;
8047   int cc=0;
8048   int hr=-1;
8049
8050   if((u_int)addr&1) {
8051     // First instruction is delay slot
8052     cc=-1;
8053     bt[1]=1;
8054     ds=1;
8055     unneeded_reg[0]=1;
8056     unneeded_reg_upper[0]=1;
8057     current.regmap[HOST_BTREG]=BTREG;
8058   }
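  // (The low bit of 'addr' appears to act as a flag meaning "the entry point
  //  is a branch delay slot"; MIPS instruction addresses are word-aligned,
  //  so that bit is otherwise unused.)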
8059
8060   for(i=0;i<slen;i++)
8061   {
8062     if(bt[i])
8063     {
8064       int hr;
8065       for(hr=0;hr<HOST_REGS;hr++)
8066       {
8067         // Is this really necessary?
8068         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8069       }
8070       current.isconst=0;
8071       current.waswritten=0;
8072     }
8073     if(i>1)
8074     {
8075       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8076       {
8077         if(rs1[i-2]==0||rs2[i-2]==0)
8078         {
8079           if(rs1[i-2]) {
8080             current.is32|=1LL<<rs1[i-2];
8081             int hr=get_reg(current.regmap,rs1[i-2]|64);
8082             if(hr>=0) current.regmap[hr]=-1;
8083           }
8084           if(rs2[i-2]) {
8085             current.is32|=1LL<<rs2[i-2];
8086             int hr=get_reg(current.regmap,rs2[i-2]|64);
8087             if(hr>=0) current.regmap[hr]=-1;
8088           }
8089         }
8090       }
8091     }
8092     current.is32=-1LL;
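    // The assignment above marks every register as 32-bit, which effectively
    // neutralizes the 64-bit (is32/uu) liveness tracking - presumably
    // harmless here since the emulated R3000A has no 64-bit GPRs.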
8093
8094     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8095     regs[i].wasconst=current.isconst;
8096     regs[i].was32=current.is32;
8097     regs[i].wasdirty=current.dirty;
8098     regs[i].loadedconst=0;
8099     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8100       if(i+1<slen) {
8101         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8102         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8103         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8104         current.u|=1;
8105         current.uu|=1;
8106       } else {
8107         current.u=1;
8108         current.uu=1;
8109       }
8110     } else {
8111       if(i+1<slen) {
8112         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8113         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8114         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8115         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8116         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8117         current.u|=1;
8118         current.uu|=1;
8119       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8120     }
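    // In current.u/current.uu a set bit means the corresponding MIPS
    // register's value (lower/upper half) is dead past this point; sources
    // of the current instruction (and of the branch delay slot above) are
    // forced live by clearing their bits, and bit 0 ($zero) always stays
    // marked unneeded.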
8121     is_ds[i]=ds;
8122     if(ds) {
8123       ds=0; // Skip delay slot, already allocated as part of branch
8124       // ...but we need to alloc it in case something jumps here
8125       if(i+1<slen) {
8126         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8127         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8128       }else{
8129         current.u=branch_unneeded_reg[i-1];
8130         current.uu=branch_unneeded_reg_upper[i-1];
8131       }
8132       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8133       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8134       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8135       current.u|=1;
8136       current.uu|=1;
8137       struct regstat temp;
8138       memcpy(&temp,&current,sizeof(current));
8139       temp.wasdirty=temp.dirty;
8140       temp.was32=temp.is32;
8141       // TODO: Take into account unconditional branches, as below
8142       delayslot_alloc(&temp,i);
8143       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8144       regs[i].wasdirty=temp.wasdirty;
8145       regs[i].was32=temp.was32;
8146       regs[i].dirty=temp.dirty;
8147       regs[i].is32=temp.is32;
8148       regs[i].isconst=0;
8149       regs[i].wasconst=0;
8150       current.isconst=0;
8151       // Create entry (branch target) regmap
8152       for(hr=0;hr<HOST_REGS;hr++)
8153       {
8154         int r=temp.regmap[hr];
8155         if(r>=0) {
8156           if(r!=regmap_pre[i][hr]) {
8157             regs[i].regmap_entry[hr]=-1;
8158           }
8159           else
8160           {
8161             if(r<64){
8162               if((current.u>>r)&1) {
8163                 regs[i].regmap_entry[hr]=-1;
8164                 regs[i].regmap[hr]=-1;
8165                 //Don't clear regs in the delay slot as the branch might need them
8166                 //current.regmap[hr]=-1;
8167               }else
8168                 regs[i].regmap_entry[hr]=r;
8169             }
8170             else {
8171               if((current.uu>>(r&63))&1) {
8172                 regs[i].regmap_entry[hr]=-1;
8173                 regs[i].regmap[hr]=-1;
8174                 //Don't clear regs in the delay slot as the branch might need them
8175                 //current.regmap[hr]=-1;
8176               }else
8177                 regs[i].regmap_entry[hr]=r;
8178             }
8179           }
8180         } else {
8181           // First instruction expects CCREG to be allocated
8182           if(i==0&&hr==HOST_CCREG)
8183             regs[i].regmap_entry[hr]=CCREG;
8184           else
8185             regs[i].regmap_entry[hr]=-1;
8186         }
8187       }
8188     }
8189     else { // Not delay slot
8190       switch(itype[i]) {
8191         case UJUMP:
8192           //current.isconst=0; // DEBUG
8193           //current.wasconst=0; // DEBUG
8194           //regs[i].wasconst=0; // DEBUG
8195           clear_const(&current,rt1[i]);
8196           alloc_cc(&current,i);
8197           dirty_reg(&current,CCREG);
8198           if (rt1[i]==31) {
8199             alloc_reg(&current,i,31);
8200             dirty_reg(&current,31);
8201             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8202             //assert(rt1[i+1]!=rt1[i]);
8203             #ifdef REG_PREFETCH
8204             alloc_reg(&current,i,PTEMP);
8205             #endif
8206             //current.is32|=1LL<<rt1[i];
8207           }
8208           ooo[i]=1;
8209           delayslot_alloc(&current,i+1);
8210           //current.isconst=0; // DEBUG
8211           ds=1;
8212           //printf("i=%d, isconst=%x\n",i,current.isconst);
8213           break;
8214         case RJUMP:
8215           //current.isconst=0;
8216           //current.wasconst=0;
8217           //regs[i].wasconst=0;
8218           clear_const(&current,rs1[i]);
8219           clear_const(&current,rt1[i]);
8220           alloc_cc(&current,i);
8221           dirty_reg(&current,CCREG);
8222           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8223             alloc_reg(&current,i,rs1[i]);
8224             if (rt1[i]!=0) {
8225               alloc_reg(&current,i,rt1[i]);
8226               dirty_reg(&current,rt1[i]);
8227               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8228               assert(rt1[i+1]!=rt1[i]);
8229               #ifdef REG_PREFETCH
8230               alloc_reg(&current,i,PTEMP);
8231               #endif
8232             }
8233             #ifdef USE_MINI_HT
8234             if(rs1[i]==31) { // JALR
8235               alloc_reg(&current,i,RHASH);
8236               #ifndef HOST_IMM_ADDR32
8237               alloc_reg(&current,i,RHTBL);
8238               #endif
8239             }
8240             #endif
8241             delayslot_alloc(&current,i+1);
8242           } else {
8243             // The delay slot overwrites our source register,
8244             // allocate a temporary register to hold the old value.
8245             current.isconst=0;
8246             current.wasconst=0;
8247             regs[i].wasconst=0;
8248             delayslot_alloc(&current,i+1);
8249             current.isconst=0;
8250             alloc_reg(&current,i,RTEMP);
8251           }
8252           //current.isconst=0; // DEBUG
8253           ooo[i]=1;
8254           ds=1;
8255           break;
8256         case CJUMP:
8257           //current.isconst=0;
8258           //current.wasconst=0;
8259           //regs[i].wasconst=0;
8260           clear_const(&current,rs1[i]);
8261           clear_const(&current,rs2[i]);
8262           if((opcode[i]&0x3E)==4) // BEQ/BNE
8263           {
8264             alloc_cc(&current,i);
8265             dirty_reg(&current,CCREG);
8266             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8267             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8268             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8269             {
8270               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8271               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8272             }
8273             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8274                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8275               // The delay slot overwrites one of our conditions.
8276               // Allocate the branch condition registers instead.
8277               current.isconst=0;
8278               current.wasconst=0;
8279               regs[i].wasconst=0;
8280               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8281               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8282               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8283               {
8284                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8285                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8286               }
8287             }
8288             else
8289             {
8290               ooo[i]=1;
8291               delayslot_alloc(&current,i+1);
8292             }
8293           }
8294           else
8295           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8296           {
8297             alloc_cc(&current,i);
8298             dirty_reg(&current,CCREG);
8299             alloc_reg(&current,i,rs1[i]);
8300             if(!(current.is32>>rs1[i]&1))
8301             {
8302               alloc_reg64(&current,i,rs1[i]);
8303             }
8304             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8305               // The delay slot overwrites one of our conditions.
8306               // Allocate the branch condition registers instead.
8307               current.isconst=0;
8308               current.wasconst=0;
8309               regs[i].wasconst=0;
8310               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8311               if(!((current.is32>>rs1[i])&1))
8312               {
8313                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8314               }
8315             }
8316             else
8317             {
8318               ooo[i]=1;
8319               delayslot_alloc(&current,i+1);
8320             }
8321           }
8322           else
8323           // Don't alloc the delay slot yet because we might not execute it
8324           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8325           {
8326             current.isconst=0;
8327             current.wasconst=0;
8328             regs[i].wasconst=0;
8329             alloc_cc(&current,i);
8330             dirty_reg(&current,CCREG);
8331             alloc_reg(&current,i,rs1[i]);
8332             alloc_reg(&current,i,rs2[i]);
8333             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8334             {
8335               alloc_reg64(&current,i,rs1[i]);
8336               alloc_reg64(&current,i,rs2[i]);
8337             }
8338           }
8339           else
8340           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8341           {
8342             current.isconst=0;
8343             current.wasconst=0;
8344             regs[i].wasconst=0;
8345             alloc_cc(&current,i);
8346             dirty_reg(&current,CCREG);
8347             alloc_reg(&current,i,rs1[i]);
8348             if(!(current.is32>>rs1[i]&1))
8349             {
8350               alloc_reg64(&current,i,rs1[i]);
8351             }
8352           }
8353           ds=1;
8354           //current.isconst=0;
8355           break;
8356         case SJUMP:
8357           //current.isconst=0;
8358           //current.wasconst=0;
8359           //regs[i].wasconst=0;
8360           clear_const(&current,rs1[i]);
8361           clear_const(&current,rt1[i]);
8362           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8363           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8364           {
8365             alloc_cc(&current,i);
8366             dirty_reg(&current,CCREG);
8367             alloc_reg(&current,i,rs1[i]);
8368             if(!(current.is32>>rs1[i]&1))
8369             {
8370               alloc_reg64(&current,i,rs1[i]);
8371             }
8372             if (rt1[i]==31) { // BLTZAL/BGEZAL
8373               alloc_reg(&current,i,31);
8374               dirty_reg(&current,31);
8375               //#ifdef REG_PREFETCH
8376               //alloc_reg(&current,i,PTEMP);
8377               //#endif
8378               //current.is32|=1LL<<rt1[i];
8379             }
8380             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8381                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8382               // Allocate the branch condition registers instead.
8383               current.isconst=0;
8384               current.wasconst=0;
8385               regs[i].wasconst=0;
8386               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8387               if(!((current.is32>>rs1[i])&1))
8388               {
8389                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8390               }
8391             }
8392             else
8393             {
8394               ooo[i]=1;
8395               delayslot_alloc(&current,i+1);
8396             }
8397           }
8398           else
8399           // Don't alloc the delay slot yet because we might not execute it
8400           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8401           {
8402             current.isconst=0;
8403             current.wasconst=0;
8404             regs[i].wasconst=0;
8405             alloc_cc(&current,i);
8406             dirty_reg(&current,CCREG);
8407             alloc_reg(&current,i,rs1[i]);
8408             if(!(current.is32>>rs1[i]&1))
8409             {
8410               alloc_reg64(&current,i,rs1[i]);
8411             }
8412           }
8413           ds=1;
8414           //current.isconst=0;
8415           break;
8416         case FJUMP:
8417           current.isconst=0;
8418           current.wasconst=0;
8419           regs[i].wasconst=0;
8420           if(likely[i]==0) // BC1F/BC1T
8421           {
8422             // TODO: Theoretically we can run out of registers here on x86.
8423             // The delay slot can allocate up to six, and we need to check
8424             // CSREG before executing the delay slot.  Possibly we can drop
8425             // the cycle count and then reload it after checking that the
8426             // FPU is in a usable state, or don't do out-of-order execution.
8427             alloc_cc(&current,i);
8428             dirty_reg(&current,CCREG);
8429             alloc_reg(&current,i,FSREG);
8430             alloc_reg(&current,i,CSREG);
8431             if(itype[i+1]==FCOMP) {
8432               // The delay slot overwrites the branch condition.
8433               // Allocate the branch condition registers instead.
8434               alloc_cc(&current,i);
8435               dirty_reg(&current,CCREG);
8436               alloc_reg(&current,i,CSREG);
8437               alloc_reg(&current,i,FSREG);
8438             }
8439             else {
8440               ooo[i]=1;
8441               delayslot_alloc(&current,i+1);
8442               alloc_reg(&current,i+1,CSREG);
8443             }
8444           }
8445           else
8446           // Don't alloc the delay slot yet because we might not execute it
8447           if(likely[i]) // BC1FL/BC1TL
8448           {
8449             alloc_cc(&current,i);
8450             dirty_reg(&current,CCREG);
8451             alloc_reg(&current,i,CSREG);
8452             alloc_reg(&current,i,FSREG);
8453           }
8454           ds=1;
8455           current.isconst=0;
8456           break;
8457         case IMM16:
8458           imm16_alloc(&current,i);
8459           break;
8460         case LOAD:
8461         case LOADLR:
8462           load_alloc(&current,i);
8463           break;
8464         case STORE:
8465         case STORELR:
8466           store_alloc(&current,i);
8467           break;
8468         case ALU:
8469           alu_alloc(&current,i);
8470           break;
8471         case SHIFT:
8472           shift_alloc(&current,i);
8473           break;
8474         case MULTDIV:
8475           multdiv_alloc(&current,i);
8476           break;
8477         case SHIFTIMM:
8478           shiftimm_alloc(&current,i);
8479           break;
8480         case MOV:
8481           mov_alloc(&current,i);
8482           break;
8483         case COP0:
8484           cop0_alloc(&current,i);
8485           break;
8486         case COP1:
8487         case COP2:
8488           cop1_alloc(&current,i);
8489           break;
8490         case C1LS:
8491           c1ls_alloc(&current,i);
8492           break;
8493         case C2LS:
8494           c2ls_alloc(&current,i);
8495           break;
8496         case C2OP:
8497           c2op_alloc(&current,i);
8498           break;
8499         case FCONV:
8500           fconv_alloc(&current,i);
8501           break;
8502         case FLOAT:
8503           float_alloc(&current,i);
8504           break;
8505         case FCOMP:
8506           fcomp_alloc(&current,i);
8507           break;
8508         case SYSCALL:
8509         case HLECALL:
8510         case INTCALL:
8511           syscall_alloc(&current,i);
8512           break;
8513         case SPAN:
8514           pagespan_alloc(&current,i);
8515           break;
8516       }
8517
8518       // Drop the upper half of registers that have become 32-bit
8519       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8520       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8521         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8522         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8523         current.uu|=1;
8524       } else {
8525         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8526         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8527         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8528         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8529         current.uu|=1;
8530       }
8531
8532       // Create entry (branch target) regmap
8533       for(hr=0;hr<HOST_REGS;hr++)
8534       {
8535         int r,or;
8536         r=current.regmap[hr];
8537         if(r>=0) {
8538           if(r!=regmap_pre[i][hr]) {
8539             // TODO: delay slot (?)
8540             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8541             if(or<0||(r&63)>=TEMPREG){
8542               regs[i].regmap_entry[hr]=-1;
8543             }
8544             else
8545             {
8546               // Just move it to a different register
8547               regs[i].regmap_entry[hr]=r;
8548               // If it was dirty before, it's still dirty
8549               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8550             }
8551           }
8552           else
8553           {
8554             // Unneeded
8555             if(r==0){
8556               regs[i].regmap_entry[hr]=0;
8557             }
8558             else
8559             if(r<64){
8560               if((current.u>>r)&1) {
8561                 regs[i].regmap_entry[hr]=-1;
8562                 //regs[i].regmap[hr]=-1;
8563                 current.regmap[hr]=-1;
8564               }else
8565                 regs[i].regmap_entry[hr]=r;
8566             }
8567             else {
8568               if((current.uu>>(r&63))&1) {
8569                 regs[i].regmap_entry[hr]=-1;
8570                 //regs[i].regmap[hr]=-1;
8571                 current.regmap[hr]=-1;
8572               }else
8573                 regs[i].regmap_entry[hr]=r;
8574             }
8575           }
8576         } else {
8577           // Branches expect CCREG to be allocated at the target
8578           if(regmap_pre[i][hr]==CCREG)
8579             regs[i].regmap_entry[hr]=CCREG;
8580           else
8581             regs[i].regmap_entry[hr]=-1;
8582         }
8583       }
8584       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8585     }
8586
8587     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8588       current.waswritten|=1<<rs1[i-1];
8589     current.waswritten&=~(1<<rt1[i]);
8590     current.waswritten&=~(1<<rt2[i]);
8591     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8592       current.waswritten&=~(1<<rs1[i]);
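    // Hedged note: 'waswritten' appears to track GPRs recently used as a
    // store base with a small (<0x800) offset; a register's bit is dropped
    // once it is overwritten or used as a base with a large offset.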
8593
8594     /* Branch post-alloc */
8595     if(i>0)
8596     {
8597       current.was32=current.is32;
8598       current.wasdirty=current.dirty;
8599       switch(itype[i-1]) {
8600         case UJUMP:
8601           memcpy(&branch_regs[i-1],&current,sizeof(current));
8602           branch_regs[i-1].isconst=0;
8603           branch_regs[i-1].wasconst=0;
8604           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8605           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8606           alloc_cc(&branch_regs[i-1],i-1);
8607           dirty_reg(&branch_regs[i-1],CCREG);
8608           if(rt1[i-1]==31) { // JAL
8609             alloc_reg(&branch_regs[i-1],i-1,31);
8610             dirty_reg(&branch_regs[i-1],31);
8611             branch_regs[i-1].is32|=1LL<<31;
8612           }
8613           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8614           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8615           break;
8616         case RJUMP:
8617           memcpy(&branch_regs[i-1],&current,sizeof(current));
8618           branch_regs[i-1].isconst=0;
8619           branch_regs[i-1].wasconst=0;
8620           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8621           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8622           alloc_cc(&branch_regs[i-1],i-1);
8623           dirty_reg(&branch_regs[i-1],CCREG);
8624           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8625           if(rt1[i-1]!=0) { // JALR
8626             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8627             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8628             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8629           }
8630           #ifdef USE_MINI_HT
8631           if(rs1[i-1]==31) { // JALR
8632             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8633             #ifndef HOST_IMM_ADDR32
8634             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8635             #endif
8636           }
8637           #endif
8638           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8639           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8640           break;
8641         case CJUMP:
8642           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8643           {
8644             alloc_cc(&current,i-1);
8645             dirty_reg(&current,CCREG);
8646             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8647                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8648               // The delay slot overwrote one of our conditions
8649               // Delay slot goes after the test (in order)
8650               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8651               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8652               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8653               current.u|=1;
8654               current.uu|=1;
8655               delayslot_alloc(&current,i);
8656               current.isconst=0;
8657             }
8658             else
8659             {
8660               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8661               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8662               // Alloc the branch condition registers
8663               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8664               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8665               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8666               {
8667                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8668                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8669               }
8670             }
8671             memcpy(&branch_regs[i-1],&current,sizeof(current));
8672             branch_regs[i-1].isconst=0;
8673             branch_regs[i-1].wasconst=0;
8674             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8675             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8676           }
8677           else
8678           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8679           {
8680             alloc_cc(&current,i-1);
8681             dirty_reg(&current,CCREG);
8682             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8683               // The delay slot overwrote the branch condition
8684               // Delay slot goes after the test (in order)
8685               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8686               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8687               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8688               current.u|=1;
8689               current.uu|=1;
8690               delayslot_alloc(&current,i);
8691               current.isconst=0;
8692             }
8693             else
8694             {
8695               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8696               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8697               // Alloc the branch condition register
8698               alloc_reg(&current,i-1,rs1[i-1]);
8699               if(!(current.is32>>rs1[i-1]&1))
8700               {
8701                 alloc_reg64(&current,i-1,rs1[i-1]);
8702               }
8703             }
8704             memcpy(&branch_regs[i-1],&current,sizeof(current));
8705             branch_regs[i-1].isconst=0;
8706             branch_regs[i-1].wasconst=0;
8707             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8708             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8709           }
8710           else
8711           // Alloc the delay slot in case the branch is taken
8712           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8713           {
8714             memcpy(&branch_regs[i-1],&current,sizeof(current));
8715             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8716             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8717             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8718             alloc_cc(&branch_regs[i-1],i);
8719             dirty_reg(&branch_regs[i-1],CCREG);
8720             delayslot_alloc(&branch_regs[i-1],i);
8721             branch_regs[i-1].isconst=0;
8722             alloc_reg(&current,i,CCREG); // Not taken path
8723             dirty_reg(&current,CCREG);
8724             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8725           }
8726           else
8727           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8728           {
8729             memcpy(&branch_regs[i-1],&current,sizeof(current));
8730             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8731             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8732             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8733             alloc_cc(&branch_regs[i-1],i);
8734             dirty_reg(&branch_regs[i-1],CCREG);
8735             delayslot_alloc(&branch_regs[i-1],i);
8736             branch_regs[i-1].isconst=0;
8737             alloc_reg(&current,i,CCREG); // Not taken path
8738             dirty_reg(&current,CCREG);
8739             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8740           }
8741           break;
8742         case SJUMP:
8743           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8744           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8745           {
8746             alloc_cc(&current,i-1);
8747             dirty_reg(&current,CCREG);
8748             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8749               // The delay slot overwrote the branch condition
8750               // Delay slot goes after the test (in order)
8751               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8752               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8753               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8754               current.u|=1;
8755               current.uu|=1;
8756               delayslot_alloc(&current,i);
8757               current.isconst=0;
8758             }
8759             else
8760             {
8761               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8762               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8763               // Alloc the branch condition register
8764               alloc_reg(&current,i-1,rs1[i-1]);
8765               if(!(current.is32>>rs1[i-1]&1))
8766               {
8767                 alloc_reg64(&current,i-1,rs1[i-1]);
8768               }
8769             }
8770             memcpy(&branch_regs[i-1],&current,sizeof(current));
8771             branch_regs[i-1].isconst=0;
8772             branch_regs[i-1].wasconst=0;
8773             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8774             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8775           }
8776           else
8777           // Alloc the delay slot in case the branch is taken
8778           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8779           {
8780             memcpy(&branch_regs[i-1],&current,sizeof(current));
8781             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8782             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8783             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8784             alloc_cc(&branch_regs[i-1],i);
8785             dirty_reg(&branch_regs[i-1],CCREG);
8786             delayslot_alloc(&branch_regs[i-1],i);
8787             branch_regs[i-1].isconst=0;
8788             alloc_reg(&current,i,CCREG); // Not taken path
8789             dirty_reg(&current,CCREG);
8790             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8791           }
8792           // FIXME: BLTZAL/BGEZAL
8793           if(opcode2[i-1]&0x10) { // BxxZAL
8794             alloc_reg(&branch_regs[i-1],i-1,31);
8795             dirty_reg(&branch_regs[i-1],31);
8796             branch_regs[i-1].is32|=1LL<<31;
8797           }
8798           break;
8799         case FJUMP:
8800           if(likely[i-1]==0) // BC1F/BC1T
8801           {
8802             alloc_cc(&current,i-1);
8803             dirty_reg(&current,CCREG);
8804             if(itype[i]==FCOMP) {
8805               // The delay slot overwrote the branch condition
8806               // Delay slot goes after the test (in order)
8807               delayslot_alloc(&current,i);
8808               current.isconst=0;
8809             }
8810             else
8811             {
8812               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8813               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8814               // Alloc the branch condition register
8815               alloc_reg(&current,i-1,FSREG);
8816             }
8817             memcpy(&branch_regs[i-1],&current,sizeof(current));
8818             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8819           }
8820           else // BC1FL/BC1TL
8821           {
8822             // Alloc the delay slot in case the branch is taken
8823             memcpy(&branch_regs[i-1],&current,sizeof(current));
8824             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8825             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8826             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8827             alloc_cc(&branch_regs[i-1],i);
8828             dirty_reg(&branch_regs[i-1],CCREG);
8829             delayslot_alloc(&branch_regs[i-1],i);
8830             branch_regs[i-1].isconst=0;
8831             alloc_reg(&current,i,CCREG); // Not taken path
8832             dirty_reg(&current,CCREG);
8833             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8834           }
8835           break;
8836       }
8837
8838       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8839       {
8840         if(rt1[i-1]==31) // JAL/JALR
8841         {
8842           // Subroutine call will return here, don't alloc any registers
8843           current.is32=1;
8844           current.dirty=0;
8845           clear_all_regs(current.regmap);
8846           alloc_reg(&current,i,CCREG);
8847           dirty_reg(&current,CCREG);
8848         }
8849         else if(i+1<slen)
8850         {
8851           // Internal branch will jump here, match registers to caller
8852           current.is32=0x3FFFFFFFFLL;
8853           current.dirty=0;
8854           clear_all_regs(current.regmap);
8855           alloc_reg(&current,i,CCREG);
8856           dirty_reg(&current,CCREG);
8857           for(j=i-1;j>=0;j--)
8858           {
8859             if(ba[j]==start+i*4+4) {
8860               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8861               current.is32=branch_regs[j].is32;
8862               current.dirty=branch_regs[j].dirty;
8863               break;
8864             }
8865           }
8866           while(j>=0) {
8867             if(ba[j]==start+i*4+4) {
8868               for(hr=0;hr<HOST_REGS;hr++) {
8869                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8870                   current.regmap[hr]=-1;
8871                 }
8872                 current.is32&=branch_regs[j].is32;
8873                 current.dirty&=branch_regs[j].dirty;
8874               }
8875             }
8876             j--;
8877           }
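          // Merge strategy above: copy the register map of one branch that
          // targets this point, then intersect it with every other such
          // branch - any host register whose mapping differs is dropped and
          // is32/dirty are conservatively AND-ed together.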
8878         }
8879       }
8880     }
8881
8882     // Count cycles in between branches
8883     ccadj[i]=cc;
8884     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8885     {
8886       cc=0;
8887     }
8888 #if !defined(DRC_DBG)
8889     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8890     {
8891       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8892       cc+=gte_cycletab[source[i]&0x3f]/2;
8893     }
8894     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load/store causes weird timing issues
8895     {
8896       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8897     }
8898     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8899     {
8900       cc+=4;
8901     }
8902     else if(itype[i]==C2LS)
8903     {
8904       cc+=4;
8905     }
8906 #endif
8907     else
8908     {
8909       cc++;
8910     }
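    // 'cc' accumulates an estimated cycle count since the last branch, and
    // ccadj[i] (set above) records that running total at instruction i, so
    // the generated code presumably only has to update the cycle counter at
    // branches and calls rather than per instruction.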
8911
8912     flush_dirty_uppers(&current);
8913     if(!is_ds[i]) {
8914       regs[i].is32=current.is32;
8915       regs[i].dirty=current.dirty;
8916       regs[i].isconst=current.isconst;
8917       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8918     }
8919     for(hr=0;hr<HOST_REGS;hr++) {
8920       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8921         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8922           regs[i].wasconst&=~(1<<hr);
8923         }
8924       }
8925     }
8926     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8927     regs[i].waswritten=current.waswritten;
8928   }
8929
8930   /* Pass 4 - Cull unused host registers */
8931
8932   uint64_t nr=0;
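  // 'nr' is a bitmask over host registers, built in a single backward scan:
  // a set bit means the value currently held in that host register is still
  // needed by a later instruction or branch target.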
8933
8934   for (i=slen-1;i>=0;i--)
8935   {
8936     int hr;
8937     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8938     {
8939       if(ba[i]<start || ba[i]>=(start+slen*4))
8940       {
8941         // Branch out of this block, don't need anything
8942         nr=0;
8943       }
8944       else
8945       {
8946         // Internal branch
8947         // Need whatever matches the target
8948         nr=0;
8949         int t=(ba[i]-start)>>2;
8950         for(hr=0;hr<HOST_REGS;hr++)
8951         {
8952           if(regs[i].regmap_entry[hr]>=0) {
8953             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8954           }
8955         }
8956       }
8957       // Conditional branch may need registers for following instructions
8958       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8959       {
8960         if(i<slen-2) {
8961           nr|=needed_reg[i+2];
8962           for(hr=0;hr<HOST_REGS;hr++)
8963           {
8964             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8965             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8966           }
8967         }
8968       }
8969       // Don't need stuff which is overwritten
8970       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8971       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8972       // Merge in delay slot
8973       for(hr=0;hr<HOST_REGS;hr++)
8974       {
8975         if(!likely[i]) {
8976           // These are overwritten unless the branch is "likely"
8977           // and the delay slot is nullified if not taken
8978           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8979           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8980         }
8981         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8982         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8983         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8984         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8985         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8986         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8987         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8988         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8989         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8990           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8991           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8992         }
8993         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8994           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8995           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8996         }
8997         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8998           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8999           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9000         }
9001       }
9002     }
9003     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9004     {
9005       // SYSCALL instruction (software interrupt)
9006       nr=0;
9007     }
9008     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9009     {
9010       // ERET instruction (return from interrupt)
9011       nr=0;
9012     }
9013     else // Non-branch
9014     {
9015       if(i<slen-1) {
9016         for(hr=0;hr<HOST_REGS;hr++) {
9017           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9018           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9019           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9020           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9021         }
9022       }
9023     }
9024     for(hr=0;hr<HOST_REGS;hr++)
9025     {
9026       // Overwritten registers are not needed
9027       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9028       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9029       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9030       // Source registers are needed
9031       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9032       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9033       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9034       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9035       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9036       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9037       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9038       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9039       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9040         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9041         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9042       }
9043       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9044         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9045         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9046       }
9047       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9048         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9049         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9050       }
9051       // Don't store a register immediately after writing it;
9052       // doing so may prevent dual-issue.
9053       // But do so if this is a branch target, otherwise we
9054       // might have to load the register before the branch.
9055       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9056         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9057            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9058           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9059           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9060         }
9061         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9062            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9063           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9064           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9065         }
9066       }
9067     }
9068     // Cycle count is needed at branches.  Assume it is needed at the target too.
9069     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9070       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9071       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9072     }
9073     // Save it
9074     needed_reg[i]=nr;
9075
9076     // Deallocate unneeded registers
9077     for(hr=0;hr<HOST_REGS;hr++)
9078     {
9079       if(!((nr>>hr)&1)) {
9080         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9081         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9082            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9083            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9084         {
9085           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9086           {
9087             if(likely[i]) {
9088               regs[i].regmap[hr]=-1;
9089               regs[i].isconst&=~(1<<hr);
9090               if(i<slen-2) {
9091                 regmap_pre[i+2][hr]=-1;
9092                 regs[i+2].wasconst&=~(1<<hr);
9093               }
9094             }
9095           }
9096         }
9097         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9098         {
9099           int d1=0,d2=0,map=0,temp=0;
9100           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9101           {
9102             d1=dep1[i+1];
9103             d2=dep2[i+1];
9104           }
9105           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9106              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9107             map=INVCP;
9108           }
9109           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9110              itype[i+1]==C1LS || itype[i+1]==C2LS)
9111             temp=FTEMP;
9112           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9113              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9114              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9115              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9116              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9117              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9118              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9119              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9120              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9121              regs[i].regmap[hr]!=map )
9122           {
9123             regs[i].regmap[hr]=-1;
9124             regs[i].isconst&=~(1<<hr);
9125             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9126                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9127                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9128                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9129                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9130                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9131                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9132                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9133                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9134                branch_regs[i].regmap[hr]!=map)
9135             {
9136               branch_regs[i].regmap[hr]=-1;
9137               branch_regs[i].regmap_entry[hr]=-1;
9138               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9139               {
9140                 if(!likely[i]&&i<slen-2) {
9141                   regmap_pre[i+2][hr]=-1;
9142                   regs[i+2].wasconst&=~(1<<hr);
9143                 }
9144               }
9145             }
9146           }
9147         }
9148         else
9149         {
9150           // Non-branch
9151           if(i>0)
9152           {
9153             int d1=0,d2=0,map=-1,temp=-1;
9154             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9155             {
9156               d1=dep1[i];
9157               d2=dep2[i];
9158             }
9159             if(itype[i]==STORE || itype[i]==STORELR ||
9160                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9161               map=INVCP;
9162             }
9163             if(itype[i]==LOADLR || itype[i]==STORELR ||
9164                itype[i]==C1LS || itype[i]==C2LS)
9165               temp=FTEMP;
9166             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9167                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9168                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9169                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9170                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9171                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9172             {
9173               if(i<slen-1&&!is_ds[i]) {
9174                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9175                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9176                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9177                 {
9178                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9179                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9180                 }
9181                 regmap_pre[i+1][hr]=-1;
9182                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9183                 regs[i+1].wasconst&=~(1<<hr);
9184               }
9185               regs[i].regmap[hr]=-1;
9186               regs[i].isconst&=~(1<<hr);
9187             }
9188           }
9189         }
9190       }
9191     }
9192   }
9193
9194   /* Pass 5 - Pre-allocate registers */
9195
9196   // If a register is allocated during a loop, try to allocate it for the
9197   // entire loop, if possible.  This avoids loading/storing registers
9198   // inside of the loop.
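  // Illustrative example (hypothetical situation): if a loop counter only
  // gets a host register halfway through the loop body, f_regmap is used to
  // try to extend that assignment back to the loop entry so the value is not
  // reloaded from memory on every iteration.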
9199
9200   signed char f_regmap[HOST_REGS];
9201   clear_all_regs(f_regmap);
9202   for(i=0;i<slen-1;i++)
9203   {
9204     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9205     {
9206       if(ba[i]>=start && ba[i]<(start+i*4))
9207       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9208       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9209       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9210       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9211       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9212       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9213       {
9214         int t=(ba[i]-start)>>2;
9215         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9216         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9217         for(hr=0;hr<HOST_REGS;hr++)
9218         {
9219           if(regs[i].regmap[hr]>64) {
9220             if(!((regs[i].dirty>>hr)&1))
9221               f_regmap[hr]=regs[i].regmap[hr];
9222             else f_regmap[hr]=-1;
9223           }
9224           else if(regs[i].regmap[hr]>=0) {
9225             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9226               // dealloc old register
9227               int n;
9228               for(n=0;n<HOST_REGS;n++)
9229               {
9230                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9231               }
9232               // and alloc new one
9233               f_regmap[hr]=regs[i].regmap[hr];
9234             }
9235           }
9236           if(branch_regs[i].regmap[hr]>64) {
9237             if(!((branch_regs[i].dirty>>hr)&1))
9238               f_regmap[hr]=branch_regs[i].regmap[hr];
9239             else f_regmap[hr]=-1;
9240           }
9241           else if(branch_regs[i].regmap[hr]>=0) {
9242             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9243               // dealloc old register
9244               int n;
9245               for(n=0;n<HOST_REGS;n++)
9246               {
9247                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9248               }
9249               // and alloc new one
9250               f_regmap[hr]=branch_regs[i].regmap[hr];
9251             }
9252           }
9253           if(ooo[i]) {
9254             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9255               f_regmap[hr]=branch_regs[i].regmap[hr];
9256           }else{
9257             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9258               f_regmap[hr]=branch_regs[i].regmap[hr];
9259           }
9260           // Avoid dirty->clean transition
9261           #ifdef DESTRUCTIVE_WRITEBACK
9262           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9263           #endif
9264           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9265           // case above; however, it's always a good idea.  We can't hoist the
9266           // load if the register was already allocated, so there's no point
9267           // wasting time analyzing most of these cases.  It only "succeeds"
9268           // when the mapping was different and the load can be replaced with
9269           // a mov, which is of negligible benefit.  So such cases are
9270           // skipped below.
9271           if(f_regmap[hr]>0) {
9272             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9273               int r=f_regmap[hr];
9274               for(j=t;j<=i;j++)
9275               {
9276                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9277                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9278                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9279                 if(r>63) {
9280                   // NB This can exclude the case where the upper-half
9281                   // register is lower numbered than the lower-half
9282                   // register.  Not sure if it's worth fixing...
9283                   if(get_reg(regs[j].regmap,r&63)<0) break;
9284                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9285                   if(regs[j].is32&(1LL<<(r&63))) break;
9286                 }
9287                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9288                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9289                   int k;
9290                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9291                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9292                     if(r>63) {
9293                       if(get_reg(regs[i].regmap,r&63)<0) break;
9294                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9295                     }
9296                     k=i;
9297                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9298                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9299                         //printf("no free regs for store %x\n",start+(k-1)*4);
9300                         break;
9301                       }
9302                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9303                         //printf("no-match due to different register\n");
9304                         break;
9305                       }
9306                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9307                         //printf("no-match due to branch\n");
9308                         break;
9309                       }
9310                       // call/ret fast path assumes no registers allocated
9311                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9312                         break;
9313                       }
9314                       if(r>63) {
9315                         // NB This can exclude the case where the upper-half
9316                         // register is lower numbered than the lower-half
9317                         // register.  Not sure if it's worth fixing...
9318                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9319                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9320                       }
9321                       k--;
9322                     }
9323                     if(i<slen-1) {
9324                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9325                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9326                         //printf("bad match after branch\n");
9327                         break;
9328                       }
9329                     }
9330                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9331                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9332                       while(k<i) {
9333                         regs[k].regmap_entry[hr]=f_regmap[hr];
9334                         regs[k].regmap[hr]=f_regmap[hr];
9335                         regmap_pre[k+1][hr]=f_regmap[hr];
9336                         regs[k].wasdirty&=~(1<<hr);
9337                         regs[k].dirty&=~(1<<hr);
9338                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9339                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9340                         regs[k].wasconst&=~(1<<hr);
9341                         regs[k].isconst&=~(1<<hr);
9342                         k++;
9343                       }
9344                     }
9345                     else {
9346                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9347                       break;
9348                     }
9349                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9350                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9351                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9352                       regs[i].regmap_entry[hr]=f_regmap[hr];
9353                       regs[i].regmap[hr]=f_regmap[hr];
9354                       regs[i].wasdirty&=~(1<<hr);
9355                       regs[i].dirty&=~(1<<hr);
9356                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9357                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9358                       regs[i].wasconst&=~(1<<hr);
9359                       regs[i].isconst&=~(1<<hr);
9360                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9361                       branch_regs[i].wasdirty&=~(1<<hr);
9362                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9363                       branch_regs[i].regmap[hr]=f_regmap[hr];
9364                       branch_regs[i].dirty&=~(1<<hr);
9365                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9366                       branch_regs[i].wasconst&=~(1<<hr);
9367                       branch_regs[i].isconst&=~(1<<hr);
9368                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9369                         regmap_pre[i+2][hr]=f_regmap[hr];
9370                         regs[i+2].wasdirty&=~(1<<hr);
9371                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9372                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9373                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9374                       }
9375                     }
9376                   }
9377                   for(k=t;k<j;k++) {
9378                     // Alloc register clean at beginning of loop,
9379                     // but may dirty it in pass 6
9380                     regs[k].regmap_entry[hr]=f_regmap[hr];
9381                     regs[k].regmap[hr]=f_regmap[hr];
9382                     regs[k].dirty&=~(1<<hr);
9383                     regs[k].wasconst&=~(1<<hr);
9384                     regs[k].isconst&=~(1<<hr);
9385                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9386                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9387                       branch_regs[k].regmap[hr]=f_regmap[hr];
9388                       branch_regs[k].dirty&=~(1<<hr);
9389                       branch_regs[k].wasconst&=~(1<<hr);
9390                       branch_regs[k].isconst&=~(1<<hr);
9391                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9392                         regmap_pre[k+2][hr]=f_regmap[hr];
9393                         regs[k+2].wasdirty&=~(1<<hr);
9394                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9395                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9396                       }
9397                     }
9398                     else
9399                     {
9400                       regmap_pre[k+1][hr]=f_regmap[hr];
9401                       regs[k+1].wasdirty&=~(1<<hr);
9402                     }
9403                   }
9404                   if(regs[j].regmap[hr]==f_regmap[hr])
9405                     regs[j].regmap_entry[hr]=f_regmap[hr];
9406                   break;
9407                 }
9408                 if(j==i) break;
9409                 if(regs[j].regmap[hr]>=0)
9410                   break;
9411                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9412                   //printf("no-match due to different register\n");
9413                   break;
9414                 }
9415                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9416                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9417                   break;
9418                 }
9419                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9420                 {
9421                   // Stop on unconditional branch
9422                   break;
9423                 }
9424                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9425                 {
9426                   if(ooo[j]) {
9427                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9428                       break;
9429                   }else{
9430                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9431                       break;
9432                   }
9433                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9434                     //printf("no-match due to different register (branch)\n");
9435                     break;
9436                   }
9437                 }
9438                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9439                   //printf("No free regs for store %x\n",start+j*4);
9440                   break;
9441                 }
9442                 if(f_regmap[hr]>=64) {
9443                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9444                     break;
9445                   }
9446                   else
9447                   {
9448                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9449                       break;
9450                     }
9451                   }
9452                 }
9453               }
9454             }
9455           }
9456         }
9457       }
9458     }else{
9459       // Non-branch or undetermined branch target
9460       for(hr=0;hr<HOST_REGS;hr++)
9461       {
9462         if(hr!=EXCLUDE_REG) {
9463           if(regs[i].regmap[hr]>64) {
9464             if(!((regs[i].dirty>>hr)&1))
9465               f_regmap[hr]=regs[i].regmap[hr];
9466           }
9467           else if(regs[i].regmap[hr]>=0) {
9468             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9469               // dealloc old register
9470               int n;
9471               for(n=0;n<HOST_REGS;n++)
9472               {
9473                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9474               }
9475               // and alloc new one
9476               f_regmap[hr]=regs[i].regmap[hr];
9477             }
9478           }
9479         }
9480       }
9481       // Try to restore cycle count at branch targets
9482       if(bt[i]) {
9483         for(j=i;j<slen-1;j++) {
9484           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9485           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9486             //printf("no free regs for store %x\n",start+j*4);
9487             break;
9488           }
9489         }
9490         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9491           int k=i;
9492           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9493           while(k<j) {
9494             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9495             regs[k].regmap[HOST_CCREG]=CCREG;
9496             regmap_pre[k+1][HOST_CCREG]=CCREG;
9497             regs[k+1].wasdirty|=1<<HOST_CCREG;
9498             regs[k].dirty|=1<<HOST_CCREG;
9499             regs[k].wasconst&=~(1<<HOST_CCREG);
9500             regs[k].isconst&=~(1<<HOST_CCREG);
9501             k++;
9502           }
9503           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9504         }
9505         // Work backwards from the branch target
9506         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9507         {
9508           //printf("Extend backwards\n");
9509           int k;
9510           k=i;
9511           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9512             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9513               //printf("no free regs for store %x\n",start+(k-1)*4);
9514               break;
9515             }
9516             k--;
9517           }
9518           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9519             //printf("Extend CC, %x ->\n",start+k*4);
9520             while(k<=i) {
9521               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9522               regs[k].regmap[HOST_CCREG]=CCREG;
9523               regmap_pre[k+1][HOST_CCREG]=CCREG;
9524               regs[k+1].wasdirty|=1<<HOST_CCREG;
9525               regs[k].dirty|=1<<HOST_CCREG;
9526               regs[k].wasconst&=~(1<<HOST_CCREG);
9527               regs[k].isconst&=~(1<<HOST_CCREG);
9528               k++;
9529             }
9530           }
9531           else {
9532             //printf("Fail Extend CC, %x ->\n",start+k*4);
9533           }
9534         }
9535       }
9536       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9537          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9538          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9539          itype[i]!=FCONV&&itype[i]!=FCOMP)
9540       {
9541         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9542       }
9543     }
9544   }
9545
9546   // Cache memory offset or tlb map pointer if a register is available
9547   #ifndef HOST_IMM_ADDR32
9548   #ifndef RAM_OFFSET
9549   if(0)
9550   #endif
9551   {
9552     int earliest_available[HOST_REGS];
9553     int loop_start[HOST_REGS];
9554     int score[HOST_REGS];
9555     int end[HOST_REGS];
9556     int reg=ROREG;
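         // Per-host-register heuristic state:
         //   score[hr]              - how many nearby memory ops would benefit
         //   earliest_available[hr] - first index since which hr has been free
         //   loop_start[hr]         - earliest loop head the preload could be hoisted to
         //   end[hr]                - last instruction of the current candidate range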
9557
9558     // Init
9559     for(hr=0;hr<HOST_REGS;hr++) {
9560       score[hr]=0;earliest_available[hr]=0;
9561       loop_start[hr]=MAXBLOCK;
9562     }
9563     for(i=0;i<slen-1;i++)
9564     {
9565       // Can't do anything if no registers are available
9566       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9567         for(hr=0;hr<HOST_REGS;hr++) {
9568           score[hr]=0;earliest_available[hr]=i+1;
9569           loop_start[hr]=MAXBLOCK;
9570         }
9571       }
9572       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9573         if(!ooo[i]) {
9574           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9575             for(hr=0;hr<HOST_REGS;hr++) {
9576               score[hr]=0;earliest_available[hr]=i+1;
9577               loop_start[hr]=MAXBLOCK;
9578             }
9579           }
9580         }else{
9581           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9582             for(hr=0;hr<HOST_REGS;hr++) {
9583               score[hr]=0;earliest_available[hr]=i+1;
9584               loop_start[hr]=MAXBLOCK;
9585             }
9586           }
9587         }
9588       }
9589       // Mark unavailable registers
9590       for(hr=0;hr<HOST_REGS;hr++) {
9591         if(regs[i].regmap[hr]>=0) {
9592           score[hr]=0;earliest_available[hr]=i+1;
9593           loop_start[hr]=MAXBLOCK;
9594         }
9595         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9596           if(branch_regs[i].regmap[hr]>=0) {
9597             score[hr]=0;earliest_available[hr]=i+2;
9598             loop_start[hr]=MAXBLOCK;
9599           }
9600         }
9601       }
9602       // No register allocations after unconditional jumps
9603       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9604       {
9605         for(hr=0;hr<HOST_REGS;hr++) {
9606           score[hr]=0;earliest_available[hr]=i+2;
9607           loop_start[hr]=MAXBLOCK;
9608         }
9609         i++; // Skip delay slot too
9610         //printf("skip delay slot: %x\n",start+i*4);
9611       }
9612       else
9613       // Possible match
9614       if(itype[i]==LOAD||itype[i]==LOADLR||
9615          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9616         for(hr=0;hr<HOST_REGS;hr++) {
9617           if(hr!=EXCLUDE_REG) {
9618             end[hr]=i-1;
9619             for(j=i;j<slen-1;j++) {
9620               if(regs[j].regmap[hr]>=0) break;
9621               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9622                 if(branch_regs[j].regmap[hr]>=0) break;
9623                 if(ooo[j]) {
9624                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9625                 }else{
9626                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9627                 }
9628               }
9629               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9630               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9631                 int t=(ba[j]-start)>>2;
9632                 if(t<j&&t>=earliest_available[hr]) {
9633                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9634                     // Score a point for hoisting loop invariant
9635                     if(t<loop_start[hr]) loop_start[hr]=t;
9636                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9637                     score[hr]++;
9638                     end[hr]=j;
9639                   }
9640                 }
9641                 else if(t<j) {
9642                   if(regs[t].regmap[hr]==reg) {
9643                     // Score a point if the branch target matches this register
9644                     score[hr]++;
9645                     end[hr]=j;
9646                   }
9647                 }
9648                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9649                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9650                   score[hr]++;
9651                   end[hr]=j;
9652                 }
9653               }
9654               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9655               {
9656                 // Stop on unconditional branch
9657                 break;
9658               }
9659               else
9660               if(itype[j]==LOAD||itype[j]==LOADLR||
9661                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9662                 score[hr]++;
9663                 end[hr]=j;
9664               }
9665             }
9666           }
9667         }
9668         // Find highest score and allocate that register
9669         int maxscore=0;
9670         for(hr=0;hr<HOST_REGS;hr++) {
9671           if(hr!=EXCLUDE_REG) {
9672             if(score[hr]>score[maxscore]) {
9673               maxscore=hr;
9674               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9675             }
9676           }
9677         }
9678         if(score[maxscore]>1)
9679         {
9680           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9681           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9682             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9683             assert(regs[j].regmap[maxscore]<0);
9684             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9685             regs[j].regmap[maxscore]=reg;
9686             regs[j].dirty&=~(1<<maxscore);
9687             regs[j].wasconst&=~(1<<maxscore);
9688             regs[j].isconst&=~(1<<maxscore);
9689             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9690               branch_regs[j].regmap[maxscore]=reg;
9691               branch_regs[j].wasdirty&=~(1<<maxscore);
9692               branch_regs[j].dirty&=~(1<<maxscore);
9693               branch_regs[j].wasconst&=~(1<<maxscore);
9694               branch_regs[j].isconst&=~(1<<maxscore);
9695               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9696                 regmap_pre[j+2][maxscore]=reg;
9697                 regs[j+2].wasdirty&=~(1<<maxscore);
9698               }
9699               // loop optimization (loop_preload)
9700               int t=(ba[j]-start)>>2;
9701               if(t==loop_start[maxscore]) {
9702                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9703                   regs[t].regmap_entry[maxscore]=reg;
9704               }
9705             }
9706             else
9707             {
9708               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9709                 regmap_pre[j+1][maxscore]=reg;
9710                 regs[j+1].wasdirty&=~(1<<maxscore);
9711               }
9712             }
9713           }
9714           i=j-1;
9715           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9716           for(hr=0;hr<HOST_REGS;hr++) {
9717             score[hr]=0;earliest_available[hr]=i+1;
9718             loop_start[hr]=MAXBLOCK;
9719           }
9720         }
9721       }
9722     }
9723   }
9724   #endif
9725
9726   // This allocates registers (if possible) one instruction prior
9727   // to use, which can avoid a load-use penalty on certain CPUs.
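       // e.g. the source operands of instruction i+1 are also mapped at
       // instruction i, so the host-register load is emitted one slot earlier
       // and the value is already available when instruction i+1 executes.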
9728   for(i=0;i<slen-1;i++)
9729   {
9730     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9731     {
9732       if(!bt[i+1])
9733       {
9734         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9735            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9736         {
9737           if(rs1[i+1]) {
9738             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9739             {
9740               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9741               {
9742                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9743                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9744                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9745                 regs[i].isconst&=~(1<<hr);
9746                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9747                 constmap[i][hr]=constmap[i+1][hr];
9748                 regs[i+1].wasdirty&=~(1<<hr);
9749                 regs[i].dirty&=~(1<<hr);
9750               }
9751             }
9752           }
9753           if(rs2[i+1]) {
9754             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9755             {
9756               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9757               {
9758                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9759                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9760                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9761                 regs[i].isconst&=~(1<<hr);
9762                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9763                 constmap[i][hr]=constmap[i+1][hr];
9764                 regs[i+1].wasdirty&=~(1<<hr);
9765                 regs[i].dirty&=~(1<<hr);
9766               }
9767             }
9768           }
9769           // Preload target address for load instruction (non-constant)
9770           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9771             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9772             {
9773               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9774               {
9775                 regs[i].regmap[hr]=rs1[i+1];
9776                 regmap_pre[i+1][hr]=rs1[i+1];
9777                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9778                 regs[i].isconst&=~(1<<hr);
9779                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9780                 constmap[i][hr]=constmap[i+1][hr];
9781                 regs[i+1].wasdirty&=~(1<<hr);
9782                 regs[i].dirty&=~(1<<hr);
9783               }
9784             }
9785           }
9786           // Load source into target register
9787           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9788             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9789             {
9790               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9791               {
9792                 regs[i].regmap[hr]=rs1[i+1];
9793                 regmap_pre[i+1][hr]=rs1[i+1];
9794                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9795                 regs[i].isconst&=~(1<<hr);
9796                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9797                 constmap[i][hr]=constmap[i+1][hr];
9798                 regs[i+1].wasdirty&=~(1<<hr);
9799                 regs[i].dirty&=~(1<<hr);
9800               }
9801             }
9802           }
9803           // Address for store instruction (non-constant)
9804           if(itype[i+1]==STORE||itype[i+1]==STORELR
9805              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9806             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9807               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9808               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9809               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9810               assert(hr>=0);
9811               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9812               {
9813                 regs[i].regmap[hr]=rs1[i+1];
9814                 regmap_pre[i+1][hr]=rs1[i+1];
9815                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9816                 regs[i].isconst&=~(1<<hr);
9817                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9818                 constmap[i][hr]=constmap[i+1][hr];
9819                 regs[i+1].wasdirty&=~(1<<hr);
9820                 regs[i].dirty&=~(1<<hr);
9821               }
9822             }
9823           }
9824           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9825             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9826               int nr;
9827               hr=get_reg(regs[i+1].regmap,FTEMP);
9828               assert(hr>=0);
9829               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9830               {
9831                 regs[i].regmap[hr]=rs1[i+1];
9832                 regmap_pre[i+1][hr]=rs1[i+1];
9833                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9834                 regs[i].isconst&=~(1<<hr);
9835                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9836                 constmap[i][hr]=constmap[i+1][hr];
9837                 regs[i+1].wasdirty&=~(1<<hr);
9838                 regs[i].dirty&=~(1<<hr);
9839               }
9840               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9841               {
9842                 // move it to another register
9843                 regs[i+1].regmap[hr]=-1;
9844                 regmap_pre[i+2][hr]=-1;
9845                 regs[i+1].regmap[nr]=FTEMP;
9846                 regmap_pre[i+2][nr]=FTEMP;
9847                 regs[i].regmap[nr]=rs1[i+1];
9848                 regmap_pre[i+1][nr]=rs1[i+1];
9849                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9850                 regs[i].isconst&=~(1<<nr);
9851                 regs[i+1].isconst&=~(1<<nr);
9852                 regs[i].dirty&=~(1<<nr);
9853                 regs[i+1].wasdirty&=~(1<<nr);
9854                 regs[i+1].dirty&=~(1<<nr);
9855                 regs[i+2].wasdirty&=~(1<<nr);
9856               }
9857             }
9858           }
9859           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9860             if(itype[i+1]==LOAD)
9861               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9862             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9863               hr=get_reg(regs[i+1].regmap,FTEMP);
9864             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9865               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9866               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9867             }
9868             if(hr>=0&&regs[i].regmap[hr]<0) {
9869               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9870               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9871                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9872                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9873                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9874                 regs[i].isconst&=~(1<<hr);
9875                 regs[i+1].wasdirty&=~(1<<hr);
9876                 regs[i].dirty&=~(1<<hr);
9877               }
9878             }
9879           }
9880         }
9881       }
9882     }
9883   }
9884
9885   /* Pass 6 - Optimize clean/dirty state */
9886   clean_registers(0,slen-1,1);
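       // clean_registers() propagates the dirty (needs write-back to the
       // guest register file) state across the block so that redundant
       // write-backs can be skipped.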
9887
9888   /* Pass 7 - Identify 32-bit registers */
9889   for (i=slen-1;i>=0;i--)
9890   {
9891     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9892     {
9893       // Conditional branch
9894       if((source[i]>>16)!=0x1000&&i<slen-2) {
9895         // Mark this address as a branch target since it may be called
9896         // upon return from interrupt
9897         bt[i+2]=1;
9898       }
9899     }
9900   }
9901
9902   if(itype[slen-1]==SPAN) {
9903     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9904   }
9905
9906 #ifdef DISASM
9907   /* Debug/disassembly */
9908   for(i=0;i<slen;i++)
9909   {
9910     printf("U:");
9911     int r;
9912     for(r=1;r<=CCREG;r++) {
9913       if((unneeded_reg[i]>>r)&1) {
9914         if(r==HIREG) printf(" HI");
9915         else if(r==LOREG) printf(" LO");
9916         else printf(" r%d",r);
9917       }
9918     }
9919     printf("\n");
9920     #if defined(__i386__) || defined(__x86_64__)
9921     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9922     #endif
9923     #ifdef __arm__
9924     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9925     #endif
9926     printf("needs: ");
9927     if(needed_reg[i]&1) printf("eax ");
9928     if((needed_reg[i]>>1)&1) printf("ecx ");
9929     if((needed_reg[i]>>2)&1) printf("edx ");
9930     if((needed_reg[i]>>3)&1) printf("ebx ");
9931     if((needed_reg[i]>>5)&1) printf("ebp ");
9932     if((needed_reg[i]>>6)&1) printf("esi ");
9933     if((needed_reg[i]>>7)&1) printf("edi ");
9934     printf("\n");
9935     #if defined(__i386__) || defined(__x86_64__)
9936     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9937     printf("dirty: ");
9938     if(regs[i].wasdirty&1) printf("eax ");
9939     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9940     if((regs[i].wasdirty>>2)&1) printf("edx ");
9941     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9942     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9943     if((regs[i].wasdirty>>6)&1) printf("esi ");
9944     if((regs[i].wasdirty>>7)&1) printf("edi ");
9945     #endif
9946     #ifdef __arm__
9947     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9948     printf("dirty: ");
9949     if(regs[i].wasdirty&1) printf("r0 ");
9950     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9951     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9952     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9953     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9954     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9955     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9956     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9957     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9958     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9959     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9960     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9961     #endif
9962     printf("\n");
9963     disassemble_inst(i);
9964     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9965     #if defined(__i386__) || defined(__x86_64__)
9966     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9967     if(regs[i].dirty&1) printf("eax ");
9968     if((regs[i].dirty>>1)&1) printf("ecx ");
9969     if((regs[i].dirty>>2)&1) printf("edx ");
9970     if((regs[i].dirty>>3)&1) printf("ebx ");
9971     if((regs[i].dirty>>5)&1) printf("ebp ");
9972     if((regs[i].dirty>>6)&1) printf("esi ");
9973     if((regs[i].dirty>>7)&1) printf("edi ");
9974     #endif
9975     #ifdef __arm__
9976     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9977     if(regs[i].dirty&1) printf("r0 ");
9978     if((regs[i].dirty>>1)&1) printf("r1 ");
9979     if((regs[i].dirty>>2)&1) printf("r2 ");
9980     if((regs[i].dirty>>3)&1) printf("r3 ");
9981     if((regs[i].dirty>>4)&1) printf("r4 ");
9982     if((regs[i].dirty>>5)&1) printf("r5 ");
9983     if((regs[i].dirty>>6)&1) printf("r6 ");
9984     if((regs[i].dirty>>7)&1) printf("r7 ");
9985     if((regs[i].dirty>>8)&1) printf("r8 ");
9986     if((regs[i].dirty>>9)&1) printf("r9 ");
9987     if((regs[i].dirty>>10)&1) printf("r10 ");
9988     if((regs[i].dirty>>12)&1) printf("r12 ");
9989     #endif
9990     printf("\n");
9991     if(regs[i].isconst) {
9992       printf("constants: ");
9993       #if defined(__i386__) || defined(__x86_64__)
9994       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9995       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9996       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9997       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9998       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9999       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10000       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10001       #endif
10002       #ifdef __arm__
10003       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10004       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10005       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10006       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10007       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10008       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10009       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10010       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10011       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10012       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10013       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10014       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10015       #endif
10016       printf("\n");
10017     }
10018     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10019       #if defined(__i386__) || defined(__x86_64__)
10020       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10021       if(branch_regs[i].dirty&1) printf("eax ");
10022       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10023       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10024       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10025       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10026       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10027       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10028       #endif
10029       #ifdef __arm__
10030       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10031       if(branch_regs[i].dirty&1) printf("r0 ");
10032       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10033       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10034       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10035       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10036       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10037       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10038       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10039       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10040       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10041       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10042       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10043       #endif
10044     }
10045   }
10046 #endif // DISASM
10047
10048   /* Pass 8 - Assembly */
10049   linkcount=0;stubcount=0;
10050   ds=0;is_delayslot=0;
10051   cop1_usable=0;
10052   uint64_t is32_pre=0;
10053   u_int dirty_pre=0;
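        // is32_pre/dirty_pre carry the previous instruction's 32-bit and dirty
        // state into the wb_valid() call below (non-DESTRUCTIVE_WRITEBACK only).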
10054   void *beginning=start_block();
10055   if((u_int)addr&1) {
10056     ds=1;
10057     pagespan_ds();
10058   }
10059   void *instr_addr0_override = NULL;
10060
10061   if (start == 0x80030000) {
10062     // nasty hack for fastbios thing
10063     // override block entry to this code
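          // first time through, the saved 'address' won't match this PC, so the
          // code below records the PC and exits via new_dyna_leave; on re-entry
          // the stored value matches and execution falls through into the block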
10064     instr_addr0_override = out;
10065     emit_movimm(start,0);
10066     // abuse io address var as a flag that we
10067     // have already returned here once
10068     emit_readword((int)&address,1);
10069     emit_writeword(0,(int)&pcaddr);
10070     emit_writeword(0,(int)&address);
10071     emit_cmp(0,1);
10072     emit_jne((int)new_dyna_leave);
10073   }
10074   for(i=0;i<slen;i++)
10075   {
10076     //if(ds) printf("ds: ");
10077     disassemble_inst(i);
10078     if(ds) {
10079       ds=0; // Skip delay slot
10080       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10081       instr_addr[i] = NULL;
10082     } else {
10083       speculate_register_values(i);
10084       #ifndef DESTRUCTIVE_WRITEBACK
10085       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10086       {
10087         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10088               unneeded_reg[i],unneeded_reg_upper[i]);
10089       }
10090       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10091         is32_pre=branch_regs[i].is32;
10092         dirty_pre=branch_regs[i].dirty;
10093       }else{
10094         is32_pre=regs[i].is32;
10095         dirty_pre=regs[i].dirty;
10096       }
10097       #endif
10098       // write back
10099       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10100       {
10101         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10102                       unneeded_reg[i],unneeded_reg_upper[i]);
10103         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10104       }
10105       // branch target entry point
10106       instr_addr[i] = out;
10107       assem_debug("<->\n");
10108       drc_dbg_emit_do_cmp(i);
10109
10110       // load regs
10111       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10112         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10113       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10114       address_generation(i,&regs[i],regs[i].regmap_entry);
10115       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10116       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10117       {
10118         // Load the delay slot registers if necessary
10119         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10120           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10121         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10122           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10123         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10124           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10125       }
10126       else if(i+1<slen)
10127       {
10128         // Preload registers for following instruction
10129         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10130           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10131             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10132         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10133           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10134             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10135       }
10136       // TODO: if(is_ooo(i)) address_generation(i+1);
10137       if(itype[i]==CJUMP||itype[i]==FJUMP)
10138         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10139       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10140         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10141       if(bt[i]) cop1_usable=0;
10142       // assemble
10143       switch(itype[i]) {
10144         case ALU:
10145           alu_assemble(i,&regs[i]);break;
10146         case IMM16:
10147           imm16_assemble(i,&regs[i]);break;
10148         case SHIFT:
10149           shift_assemble(i,&regs[i]);break;
10150         case SHIFTIMM:
10151           shiftimm_assemble(i,&regs[i]);break;
10152         case LOAD:
10153           load_assemble(i,&regs[i]);break;
10154         case LOADLR:
10155           loadlr_assemble(i,&regs[i]);break;
10156         case STORE:
10157           store_assemble(i,&regs[i]);break;
10158         case STORELR:
10159           storelr_assemble(i,&regs[i]);break;
10160         case COP0:
10161           cop0_assemble(i,&regs[i]);break;
10162         case COP1:
10163           cop1_assemble(i,&regs[i]);break;
10164         case C1LS:
10165           c1ls_assemble(i,&regs[i]);break;
10166         case COP2:
10167           cop2_assemble(i,&regs[i]);break;
10168         case C2LS:
10169           c2ls_assemble(i,&regs[i]);break;
10170         case C2OP:
10171           c2op_assemble(i,&regs[i]);break;
10172         case FCONV:
10173           fconv_assemble(i,&regs[i]);break;
10174         case FLOAT:
10175           float_assemble(i,&regs[i]);break;
10176         case FCOMP:
10177           fcomp_assemble(i,&regs[i]);break;
10178         case MULTDIV:
10179           multdiv_assemble(i,&regs[i]);break;
10180         case MOV:
10181           mov_assemble(i,&regs[i]);break;
10182         case SYSCALL:
10183           syscall_assemble(i,&regs[i]);break;
10184         case HLECALL:
10185           hlecall_assemble(i,&regs[i]);break;
10186         case INTCALL:
10187           intcall_assemble(i,&regs[i]);break;
10188         case UJUMP:
10189           ujump_assemble(i,&regs[i]);ds=1;break;
10190         case RJUMP:
10191           rjump_assemble(i,&regs[i]);ds=1;break;
10192         case CJUMP:
10193           cjump_assemble(i,&regs[i]);ds=1;break;
10194         case SJUMP:
10195           sjump_assemble(i,&regs[i]);ds=1;break;
10196         case FJUMP:
10197           fjump_assemble(i,&regs[i]);ds=1;break;
10198         case SPAN:
10199           pagespan_assemble(i,&regs[i]);break;
10200       }
10201       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10202         literal_pool(1024);
10203       else
10204         literal_pool_jumpover(256);
10205     }
10206   }
10207   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10208   // If the block did not end with an unconditional branch,
10209   // add a jump to the next instruction.
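        // Three cases below: plain fall-through (flush regs[i-1] and charge the
        // cycles), the not-taken path of a conditional branch two slots back
        // (branch_regs[i-2]), or a "likely" branch whose delay slot was skipped
        // (regs[i-2]).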
10210   if(i>1) {
10211     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10212       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10213       assert(i==slen);
10214       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10215         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10216         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10217           emit_loadreg(CCREG,HOST_CCREG);
10218         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10219       }
10220       else if(!likely[i-2])
10221       {
10222         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10223         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10224       }
10225       else
10226       {
10227         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10228         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10229       }
10230       add_to_linker((int)out,start+i*4,0);
10231       emit_jmp(0);
10232     }
10233   }
10234   else
10235   {
10236     assert(i>0);
10237     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10238     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10239     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10240       emit_loadreg(CCREG,HOST_CCREG);
10241     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10242     add_to_linker((int)out,start+i*4,0);
10243     emit_jmp(0);
10244   }
10245
10246   // TODO: delay slot stubs?
10247   // Stubs
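        // Out-of-line handlers emitted after the main block code: slow memory
        // access paths, cycle-count checks, code-invalidation checks, unaligned
        // store handlers and FPU-unusable traps.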
10248   for(i=0;i<stubcount;i++)
10249   {
10250     switch(stubs[i].type)
10251     {
10252       case LOADB_STUB:
10253       case LOADH_STUB:
10254       case LOADW_STUB:
10255       case LOADD_STUB:
10256       case LOADBU_STUB:
10257       case LOADHU_STUB:
10258         do_readstub(i);break;
10259       case STOREB_STUB:
10260       case STOREH_STUB:
10261       case STOREW_STUB:
10262       case STORED_STUB:
10263         do_writestub(i);break;
10264       case CC_STUB:
10265         do_ccstub(i);break;
10266       case INVCODE_STUB:
10267         do_invstub(i);break;
10268       case FP_STUB:
10269         do_cop1stub(i);break;
10270       case STORELR_STUB:
10271         do_unalignedwritestub(i);break;
10272     }
10273   }
10274
10275   if (instr_addr0_override)
10276     instr_addr[0] = instr_addr0_override;
10277
10278   /* Pass 9 - Linker */
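        // link_addr[i][0] = branch location in the output code, [1] = target
        // guest PC, [2] = nonzero when the target lies inside this block.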
10279   for(i=0;i<linkcount;i++)
10280   {
10281     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10282     literal_pool(64);
10283     if(!link_addr[i][2])
10284     {
10285       void *stub=out;
10286       void *addr=check_addr(link_addr[i][1]);
10287       emit_extjump(link_addr[i][0],link_addr[i][1]);
10288       if(addr) {
10289         set_jump_target(link_addr[i][0], addr);
10290         add_link(link_addr[i][1],stub);
10291       }
10292       else set_jump_target(link_addr[i][0], stub);
10293     }
10294     else
10295     {
10296       // Internal branch
10297       int target=(link_addr[i][1]-start)>>2;
10298       assert(target>=0&&target<slen);
10299       assert(instr_addr[target]);
10300       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10301       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10302       //#else
10303       set_jump_target(link_addr[i][0],instr_addr[target]);
10304       //#endif
10305     }
10306   }
10307   // External Branch Targets (jump_in)
10308   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10309   for(i=0;i<slen;i++)
10310   {
10311     if(bt[i]||i==0)
10312     {
10313       if(instr_addr[i]) // TODO - delay slots (=null)
10314       {
10315         u_int vaddr=start+i*4;
10316         u_int page=get_page(vaddr);
10317         u_int vpage=get_vpage(vaddr);
10318         literal_pool(256);
10319         {
10320           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10321           assem_debug("jump_in: %x\n",start+i*4);
10322           ll_add(jump_dirty+vpage,vaddr,out);
10323           void *entry_point = do_dirty_stub(i);
10324           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
10325           // If there was an existing entry in the hash table,
10326           // replace it with the new address.
10327           // Don't add new entries.  We'll insert the
10328           // ones that actually get used in check_addr().
10329           struct ht_entry *ht_bin = hash_table_get(vaddr);
10330           if (ht_bin->vaddr[0] == vaddr)
10331             ht_bin->tcaddr[0] = entry_point;
10332           if (ht_bin->vaddr[1] == vaddr)
10333             ht_bin->tcaddr[1] = entry_point;
10334         }
10335       }
10336     }
10337   }
10338   // Write out the literal pool if necessary
10339   literal_pool(0);
10340   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10341   // Align code
10342   if(((u_int)out)&7) emit_addnop(13);
10343   #endif
10344   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10345   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10346   memcpy(copy,source,slen*4);
10347   copy+=slen*4;
10348
10349   end_block(beginning);
10350
10351   // If we're within 256K of the end of the buffer,
10352   // start over from the beginning. (Is 256K enough?)
10353   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10354
10355   // Trap writes to any of the pages we compiled
10356   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10357     invalid_code[i]=0;
10358   }
10359   inv_code_start=inv_code_end=~0;
10360
10361   // for PCSX we need to mark all mirrors too
10362   if(get_page(start)<(RAM_SIZE>>12))
10363     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10364       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10365       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10366       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10367
10368   /* Pass 10 - Free memory by expiring oldest blocks */
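        // expirep is a 16-bit cursor that sweeps the translation cache a little
        // at a time: the low 11 bits select a page and bits 11-12 select which
        // structure to clear (jump_in/jump_dirty, jump_out pointers, the hash
        // table, jump_out lists).  It is advanced until it stays a fixed
        // distance ahead of the current output position.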
10369
10370   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10371   while(expirep!=end)
10372   {
10373     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10374     uintptr_t base=(uintptr_t)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10375     inv_debug("EXP: Phase %d\n",expirep);
10376     switch((expirep>>11)&3)
10377     {
10378       case 0:
10379         // Clear jump_in and jump_dirty
10380         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10381         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10382         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10383         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10384         break;
10385       case 1:
10386         // Clear pointers
10387         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10388         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10389         break;
10390       case 2:
10391         // Clear hash table
10392         for(i=0;i<32;i++) {
10393           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
10394           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
10395              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10396             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
10397             ht_bin->vaddr[1] = -1;
10398             ht_bin->tcaddr[1] = NULL;
10399           }
10400           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
10401              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10402             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
10403             ht_bin->vaddr[0] = ht_bin->vaddr[1];
10404             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
10405             ht_bin->vaddr[1] = -1;
10406             ht_bin->tcaddr[1] = NULL;
10407           }
10408         }
10409         break;
10410       case 3:
10411         // Clear jump_out
10412         #ifdef __arm__
10413         if((expirep&2047)==0)
10414           do_clear_cache();
10415         #endif
10416         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10417         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10418         break;
10419     }
10420     expirep=(expirep+1)&65535;
10421   }
10422   return 0;
10423 }
10424
10425 // vim:shiftwidth=2:expandtab