drc: rework for 64bit, part 1
[pcsx_rearmed.git] libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> // for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "assem_arm.h"
55 #endif
56
57 #define MAXBLOCK 4096
58 #define MAX_OUTPUT_BLOCK_SIZE 262144
59
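/* Register allocation state for one guest instruction (as used throughout
   this file):
   - regmap_entry/regmap: guest register held by each host register on entry
     to / after the instruction (-1 = free)
   - was32/is32: guest regs whose 64-bit value is a sign-extended 32-bit value
     before / after the instruction
   - wasdirty/dirty: host regs holding values not yet written back to the
     guest register file
   - u/uu: guest regs whose lower/upper 32 bits are not needed afterwards and
     so need not be preserved
   - wasconst/isconst: host regs currently holding known-constant values */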
60 struct regstat
61 {
62   signed char regmap_entry[HOST_REGS];
63   signed char regmap[HOST_REGS];
64   uint64_t was32;
65   uint64_t is32;
66   uint64_t wasdirty;
67   uint64_t dirty;
68   uint64_t u;
69   uint64_t uu;
70   u_int wasconst;
71   u_int isconst;
72   u_int loadedconst;             // host regs that have constants loaded
73   u_int waswritten;              // MIPS regs that were used as store base before
74 };
75
76 // note: asm depends on this layout
77 struct ll_entry
78 {
79   u_int vaddr;
80   u_int reg_sv_flags;
81   void *addr;
82   struct ll_entry *next;
83 };
84
85 struct ht_entry
86 {
87   u_int vaddr[2];
88   void *tcaddr[2];
89 };
90
91   // used by asm:
92   u_char *out;
93   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
94   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
95   struct ll_entry *jump_dirty[4096];
96
97   static struct ll_entry *jump_out[4096];
98   static u_int start;
99   static u_int *source;
100   static char insn[MAXBLOCK][10];
101   static u_char itype[MAXBLOCK];
102   static u_char opcode[MAXBLOCK];
103   static u_char opcode2[MAXBLOCK];
104   static u_char bt[MAXBLOCK];
105   static u_char rs1[MAXBLOCK];
106   static u_char rs2[MAXBLOCK];
107   static u_char rt1[MAXBLOCK];
108   static u_char rt2[MAXBLOCK];
109   static u_char us1[MAXBLOCK];
110   static u_char us2[MAXBLOCK];
111   static u_char dep1[MAXBLOCK];
112   static u_char dep2[MAXBLOCK];
113   static u_char lt1[MAXBLOCK];
114   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
115   static uint64_t gte_rt[MAXBLOCK];
116   static uint64_t gte_unneeded[MAXBLOCK];
117   static u_int smrv[32]; // speculated MIPS register values
118   static u_int smrv_strong; // mask of regs that are likely to have correct values
119   static u_int smrv_weak; // same, but somewhat less likely
120   static u_int smrv_strong_next; // same, but after current insn executes
121   static u_int smrv_weak_next;
122   static int imm[MAXBLOCK];
123   static u_int ba[MAXBLOCK];
124   static char likely[MAXBLOCK];
125   static char is_ds[MAXBLOCK];
126   static char ooo[MAXBLOCK];
127   static uint64_t unneeded_reg[MAXBLOCK];
128   static uint64_t unneeded_reg_upper[MAXBLOCK];
129   static uint64_t branch_unneeded_reg[MAXBLOCK];
130   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
131   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
132   static uint64_t current_constmap[HOST_REGS];
133   static uint64_t constmap[MAXBLOCK][HOST_REGS];
134   static struct regstat regs[MAXBLOCK];
135   static struct regstat branch_regs[MAXBLOCK];
136   static signed char minimum_free_regs[MAXBLOCK];
137   static u_int needed_reg[MAXBLOCK];
138   static u_int wont_dirty[MAXBLOCK];
139   static u_int will_dirty[MAXBLOCK];
140   static int ccadj[MAXBLOCK];
141   static int slen;
142   static void *instr_addr[MAXBLOCK];
143   static u_int link_addr[MAXBLOCK][3];
144   static int linkcount;
145   static u_int stubs[MAXBLOCK*3][8];
146   static int stubcount;
147   static u_int literals[1024][2];
148   static int literalcount;
149   static int is_delayslot;
150   static int cop1_usable;
151   static char shadow[1048576]  __attribute__((aligned(16)));
152   static void *copy;
153   static int expirep;
154   static u_int stop_after_jal;
155 #ifndef RAM_FIXED
156   static u_int ram_offset;
157 #else
158   static const u_int ram_offset=0;
159 #endif
160
161   int new_dynarec_hacks;
162   int new_dynarec_did_compile;
163   extern u_char restore_candidate[512];
164   extern int cycle_count;
165
166   /* registers that may be allocated */
167   /* 1-31 gpr */
168 #define HIREG 32 // hi
169 #define LOREG 33 // lo
170 #define FSREG 34 // FPU status (FCSR)
171 #define CSREG 35 // Coprocessor status
172 #define CCREG 36 // Cycle count
173 #define INVCP 37 // Pointer to invalid_code
174 //#define MMREG 38 // Pointer to memory_map
175 #define ROREG 39 // ram offset (if rdram!=0x80000000)
176 #define TEMPREG 40
177 #define FTEMP 40 // FPU temporary register
178 #define PTEMP 41 // Prefetch temporary register
179 //#define TLREG 42 // TLB mapping offset
180 #define RHASH 43 // Return address hash
181 #define RHTBL 44 // Return address hash table address
182 #define RTEMP 45 // JR/JALR address register
183 #define MAXREG 45
184 #define AGEN1 46 // Address generation temporary register
185 //#define AGEN2 47 // Address generation temporary register
186 //#define MGEN1 48 // Maptable address generation temporary register
187 //#define MGEN2 49 // Maptable address generation temporary register
188 #define BTREG 50 // Branch target temporary register
189
190   /* instruction types */
191 #define NOP 0     // No operation
192 #define LOAD 1    // Load
193 #define STORE 2   // Store
194 #define LOADLR 3  // Unaligned load
195 #define STORELR 4 // Unaligned store
196 #define MOV 5     // Move
197 #define ALU 6     // Arithmetic/logic
198 #define MULTDIV 7 // Multiply/divide
199 #define SHIFT 8   // Shift by register
200 #define SHIFTIMM 9 // Shift by immediate
201 #define IMM16 10  // 16-bit immediate
202 #define RJUMP 11  // Unconditional jump to register
203 #define UJUMP 12  // Unconditional jump
204 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
205 #define SJUMP 14  // Conditional branch (regimm format)
206 #define COP0 15   // Coprocessor 0
207 #define COP1 16   // Coprocessor 1
208 #define C1LS 17   // Coprocessor 1 load/store
209 #define FJUMP 18  // Conditional branch (floating point)
210 #define FLOAT 19  // Floating point unit
211 #define FCONV 20  // Convert integer to float
212 #define FCOMP 21  // Floating point compare (sets FSREG)
213 #define SYSCALL 22 // SYSCALL
214 #define OTHER 23  // Other
215 #define SPAN 24   // Branch/delay slot spans 2 pages
216 #define NI 25     // Not implemented
217 #define HLECALL 26 // PCSX fake opcodes for HLE
218 #define COP2 27   // Coprocessor 2 move
219 #define C2LS 28   // Coprocessor 2 load/store
220 #define C2OP 29   // Coprocessor 2 operation
221 #define INTCALL 30 // Call interpreter to handle rare corner cases
222
223   /* stubs */
224 #define CC_STUB 1
225 #define FP_STUB 2
226 #define LOADB_STUB 3
227 #define LOADH_STUB 4
228 #define LOADW_STUB 5
229 #define LOADD_STUB 6
230 #define LOADBU_STUB 7
231 #define LOADHU_STUB 8
232 #define STOREB_STUB 9
233 #define STOREH_STUB 10
234 #define STOREW_STUB 11
235 #define STORED_STUB 12
236 #define STORELR_STUB 13
237 #define INVCODE_STUB 14
238
239   /* branch codes */
240 #define TAKEN 1
241 #define NOTTAKEN 2
242 #define NULLDS 3
243
244 // asm linkage
245 int new_recompile_block(int addr);
246 void *get_addr_ht(u_int vaddr);
247 void invalidate_block(u_int block);
248 void invalidate_addr(u_int addr);
249 void remove_hash(int vaddr);
250 void dyna_linker();
251 void dyna_linker_ds();
252 void verify_code();
253 void verify_code_vm();
254 void verify_code_ds();
255 void cc_interrupt();
256 void fp_exception();
257 void fp_exception_ds();
258 void jump_syscall_hle();
259 void jump_hlecall();
260 void jump_intcall();
261 void new_dyna_leave();
262
263 // Needed by assembler
264 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
265 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
266 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
267 static void load_all_regs(signed char i_regmap[]);
268 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
269 static void load_regs_entry(int t);
270 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
271
272 static int verify_dirty(u_int *ptr);
273 static int get_final_value(int hr, int i, int *value);
274 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
275 static void add_to_linker(int addr,int target,int ext);
276
277 static int tracedebug=0;
278
279 static void mprotect_w_x(void *start, void *end, int is_x)
280 {
281 #ifdef NO_WRITE_EXEC
282   #if defined(VITA)
283   // *Open* enables write on all memory that was
284   // allocated by sceKernelAllocMemBlockForVM()?
285   if (is_x)
286     sceKernelCloseVMDomain();
287   else
288     sceKernelOpenVMDomain();
289   #else
290   u_long mstart = (u_long)start & ~4095ul;
291   u_long mend = (u_long)end;
292   if (mprotect((void *)mstart, mend - mstart,
293                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
294     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
295   #endif
296 #endif
297 }
298
299 static void start_tcache_write(void *start, void *end)
300 {
301   mprotect_w_x(start, end, 0);
302 }
303
304 static void end_tcache_write(void *start, void *end)
305 {
306 #ifdef __arm__
307   size_t len = (char *)end - (char *)start;
308   #if   defined(__BLACKBERRY_QNX__)
309   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
310   #elif defined(__MACH__)
311   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
312   #elif defined(VITA)
313   sceKernelSyncVMDomain(sceBlock, start, len);
314   #elif defined(_3DS)
315   ctr_flush_invalidate_cache();
316   #else
317   __clear_cache(start, end);
318   #endif
319   (void)len;
320 #endif
321
322   mprotect_w_x(start, end, 1);
323 }
324
325 static void *start_block(void)
326 {
327   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
328   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
329     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
330   start_tcache_write(out, end);
331   return out;
332 }
333
334 static void end_block(void *start)
335 {
336   end_tcache_write(start, out);
337 }
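#if 0
/* Illustrative sketch only (not referenced anywhere): every block of code
   emitted into the translation cache is bracketed by start_block()/end_block(),
   so that on NO_WRITE_EXEC hosts the output region is writable while emitting
   and executable (with caches flushed) afterwards. */
static void emit_block_sketch(void)
{
  void *block_start = start_block();   // make the output region writable
  // ... emit instructions here, advancing the global 'out' pointer ...
  end_block(block_start);              // flush icache, restore execute permission
}
#endif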
338
339 //#define DEBUG_CYCLE_COUNT 1
340
341 #define NO_CYCLE_PENALTY_THR 12
342
343 int cycle_multiplier; // 100 for 1.0
344
345 static int CLOCK_ADJUST(int x)
346 {
347   int s=(x>>31)|1;
348   return (x * cycle_multiplier + s * 50) / 100;
349 }
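/* cycle_multiplier is in percent (100 == 1.0x).  The s term rounds half away
   from zero instead of truncating toward zero: with cycle_multiplier==150,
   CLOCK_ADJUST(3) = (3*150+50)/100 = 5 and CLOCK_ADJUST(-3) = (-3*150-50)/100 = -5. */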
350
351 static u_int get_page(u_int vaddr)
352 {
353   u_int page=vaddr&~0xe0000000;
354   if (page < 0x1000000)
355     page &= ~0x0e00000; // RAM mirrors
356   page>>=12;
357   if(page>2048) page=2048+(page&2047);
358   return page;
359 }
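/* Examples: 0x80031234 and its RAM mirror 0x80231234 both map to page 0x31,
   because bits 21-23 are cleared for addresses below 0x1000000; addresses
   outside RAM (e.g. the BIOS at 0xbfc00000) are folded into pages 2048-4095. */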
360
361 // no virtual mem in PCSX
362 static u_int get_vpage(u_int vaddr)
363 {
364   return get_page(vaddr);
365 }
366
367 static struct ht_entry *hash_table_get(u_int vaddr)
368 {
369   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
370 }
371
372 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
373 {
374   ht_bin->vaddr[1] = ht_bin->vaddr[0];
375   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
376   ht_bin->vaddr[0] = vaddr;
377   ht_bin->tcaddr[0] = tcaddr;
378 }
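/* The hash table is a 64K-bin, two-way structure: hash_table_get() picks a bin
   from a 16-bit fold of the virtual address, and hash_table_add() puts the new
   translation in slot 0, demoting the previous slot 0 entry to slot 1 (the old
   slot 1 entry is dropped). */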
379
380 // some messy code from Ari64; it seems to rely on unsigned 32-bit overflow
381 static int doesnt_expire_soon(void *tcaddr)
382 {
383   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
384   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
385 }
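/* The shifted difference between tcaddr and the current output pointer is
   compared, with deliberate unsigned wrap-around, against roughly 3/8 of the
   translation cache plus one maximum output block; entries closer than that
   to the advancing output/expiry pointer are treated as about to expire. */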
386
387 // Get address from virtual address
388 // This is called from the recompiled JR/JALR instructions
389 void *get_addr(u_int vaddr)
390 {
391   u_int page=get_page(vaddr);
392   u_int vpage=get_vpage(vaddr);
393   struct ll_entry *head;
394   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
395   head=jump_in[page];
396   while(head!=NULL) {
397     if(head->vaddr==vaddr) {
398   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
399       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
400       return head->addr;
401     }
402     head=head->next;
403   }
404   head=jump_dirty[vpage];
405   while(head!=NULL) {
406     if(head->vaddr==vaddr) {
407       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
408       // Don't restore blocks which are about to expire from the cache
409       if (doesnt_expire_soon(head->addr))
410       if (verify_dirty(head->addr)) {
411         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
412         invalid_code[vaddr>>12]=0;
413         inv_code_start=inv_code_end=~0;
414         if(vpage<2048) {
415           restore_candidate[vpage>>3]|=1<<(vpage&7);
416         }
417         else restore_candidate[page>>3]|=1<<(page&7);
418         struct ht_entry *ht_bin = hash_table_get(vaddr);
419         if (ht_bin->vaddr[0] == vaddr)
420           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
421         else
422           hash_table_add(ht_bin, vaddr, head->addr);
423
424         return head->addr;
425       }
426     }
427     head=head->next;
428   }
429   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
430   int r=new_recompile_block(vaddr);
431   if(r==0) return get_addr(vaddr);
432   // Execute in unmapped page, generate pagefault exception
433   Status|=2;
434   Cause=(vaddr<<31)|0x8;
435   EPC=(vaddr&1)?vaddr-5:vaddr;
436   BadVAddr=(vaddr&~1);
437   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
438   EntryHi=BadVAddr&0xFFFFE000;
439   return get_addr_ht(0x80000000);
440 }
441 // Look up address in hash table first
442 void *get_addr_ht(u_int vaddr)
443 {
444   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
445   const struct ht_entry *ht_bin = hash_table_get(vaddr);
446   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
447   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
448   return get_addr(vaddr);
449 }
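/* Block lookup order: get_addr_ht() first checks the two-entry hash bin, then
   falls back to get_addr(), which walks jump_in[page] for a clean block, then
   jump_dirty[vpage] (re-validating with verify_dirty() before reuse), and
   finally recompiles the block if nothing usable is found. */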
450
451 void clear_all_regs(signed char regmap[])
452 {
453   int hr;
454   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
455 }
456
457 signed char get_reg(signed char regmap[],int r)
458 {
459   int hr;
460   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
461   return -1;
462 }
463
464 // Find a register that is available for two consecutive cycles
465 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
466 {
467   int hr;
468   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
469   return -1;
470 }
471
472 int count_free_regs(signed char regmap[])
473 {
474   int count=0;
475   int hr;
476   for(hr=0;hr<HOST_REGS;hr++)
477   {
478     if(hr!=EXCLUDE_REG) {
479       if(regmap[hr]<0) count++;
480     }
481   }
482   return count;
483 }
484
485 void dirty_reg(struct regstat *cur,signed char reg)
486 {
487   int hr;
488   if(!reg) return;
489   for (hr=0;hr<HOST_REGS;hr++) {
490     if((cur->regmap[hr]&63)==reg) {
491       cur->dirty|=1<<hr;
492     }
493   }
494 }
495
496 // If we dirty the lower half of a 64 bit register which is now being
497 // sign-extended, we need to dump the upper half.
498 // Note: Do this only after completion of the instruction, because
499 // some instructions may need to read the full 64-bit value even if
500 // overwriting it (eg SLTI, DSRA32).
501 static void flush_dirty_uppers(struct regstat *cur)
502 {
503   int hr,reg;
504   for (hr=0;hr<HOST_REGS;hr++) {
505     if((cur->dirty>>hr)&1) {
506       reg=cur->regmap[hr];
507       if(reg>=64)
508         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
509     }
510   }
511 }
512
513 void set_const(struct regstat *cur,signed char reg,uint64_t value)
514 {
515   int hr;
516   if(!reg) return;
517   for (hr=0;hr<HOST_REGS;hr++) {
518     if(cur->regmap[hr]==reg) {
519       cur->isconst|=1<<hr;
520       current_constmap[hr]=value;
521     }
522     else if((cur->regmap[hr]^64)==reg) {
523       cur->isconst|=1<<hr;
524       current_constmap[hr]=value>>32;
525     }
526   }
527 }
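/* Constants are tracked per host register: the host reg mapped to guest reg
   'reg' records the low 32 bits, and if the upper half is mapped separately
   (as reg^64) it records value>>32, so a 64-bit constant may be split across
   two host registers. */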
528
529 void clear_const(struct regstat *cur,signed char reg)
530 {
531   int hr;
532   if(!reg) return;
533   for (hr=0;hr<HOST_REGS;hr++) {
534     if((cur->regmap[hr]&63)==reg) {
535       cur->isconst&=~(1<<hr);
536     }
537   }
538 }
539
540 int is_const(struct regstat *cur,signed char reg)
541 {
542   int hr;
543   if(reg<0) return 0;
544   if(!reg) return 1;
545   for (hr=0;hr<HOST_REGS;hr++) {
546     if((cur->regmap[hr]&63)==reg) {
547       return (cur->isconst>>hr)&1;
548     }
549   }
550   return 0;
551 }
552 uint64_t get_const(struct regstat *cur,signed char reg)
553 {
554   int hr;
555   if(!reg) return 0;
556   for (hr=0;hr<HOST_REGS;hr++) {
557     if(cur->regmap[hr]==reg) {
558       return current_constmap[hr];
559     }
560   }
561   SysPrintf("Unknown constant in r%d\n",reg);
562   exit(1);
563 }
564
565 // Least soon needed registers
566 // Look at the next ten instructions and see which registers
567 // will be used.  Try not to reallocate these.
568 void lsn(u_char hsn[], int i, int *preferred_reg)
569 {
570   int j;
571   int b=-1;
572   for(j=0;j<9;j++)
573   {
574     if(i+j>=slen) {
575       j=slen-i-1;
576       break;
577     }
578     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
579     {
580       // Don't go past an unconditional jump
581       j++;
582       break;
583     }
584   }
585   for(;j>=0;j--)
586   {
587     if(rs1[i+j]) hsn[rs1[i+j]]=j;
588     if(rs2[i+j]) hsn[rs2[i+j]]=j;
589     if(rt1[i+j]) hsn[rt1[i+j]]=j;
590     if(rt2[i+j]) hsn[rt2[i+j]]=j;
591     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
592       // Stores can allocate zero
593       hsn[rs1[i+j]]=j;
594       hsn[rs2[i+j]]=j;
595     }
596     // On some architectures stores need invc_ptr
597     #if defined(HOST_IMM8)
598     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
599       hsn[INVCP]=j;
600     }
601     #endif
602     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
603     {
604       hsn[CCREG]=j;
605       b=j;
606     }
607   }
608   if(b>=0)
609   {
610     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
611     {
612       // Follow first branch
613       int t=(ba[i+b]-start)>>2;
614       j=7-b;if(t+j>=slen) j=slen-t-1;
615       for(;j>=0;j--)
616       {
617         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
618         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
619         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
620         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
621       }
622     }
623     // TODO: preferred register based on backward branch
624   }
625   // Delay slot should preferably not overwrite branch conditions or cycle count
626   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
627     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
628     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
629     hsn[CCREG]=1;
630     // ...or hash tables
631     hsn[RHASH]=1;
632     hsn[RHTBL]=1;
633   }
634   // Coprocessor load/store needs FTEMP, even if not declared
635   if(itype[i]==C1LS||itype[i]==C2LS) {
636     hsn[FTEMP]=0;
637   }
638   // Load L/R also uses FTEMP as a temporary register
639   if(itype[i]==LOADLR) {
640     hsn[FTEMP]=0;
641   }
642   // Also SWL/SWR/SDL/SDR
643   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
644     hsn[FTEMP]=0;
645   }
646   // Don't remove the miniht registers
647   if(itype[i]==UJUMP||itype[i]==RJUMP)
648   {
649     hsn[RHASH]=0;
650     hsn[RHTBL]=0;
651   }
652 }
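/* hsn[] holds, for each register, the distance in instructions to its next
   use in the lookahead window: 0 means needed by the current instruction,
   larger values mean needed later, and registers with no use in the window
   keep the caller's initial (large) value and are the best candidates for
   reallocation. */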
653
654 // We only want to allocate registers if we're going to use them again soon
655 int needed_again(int r, int i)
656 {
657   int j;
658   int b=-1;
659   int rn=10;
660
661   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
662   {
663     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
664       return 0; // Don't need any registers if exiting the block
665   }
666   for(j=0;j<9;j++)
667   {
668     if(i+j>=slen) {
669       j=slen-i-1;
670       break;
671     }
672     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
673     {
674       // Don't go past an unconditional jump
675       j++;
676       break;
677     }
678     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
679     {
680       break;
681     }
682   }
683   for(;j>=1;j--)
684   {
685     if(rs1[i+j]==r) rn=j;
686     if(rs2[i+j]==r) rn=j;
687     if((unneeded_reg[i+j]>>r)&1) rn=10;
688     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
689     {
690       b=j;
691     }
692   }
693   /*
694   if(b>=0)
695   {
696     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
697     {
698       // Follow first branch
699       int o=rn;
700       int t=(ba[i+b]-start)>>2;
701       j=7-b;if(t+j>=slen) j=slen-t-1;
702       for(;j>=0;j--)
703       {
704         if(!((unneeded_reg[t+j]>>r)&1)) {
705           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
706           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
707         }
708         else rn=o;
709       }
710     }
711   }*/
712   if(rn<10) return 1;
713   (void)b;
714   return 0;
715 }
716
717 // Try to match register allocations at the end of a loop with those
718 // at the beginning
719 int loop_reg(int i, int r, int hr)
720 {
721   int j,k;
722   for(j=0;j<9;j++)
723   {
724     if(i+j>=slen) {
725       j=slen-i-1;
726       break;
727     }
728     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
729     {
730       // Don't go past an unconditional jump
731       j++;
732       break;
733     }
734   }
735   k=0;
736   if(i>0){
737     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
738       k--;
739   }
740   for(;k<j;k++)
741   {
742     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
743     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
744     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
745     {
746       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
747       {
748         int t=(ba[i+k]-start)>>2;
749         int reg=get_reg(regs[t].regmap_entry,r);
750         if(reg>=0) return reg;
751         //reg=get_reg(regs[t+1].regmap_entry,r);
752         //if(reg>=0) return reg;
753       }
754     }
755   }
756   return hr;
757 }
758
759
760 // Allocate every register, preserving source/target regs
761 void alloc_all(struct regstat *cur,int i)
762 {
763   int hr;
764
765   for(hr=0;hr<HOST_REGS;hr++) {
766     if(hr!=EXCLUDE_REG) {
767       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
768          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
769       {
770         cur->regmap[hr]=-1;
771         cur->dirty&=~(1<<hr);
772       }
773       // Don't need zeros
774       if((cur->regmap[hr]&63)==0)
775       {
776         cur->regmap[hr]=-1;
777         cur->dirty&=~(1<<hr);
778       }
779     }
780   }
781 }
782
783 #ifdef __i386__
784 #include "assem_x86.c"
785 #endif
786 #ifdef __x86_64__
787 #include "assem_x64.c"
788 #endif
789 #ifdef __arm__
790 #include "assem_arm.c"
791 #endif
792
793 // Add virtual address mapping to linked list
794 void ll_add(struct ll_entry **head,int vaddr,void *addr)
795 {
796   struct ll_entry *new_entry;
797   new_entry=malloc(sizeof(struct ll_entry));
798   assert(new_entry!=NULL);
799   new_entry->vaddr=vaddr;
800   new_entry->reg_sv_flags=0;
801   new_entry->addr=addr;
802   new_entry->next=*head;
803   *head=new_entry;
804 }
805
806 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
807 {
808   ll_add(head,vaddr,addr);
809   (*head)->reg_sv_flags=reg_sv_flags;
810 }
811
812 // Check if an address is already compiled
813 // but don't return addresses which are about to expire from the cache
814 void *check_addr(u_int vaddr)
815 {
816   struct ht_entry *ht_bin = hash_table_get(vaddr);
817   size_t i;
818   for (i = 0; i < sizeof(ht_bin->vaddr)/sizeof(ht_bin->vaddr[0]); i++) {
819     if (ht_bin->vaddr[i] == vaddr)
820       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
821         if (isclean(ht_bin->tcaddr[i]))
822           return ht_bin->tcaddr[i];
823   }
824   u_int page=get_page(vaddr);
825   struct ll_entry *head;
826   head=jump_in[page];
827   while (head != NULL) {
828     if (head->vaddr == vaddr) {
829       if (doesnt_expire_soon(head->addr)) {
830         // Update existing entry with current address
831         if (ht_bin->vaddr[0] == vaddr) {
832           ht_bin->tcaddr[0] = head->addr;
833           return head->addr;
834         }
835         if (ht_bin->vaddr[1] == vaddr) {
836           ht_bin->tcaddr[1] = head->addr;
837           return head->addr;
838         }
839         // Insert into hash table with low priority.
840         // Don't evict existing entries, as they are probably
841         // addresses that are being accessed frequently.
842         if (ht_bin->vaddr[0] == -1) {
843           ht_bin->vaddr[0] = vaddr;
844           ht_bin->tcaddr[0] = head->addr;
845         }
846         else if (ht_bin->vaddr[1] == -1) {
847           ht_bin->vaddr[1] = vaddr;
848           ht_bin->tcaddr[1] = head->addr;
849         }
850         return head->addr;
851       }
852     }
853     head=head->next;
854   }
855   return 0;
856 }
857
858 void remove_hash(int vaddr)
859 {
860   //printf("remove hash: %x\n",vaddr);
861   struct ht_entry *ht_bin = hash_table_get(vaddr);
862   if (ht_bin->vaddr[1] == vaddr) {
863     ht_bin->vaddr[1] = -1;
864     ht_bin->tcaddr[1] = NULL;
865   }
866   if (ht_bin->vaddr[0] == vaddr) {
867     ht_bin->vaddr[0] = ht_bin->vaddr[1];
868     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
869     ht_bin->vaddr[1] = -1;
870     ht_bin->tcaddr[1] = NULL;
871   }
872 }
873
874 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
875 {
876   struct ll_entry *next;
877   while(*head) {
878     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
879        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
880     {
881       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
882       remove_hash((*head)->vaddr);
883       next=(*head)->next;
884       free(*head);
885       *head=next;
886     }
887     else
888     {
889       head=&((*head)->next);
890     }
891   }
892 }
893
894 // Remove all entries from linked list
895 void ll_clear(struct ll_entry **head)
896 {
897   struct ll_entry *cur;
898   struct ll_entry *next;
899   if((cur=*head)) {
900     *head=0;
901     while(cur) {
902       next=cur->next;
903       free(cur);
904       cur=next;
905     }
906   }
907 }
908
909 // Dereference the pointers and remove them if they match
910 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
911 {
912   while(head) {
913     int ptr=get_pointer(head->addr);
914     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
915     if(((ptr>>shift)==(addr>>shift)) ||
916        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
917     {
918       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
919       void *host_addr=find_extjump_insn(head->addr);
920       #ifdef __arm__
921         mark_clear_cache(host_addr);
922       #endif
923       set_jump_target(host_addr, head->addr);
924     }
925     head=head->next;
926   }
927 }
928
929 // This is called when we write to a compiled block (see do_invstub)
930 void invalidate_page(u_int page)
931 {
932   struct ll_entry *head;
933   struct ll_entry *next;
934   head=jump_in[page];
935   jump_in[page]=0;
936   while(head!=NULL) {
937     inv_debug("INVALIDATE: %x\n",head->vaddr);
938     remove_hash(head->vaddr);
939     next=head->next;
940     free(head);
941     head=next;
942   }
943   head=jump_out[page];
944   jump_out[page]=0;
945   while(head!=NULL) {
946     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
947     void *host_addr=find_extjump_insn(head->addr);
948     #ifdef __arm__
949       mark_clear_cache(host_addr);
950     #endif
951     set_jump_target(host_addr, head->addr);
952     next=head->next;
953     free(head);
954     head=next;
955   }
956 }
957
958 static void invalidate_block_range(u_int block, u_int first, u_int last)
959 {
960   u_int page=get_page(block<<12);
961   //printf("first=%d last=%d\n",first,last);
962   invalidate_page(page);
963   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
964   assert(last<page+5);
965   // Invalidate the adjacent pages if a block crosses a 4K boundary
966   while(first<page) {
967     invalidate_page(first);
968     first++;
969   }
970   for(first=page+1;first<last;first++) {
971     invalidate_page(first);
972   }
973   #ifdef __arm__
974     do_clear_cache();
975   #endif
976
977   // Don't trap writes
978   invalid_code[block]=1;
979
980   #ifdef USE_MINI_HT
981   memset(mini_ht,-1,sizeof(mini_ht));
982   #endif
983 }
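/* A block is at most MAXBLOCK instructions (16KB of guest code), so it can
   touch only a few 4K pages; first/last are the lowest and highest RAM pages
   the block was compiled from, and the asserts above check that they stay
   within that window around 'page'. */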
984
985 void invalidate_block(u_int block)
986 {
987   u_int page=get_page(block<<12);
988   u_int vpage=get_vpage(block<<12);
989   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
990   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
991   u_int first,last;
992   first=last=page;
993   struct ll_entry *head;
994   head=jump_dirty[vpage];
995   //printf("page=%d vpage=%d\n",page,vpage);
996   while(head!=NULL) {
997     u_int start,end;
998     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
999       get_bounds((int)head->addr,&start,&end);
1000       //printf("start: %x end: %x\n",start,end);
1001       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1002         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1003           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1004           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1005         }
1006       }
1007     }
1008     head=head->next;
1009   }
1010   invalidate_block_range(block,first,last);
1011 }
1012
1013 void invalidate_addr(u_int addr)
1014 {
1015   //static int rhits;
1016   // this check is done by the caller
1017   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1018   u_int page=get_vpage(addr);
1019   if(page<2048) { // RAM
1020     struct ll_entry *head;
1021     u_int addr_min=~0, addr_max=0;
1022     u_int mask=RAM_SIZE-1;
1023     u_int addr_main=0x80000000|(addr&mask);
1024     int pg1;
1025     inv_code_start=addr_main&~0xfff;
1026     inv_code_end=addr_main|0xfff;
1027     pg1=page;
1028     if (pg1>0) {
1029       // must check the previous page too because a block can span a page boundary
1030       pg1--;
1031       inv_code_start-=0x1000;
1032     }
1033     for(;pg1<=page;pg1++) {
1034       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1035         u_int start,end;
1036         get_bounds((int)head->addr,&start,&end);
1037         if(ram_offset) {
1038           start-=ram_offset;
1039           end-=ram_offset;
1040         }
1041         if(start<=addr_main&&addr_main<end) {
1042           if(start<addr_min) addr_min=start;
1043           if(end>addr_max) addr_max=end;
1044         }
1045         else if(addr_main<start) {
1046           if(start<inv_code_end)
1047             inv_code_end=start-1;
1048         }
1049         else {
1050           if(end>inv_code_start)
1051             inv_code_start=end;
1052         }
1053       }
1054     }
1055     if (addr_min!=~0) {
1056       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1057       inv_code_start=inv_code_end=~0;
1058       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1059       return;
1060     }
1061     else {
1062       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1063       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1064       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1065       return;
1066     }
1067   }
1068   invalidate_block(addr>>12);
1069 }
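/* On a miss, inv_code_start/inv_code_end are widened to describe a region
   around 'addr' that is known to contain no compiled code, so the caller can
   skip future invalidate_addr() calls that fall inside it; on a hit the range
   is reset and the affected pages are invalidated. */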
1070
1071 // This is called when loading a save state.
1072 // Anything could have changed, so invalidate everything.
1073 void invalidate_all_pages()
1074 {
1075   u_int page;
1076   for(page=0;page<4096;page++)
1077     invalidate_page(page);
1078   for(page=0;page<1048576;page++)
1079     if(!invalid_code[page]) {
1080       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1081       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1082     }
1083   #ifdef USE_MINI_HT
1084   memset(mini_ht,-1,sizeof(mini_ht));
1085   #endif
1086 }
1087
1088 // Add an entry to jump_out after making a link
1089 void add_link(u_int vaddr,void *src)
1090 {
1091   u_int page=get_page(vaddr);
1092   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1093   int *ptr=(int *)(src+4);
1094   assert((*ptr&0x0fff0000)==0x059f0000);
1095   (void)ptr;
1096   ll_add(jump_out+page,vaddr,src);
1097   //int ptr=get_pointer(src);
1098   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1099 }
1100
1101 // If a code block was found to be unmodified (bit was set in
1102 // restore_candidate) and it remains unmodified (bit is clear
1103 // in invalid_code) then move the entries for that 4K page from
1104 // the dirty list to the clean list.
1105 void clean_blocks(u_int page)
1106 {
1107   struct ll_entry *head;
1108   inv_debug("INV: clean_blocks page=%d\n",page);
1109   head=jump_dirty[page];
1110   while(head!=NULL) {
1111     if(!invalid_code[head->vaddr>>12]) {
1112       // Don't restore blocks which are about to expire from the cache
1113       if (doesnt_expire_soon(head->addr)) {
1114         u_int start,end;
1115         if(verify_dirty(head->addr)) {
1116           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1117           u_int i;
1118           u_int inv=0;
1119           get_bounds((int)head->addr,&start,&end);
1120           if(start-(u_int)rdram<RAM_SIZE) {
1121             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1122               inv|=invalid_code[i];
1123             }
1124           }
1125           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1126             inv=1;
1127           }
1128           if(!inv) {
1129             void *clean_addr = get_clean_addr(head->addr);
1130             if (doesnt_expire_soon(clean_addr)) {
1131               u_int ppage=page;
1132               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1133               //printf("page=%x, addr=%x\n",page,head->vaddr);
1134               //assert(head->vaddr>>12==(page|0x80000));
1135               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1136               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1137               if (ht_bin->vaddr[0] == head->vaddr)
1138                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1139               if (ht_bin->vaddr[1] == head->vaddr)
1140                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1141             }
1142           }
1143         }
1144       }
1145     }
1146     head=head->next;
1147   }
1148 }
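/* A dirty block whose source memory still verifies as unmodified gets its
   clean entry point re-registered in jump_in and the hash table, so later
   lookups can dispatch to it directly again without re-verification. */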
1149
1150
1151 void mov_alloc(struct regstat *current,int i)
1152 {
1153   // Note: Don't need to actually alloc the source registers
1154   if((~current->is32>>rs1[i])&1) {
1155     //alloc_reg64(current,i,rs1[i]);
1156     alloc_reg64(current,i,rt1[i]);
1157     current->is32&=~(1LL<<rt1[i]);
1158   } else {
1159     //alloc_reg(current,i,rs1[i]);
1160     alloc_reg(current,i,rt1[i]);
1161     current->is32|=(1LL<<rt1[i]);
1162   }
1163   clear_const(current,rs1[i]);
1164   clear_const(current,rt1[i]);
1165   dirty_reg(current,rt1[i]);
1166 }
1167
1168 void shiftimm_alloc(struct regstat *current,int i)
1169 {
1170   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1171   {
1172     if(rt1[i]) {
1173       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1174       else lt1[i]=rs1[i];
1175       alloc_reg(current,i,rt1[i]);
1176       current->is32|=1LL<<rt1[i];
1177       dirty_reg(current,rt1[i]);
1178       if(is_const(current,rs1[i])) {
1179         int v=get_const(current,rs1[i]);
1180         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1181         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1182         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1183       }
1184       else clear_const(current,rt1[i]);
1185     }
1186   }
1187   else
1188   {
1189     clear_const(current,rs1[i]);
1190     clear_const(current,rt1[i]);
1191   }
1192
1193   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1194   {
1195     if(rt1[i]) {
1196       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1197       alloc_reg64(current,i,rt1[i]);
1198       current->is32&=~(1LL<<rt1[i]);
1199       dirty_reg(current,rt1[i]);
1200     }
1201   }
1202   if(opcode2[i]==0x3c) // DSLL32
1203   {
1204     if(rt1[i]) {
1205       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1206       alloc_reg64(current,i,rt1[i]);
1207       current->is32&=~(1LL<<rt1[i]);
1208       dirty_reg(current,rt1[i]);
1209     }
1210   }
1211   if(opcode2[i]==0x3e) // DSRL32
1212   {
1213     if(rt1[i]) {
1214       alloc_reg64(current,i,rs1[i]);
1215       if(imm[i]==32) {
1216         alloc_reg64(current,i,rt1[i]);
1217         current->is32&=~(1LL<<rt1[i]);
1218       } else {
1219         alloc_reg(current,i,rt1[i]);
1220         current->is32|=1LL<<rt1[i];
1221       }
1222       dirty_reg(current,rt1[i]);
1223     }
1224   }
1225   if(opcode2[i]==0x3f) // DSRA32
1226   {
1227     if(rt1[i]) {
1228       alloc_reg64(current,i,rs1[i]);
1229       alloc_reg(current,i,rt1[i]);
1230       current->is32|=1LL<<rt1[i];
1231       dirty_reg(current,rt1[i]);
1232     }
1233   }
1234 }
1235
1236 void shift_alloc(struct regstat *current,int i)
1237 {
1238   if(rt1[i]) {
1239     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1240     {
1241       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1242       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1243       alloc_reg(current,i,rt1[i]);
1244       if(rt1[i]==rs2[i]) {
1245         alloc_reg_temp(current,i,-1);
1246         minimum_free_regs[i]=1;
1247       }
1248       current->is32|=1LL<<rt1[i];
1249     } else { // DSLLV/DSRLV/DSRAV
1250       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1251       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1252       alloc_reg64(current,i,rt1[i]);
1253       current->is32&=~(1LL<<rt1[i]);
1254       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1255       {
1256         alloc_reg_temp(current,i,-1);
1257         minimum_free_regs[i]=1;
1258       }
1259     }
1260     clear_const(current,rs1[i]);
1261     clear_const(current,rs2[i]);
1262     clear_const(current,rt1[i]);
1263     dirty_reg(current,rt1[i]);
1264   }
1265 }
1266
1267 void alu_alloc(struct regstat *current,int i)
1268 {
1269   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1270     if(rt1[i]) {
1271       if(rs1[i]&&rs2[i]) {
1272         alloc_reg(current,i,rs1[i]);
1273         alloc_reg(current,i,rs2[i]);
1274       }
1275       else {
1276         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1277         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1278       }
1279       alloc_reg(current,i,rt1[i]);
1280     }
1281     current->is32|=1LL<<rt1[i];
1282   }
1283   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1284     if(rt1[i]) {
1285       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1286       {
1287         alloc_reg64(current,i,rs1[i]);
1288         alloc_reg64(current,i,rs2[i]);
1289         alloc_reg(current,i,rt1[i]);
1290       } else {
1291         alloc_reg(current,i,rs1[i]);
1292         alloc_reg(current,i,rs2[i]);
1293         alloc_reg(current,i,rt1[i]);
1294       }
1295     }
1296     current->is32|=1LL<<rt1[i];
1297   }
1298   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1299     if(rt1[i]) {
1300       if(rs1[i]&&rs2[i]) {
1301         alloc_reg(current,i,rs1[i]);
1302         alloc_reg(current,i,rs2[i]);
1303       }
1304       else
1305       {
1306         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1307         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1308       }
1309       alloc_reg(current,i,rt1[i]);
1310       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1311       {
1312         if(!((current->uu>>rt1[i])&1)) {
1313           alloc_reg64(current,i,rt1[i]);
1314         }
1315         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1316           if(rs1[i]&&rs2[i]) {
1317             alloc_reg64(current,i,rs1[i]);
1318             alloc_reg64(current,i,rs2[i]);
1319           }
1320           else
1321           {
1322             // Is it really worth it to keep 64-bit values in registers?
1323             #ifdef NATIVE_64BIT
1324             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1325             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1326             #endif
1327           }
1328         }
1329         current->is32&=~(1LL<<rt1[i]);
1330       } else {
1331         current->is32|=1LL<<rt1[i];
1332       }
1333     }
1334   }
1335   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1336     if(rt1[i]) {
1337       if(rs1[i]&&rs2[i]) {
1338         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1339           alloc_reg64(current,i,rs1[i]);
1340           alloc_reg64(current,i,rs2[i]);
1341           alloc_reg64(current,i,rt1[i]);
1342         } else {
1343           alloc_reg(current,i,rs1[i]);
1344           alloc_reg(current,i,rs2[i]);
1345           alloc_reg(current,i,rt1[i]);
1346         }
1347       }
1348       else {
1349         alloc_reg(current,i,rt1[i]);
1350         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1351           // DADD used as move, or zeroing
1352           // If we have a 64-bit source, then make the target 64 bits too
1353           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1354             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1355             alloc_reg64(current,i,rt1[i]);
1356           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1357             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1358             alloc_reg64(current,i,rt1[i]);
1359           }
1360           if(opcode2[i]>=0x2e&&rs2[i]) {
1361             // DSUB used as negation - 64-bit result
1362             // If we have a 32-bit register, extend it to 64 bits
1363             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1364             alloc_reg64(current,i,rt1[i]);
1365           }
1366         }
1367       }
1368       if(rs1[i]&&rs2[i]) {
1369         current->is32&=~(1LL<<rt1[i]);
1370       } else if(rs1[i]) {
1371         current->is32&=~(1LL<<rt1[i]);
1372         if((current->is32>>rs1[i])&1)
1373           current->is32|=1LL<<rt1[i];
1374       } else if(rs2[i]) {
1375         current->is32&=~(1LL<<rt1[i]);
1376         if((current->is32>>rs2[i])&1)
1377           current->is32|=1LL<<rt1[i];
1378       } else {
1379         current->is32|=1LL<<rt1[i];
1380       }
1381     }
1382   }
1383   clear_const(current,rs1[i]);
1384   clear_const(current,rs2[i]);
1385   clear_const(current,rt1[i]);
1386   dirty_reg(current,rt1[i]);
1387 }
1388
1389 void imm16_alloc(struct regstat *current,int i)
1390 {
1391   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1392   else lt1[i]=rs1[i];
1393   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1394   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1395     current->is32&=~(1LL<<rt1[i]);
1396     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1397       // TODO: Could preserve the 32-bit flag if the immediate is zero
1398       alloc_reg64(current,i,rt1[i]);
1399       alloc_reg64(current,i,rs1[i]);
1400     }
1401     clear_const(current,rs1[i]);
1402     clear_const(current,rt1[i]);
1403   }
1404   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1405     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1406     current->is32|=1LL<<rt1[i];
1407     clear_const(current,rs1[i]);
1408     clear_const(current,rt1[i]);
1409   }
1410   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1411     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1412       if(rs1[i]!=rt1[i]) {
1413         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1414         alloc_reg64(current,i,rt1[i]);
1415         current->is32&=~(1LL<<rt1[i]);
1416       }
1417     }
1418     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1419     if(is_const(current,rs1[i])) {
1420       int v=get_const(current,rs1[i]);
1421       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1422       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1423       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1424     }
1425     else clear_const(current,rt1[i]);
1426   }
1427   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1428     if(is_const(current,rs1[i])) {
1429       int v=get_const(current,rs1[i]);
1430       set_const(current,rt1[i],v+imm[i]);
1431     }
1432     else clear_const(current,rt1[i]);
1433     current->is32|=1LL<<rt1[i];
1434   }
1435   else {
1436     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1437     current->is32|=1LL<<rt1[i];
1438   }
1439   dirty_reg(current,rt1[i]);
1440 }
1441
1442 void load_alloc(struct regstat *current,int i)
1443 {
1444   clear_const(current,rt1[i]);
1445   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1446   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1447   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1448   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1449     alloc_reg(current,i,rt1[i]);
1450     assert(get_reg(current->regmap,rt1[i])>=0);
1451     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1452     {
1453       current->is32&=~(1LL<<rt1[i]);
1454       alloc_reg64(current,i,rt1[i]);
1455     }
1456     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1457     {
1458       current->is32&=~(1LL<<rt1[i]);
1459       alloc_reg64(current,i,rt1[i]);
1460       alloc_all(current,i);
1461       alloc_reg64(current,i,FTEMP);
1462       minimum_free_regs[i]=HOST_REGS;
1463     }
1464     else current->is32|=1LL<<rt1[i];
1465     dirty_reg(current,rt1[i]);
1466     // LWL/LWR need a temporary register for the old value
1467     if(opcode[i]==0x22||opcode[i]==0x26)
1468     {
1469       alloc_reg(current,i,FTEMP);
1470       alloc_reg_temp(current,i,-1);
1471       minimum_free_regs[i]=1;
1472     }
1473   }
1474   else
1475   {
1476     // Load to r0 or unneeded register (dummy load)
1477     // but we still need a register to calculate the address
1478     if(opcode[i]==0x22||opcode[i]==0x26)
1479     {
1480       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1481     }
1482     alloc_reg_temp(current,i,-1);
1483     minimum_free_regs[i]=1;
1484     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1485     {
1486       alloc_all(current,i);
1487       alloc_reg64(current,i,FTEMP);
1488       minimum_free_regs[i]=HOST_REGS;
1489     }
1490   }
1491 }
1492
1493 void store_alloc(struct regstat *current,int i)
1494 {
1495   clear_const(current,rs2[i]);
1496   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1497   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1498   alloc_reg(current,i,rs2[i]);
1499   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1500     alloc_reg64(current,i,rs2[i]);
1501     if(rs2[i]) alloc_reg(current,i,FTEMP);
1502   }
1503   #if defined(HOST_IMM8)
1504   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1505   else alloc_reg(current,i,INVCP);
1506   #endif
1507   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1508     alloc_reg(current,i,FTEMP);
1509   }
1510   // We need a temporary register for address generation
1511   alloc_reg_temp(current,i,-1);
1512   minimum_free_regs[i]=1;
1513 }
1514
1515 void c1ls_alloc(struct regstat *current,int i)
1516 {
1517   //clear_const(current,rs1[i]); // FIXME
1518   clear_const(current,rt1[i]);
1519   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1520   alloc_reg(current,i,CSREG); // Status
1521   alloc_reg(current,i,FTEMP);
1522   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1523     alloc_reg64(current,i,FTEMP);
1524   }
1525   #if defined(HOST_IMM8)
1526   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1527   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1528     alloc_reg(current,i,INVCP);
1529   #endif
1530   // We need a temporary register for address generation
1531   alloc_reg_temp(current,i,-1);
1532 }
1533
1534 void c2ls_alloc(struct regstat *current,int i)
1535 {
1536   clear_const(current,rt1[i]);
1537   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1538   alloc_reg(current,i,FTEMP);
1539   #if defined(HOST_IMM8)
1540   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1541   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1542     alloc_reg(current,i,INVCP);
1543   #endif
1544   // We need a temporary register for address generation
1545   alloc_reg_temp(current,i,-1);
1546   minimum_free_regs[i]=1;
1547 }
1548
1549 #ifndef multdiv_alloc
1550 void multdiv_alloc(struct regstat *current,int i)
1551 {
1552   //  case 0x18: MULT
1553   //  case 0x19: MULTU
1554   //  case 0x1A: DIV
1555   //  case 0x1B: DIVU
1556   //  case 0x1C: DMULT
1557   //  case 0x1D: DMULTU
1558   //  case 0x1E: DDIV
1559   //  case 0x1F: DDIVU
1560   clear_const(current,rs1[i]);
1561   clear_const(current,rs2[i]);
1562   if(rs1[i]&&rs2[i])
1563   {
1564     if((opcode2[i]&4)==0) // 32-bit
1565     {
1566       current->u&=~(1LL<<HIREG);
1567       current->u&=~(1LL<<LOREG);
1568       alloc_reg(current,i,HIREG);
1569       alloc_reg(current,i,LOREG);
1570       alloc_reg(current,i,rs1[i]);
1571       alloc_reg(current,i,rs2[i]);
1572       current->is32|=1LL<<HIREG;
1573       current->is32|=1LL<<LOREG;
1574       dirty_reg(current,HIREG);
1575       dirty_reg(current,LOREG);
1576     }
1577     else // 64-bit
1578     {
1579       current->u&=~(1LL<<HIREG);
1580       current->u&=~(1LL<<LOREG);
1581       current->uu&=~(1LL<<HIREG);
1582       current->uu&=~(1LL<<LOREG);
1583       alloc_reg64(current,i,HIREG);
1584       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1585       alloc_reg64(current,i,rs1[i]);
1586       alloc_reg64(current,i,rs2[i]);
1587       alloc_all(current,i);
1588       current->is32&=~(1LL<<HIREG);
1589       current->is32&=~(1LL<<LOREG);
1590       dirty_reg(current,HIREG);
1591       dirty_reg(current,LOREG);
1592       minimum_free_regs[i]=HOST_REGS;
1593     }
1594   }
1595   else
1596   {
1597     // Multiply by zero is zero.
1598     // MIPS does not have a divide by zero exception.
1599     // The result is undefined, so we return zero.
1600     alloc_reg(current,i,HIREG);
1601     alloc_reg(current,i,LOREG);
1602     current->is32|=1LL<<HIREG;
1603     current->is32|=1LL<<LOREG;
1604     dirty_reg(current,HIREG);
1605     dirty_reg(current,LOREG);
1606   }
1607 }
1608 #endif
1609
1610 void cop0_alloc(struct regstat *current,int i)
1611 {
1612   if(opcode2[i]==0) // MFC0
1613   {
1614     if(rt1[i]) {
1615       clear_const(current,rt1[i]);
1616       alloc_all(current,i);
1617       alloc_reg(current,i,rt1[i]);
1618       current->is32|=1LL<<rt1[i];
1619       dirty_reg(current,rt1[i]);
1620     }
1621   }
1622   else if(opcode2[i]==4) // MTC0
1623   {
1624     if(rs1[i]){
1625       clear_const(current,rs1[i]);
1626       alloc_reg(current,i,rs1[i]);
1627       alloc_all(current,i);
1628     }
1629     else {
1630       alloc_all(current,i); // FIXME: Keep r0
1631       current->u&=~1LL;
1632       alloc_reg(current,i,0);
1633     }
1634   }
1635   else
1636   {
1637     // TLBR/TLBWI/TLBWR/TLBP/ERET
1638     assert(opcode2[i]==0x10);
1639     alloc_all(current,i);
1640   }
1641   minimum_free_regs[i]=HOST_REGS;
1642 }
1643
1644 void cop1_alloc(struct regstat *current,int i)
1645 {
1646   alloc_reg(current,i,CSREG); // Load status
1647   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1648   {
1649     if(rt1[i]){
1650       clear_const(current,rt1[i]);
1651       if(opcode2[i]==1) {
1652         alloc_reg64(current,i,rt1[i]); // DMFC1
1653         current->is32&=~(1LL<<rt1[i]);
1654       }else{
1655         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1656         current->is32|=1LL<<rt1[i];
1657       }
1658       dirty_reg(current,rt1[i]);
1659     }
1660     alloc_reg_temp(current,i,-1);
1661   }
1662   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1663   {
1664     if(rs1[i]){
1665       clear_const(current,rs1[i]);
1666       if(opcode2[i]==5)
1667         alloc_reg64(current,i,rs1[i]); // DMTC1
1668       else
1669         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1670       alloc_reg_temp(current,i,-1);
1671     }
1672     else {
1673       current->u&=~1LL;
1674       alloc_reg(current,i,0);
1675       alloc_reg_temp(current,i,-1);
1676     }
1677   }
1678   minimum_free_regs[i]=1;
1679 }
1680 void fconv_alloc(struct regstat *current,int i)
1681 {
1682   alloc_reg(current,i,CSREG); // Load status
1683   alloc_reg_temp(current,i,-1);
1684   minimum_free_regs[i]=1;
1685 }
1686 void float_alloc(struct regstat *current,int i)
1687 {
1688   alloc_reg(current,i,CSREG); // Load status
1689   alloc_reg_temp(current,i,-1);
1690   minimum_free_regs[i]=1;
1691 }
1692 void c2op_alloc(struct regstat *current,int i)
1693 {
1694   alloc_reg_temp(current,i,-1);
1695 }
1696 void fcomp_alloc(struct regstat *current,int i)
1697 {
1698   alloc_reg(current,i,CSREG); // Load status
1699   alloc_reg(current,i,FSREG); // Load flags
1700   dirty_reg(current,FSREG); // Flag will be modified
1701   alloc_reg_temp(current,i,-1);
1702   minimum_free_regs[i]=1;
1703 }
1704
1705 void syscall_alloc(struct regstat *current,int i)
1706 {
1707   alloc_cc(current,i);
1708   dirty_reg(current,CCREG);
1709   alloc_all(current,i);
1710   minimum_free_regs[i]=HOST_REGS;
1711   current->isconst=0;
1712 }
1713
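// Dispatch register allocation for the instruction in a branch delay slot.
// A jump in the delay slot is not supported and only disables speculative
// precompilation (stop_after_jal).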
1714 void delayslot_alloc(struct regstat *current,int i)
1715 {
1716   switch(itype[i]) {
1717     case UJUMP:
1718     case CJUMP:
1719     case SJUMP:
1720     case RJUMP:
1721     case FJUMP:
1722     case SYSCALL:
1723     case HLECALL:
1724     case SPAN:
1725       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1726       SysPrintf("Disabled speculative precompilation\n");
1727       stop_after_jal=1;
1728       break;
1729     case IMM16:
1730       imm16_alloc(current,i);
1731       break;
1732     case LOAD:
1733     case LOADLR:
1734       load_alloc(current,i);
1735       break;
1736     case STORE:
1737     case STORELR:
1738       store_alloc(current,i);
1739       break;
1740     case ALU:
1741       alu_alloc(current,i);
1742       break;
1743     case SHIFT:
1744       shift_alloc(current,i);
1745       break;
1746     case MULTDIV:
1747       multdiv_alloc(current,i);
1748       break;
1749     case SHIFTIMM:
1750       shiftimm_alloc(current,i);
1751       break;
1752     case MOV:
1753       mov_alloc(current,i);
1754       break;
1755     case COP0:
1756       cop0_alloc(current,i);
1757       break;
1758     case COP1:
1759     case COP2:
1760       cop1_alloc(current,i);
1761       break;
1762     case C1LS:
1763       c1ls_alloc(current,i);
1764       break;
1765     case C2LS:
1766       c2ls_alloc(current,i);
1767       break;
1768     case FCONV:
1769       fconv_alloc(current,i);
1770       break;
1771     case FLOAT:
1772       float_alloc(current,i);
1773       break;
1774     case FCOMP:
1775       fcomp_alloc(current,i);
1776       break;
1777     case C2OP:
1778       c2op_alloc(current,i);
1779       break;
1780   }
1781 }
1782
1783 // Special case where a branch and delay slot span two pages in virtual memory
1784 static void pagespan_alloc(struct regstat *current,int i)
1785 {
1786   current->isconst=0;
1787   current->wasconst=0;
1788   regs[i].wasconst=0;
1789   minimum_free_regs[i]=HOST_REGS;
1790   alloc_all(current,i);
1791   alloc_cc(current,i);
1792   dirty_reg(current,CCREG);
1793   if(opcode[i]==3) // JAL
1794   {
1795     alloc_reg(current,i,31);
1796     dirty_reg(current,31);
1797   }
1798   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1799   {
1800     alloc_reg(current,i,rs1[i]);
1801     if (rt1[i]!=0) {
1802       alloc_reg(current,i,rt1[i]);
1803       dirty_reg(current,rt1[i]);
1804     }
1805   }
1806   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1807   {
1808     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1809     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1810     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1811     {
1812       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1813       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1814     }
1815   }
1816   else
1817   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1818   {
1819     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1820     if(!((current->is32>>rs1[i])&1))
1821     {
1822       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1823     }
1824   }
1825   else
1826   if(opcode[i]==0x11) // BC1
1827   {
1828     alloc_reg(current,i,FSREG);
1829     alloc_reg(current,i,CSREG);
1830   }
1831   //else ...
1832 }
1833
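// Queue an out-of-line stub (slow memory path, invalidation check, etc.) to be
// generated after the main block code; the meaning of a..e depends on the stub type.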
1834 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1835 {
1836   stubs[stubcount][0]=type;
1837   stubs[stubcount][1]=addr;
1838   stubs[stubcount][2]=retaddr;
1839   stubs[stubcount][3]=a;
1840   stubs[stubcount][4]=b;
1841   stubs[stubcount][5]=c;
1842   stubs[stubcount][6]=d;
1843   stubs[stubcount][7]=e;
1844   stubcount++;
1845 }
1846
1847 // Write out a single register
1848 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1849 {
1850   int hr;
1851   for(hr=0;hr<HOST_REGS;hr++) {
1852     if(hr!=EXCLUDE_REG) {
1853       if((regmap[hr]&63)==r) {
1854         if((dirty>>hr)&1) {
1855           if(regmap[hr]<64) {
1856             emit_storereg(r,hr);
1857           }else{
1858             emit_storereg(r|64,hr);
1859           }
1860         }
1861       }
1862     }
1863   }
1864 }
1865
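// Debugging helpers: RAM and register-file checksums plus a register dump,
// used by the trace code in memdebug() below (normally disabled).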
1866 int mchecksum()
1867 {
1868   //if(!tracedebug) return 0;
1869   int i;
1870   int sum=0;
1871   for(i=0;i<2097152;i++) {
1872     unsigned int temp=sum;
1873     sum<<=1;
1874     sum|=(~temp)>>31;
1875     sum^=((u_int *)rdram)[i];
1876   }
1877   return sum;
1878 }
1879 int rchecksum()
1880 {
1881   int i;
1882   int sum=0;
1883   for(i=0;i<64;i++)
1884     sum^=((u_int *)reg)[i];
1885   return sum;
1886 }
1887 void rlist()
1888 {
1889   int i;
1890   printf("TRACE: ");
1891   for(i=0;i<32;i++)
1892     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1893   printf("\n");
1894 }
1895
1896 void enabletrace()
1897 {
1898   tracedebug=1;
1899 }
1900
1901 void memdebug(int i)
1902 {
1903   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1904   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1905   //rlist();
1906   //if(tracedebug) {
1907   //if(Count>=-2084597794) {
1908   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1909   //if(0) {
1910     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1911     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1912     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1913     rlist();
1914     #ifdef __i386__
1915     printf("TRACE: %x\n",(&i)[-1]);
1916     #endif
1917     #ifdef __arm__
1918     int j;
1919     printf("TRACE: %x \n",(&j)[10]);
1920     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1921     #endif
1922     //fflush(stdout);
1923   }
1924   //printf("TRACE: %x\n",(&i)[-1]);
1925 }
1926
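// Emit code for register-register ALU ops: ADD/ADDU/SUB/SUBU, the 64-bit
// DADD/DSUB family, SLT/SLTU and AND/OR/XOR/NOR. 64-bit values live in a
// low/high host register pair (the |64 lookups).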
1927 void alu_assemble(int i,struct regstat *i_regs)
1928 {
1929   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1930     if(rt1[i]) {
1931       signed char s1,s2,t;
1932       t=get_reg(i_regs->regmap,rt1[i]);
1933       if(t>=0) {
1934         s1=get_reg(i_regs->regmap,rs1[i]);
1935         s2=get_reg(i_regs->regmap,rs2[i]);
1936         if(rs1[i]&&rs2[i]) {
1937           assert(s1>=0);
1938           assert(s2>=0);
1939           if(opcode2[i]&2) emit_sub(s1,s2,t);
1940           else emit_add(s1,s2,t);
1941         }
1942         else if(rs1[i]) {
1943           if(s1>=0) emit_mov(s1,t);
1944           else emit_loadreg(rs1[i],t);
1945         }
1946         else if(rs2[i]) {
1947           if(s2>=0) {
1948             if(opcode2[i]&2) emit_neg(s2,t);
1949             else emit_mov(s2,t);
1950           }
1951           else {
1952             emit_loadreg(rs2[i],t);
1953             if(opcode2[i]&2) emit_neg(t,t);
1954           }
1955         }
1956         else emit_zeroreg(t);
1957       }
1958     }
1959   }
1960   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1961     if(rt1[i]) {
1962       signed char s1l,s2l,s1h,s2h,tl,th;
1963       tl=get_reg(i_regs->regmap,rt1[i]);
1964       th=get_reg(i_regs->regmap,rt1[i]|64);
1965       if(tl>=0) {
1966         s1l=get_reg(i_regs->regmap,rs1[i]);
1967         s2l=get_reg(i_regs->regmap,rs2[i]);
1968         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1969         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1970         if(rs1[i]&&rs2[i]) {
1971           assert(s1l>=0);
1972           assert(s2l>=0);
1973           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1974           else emit_adds(s1l,s2l,tl);
1975           if(th>=0) {
1976             #ifdef INVERTED_CARRY
1977             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1978             #else
1979             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1980             #endif
1981             else emit_add(s1h,s2h,th);
1982           }
1983         }
1984         else if(rs1[i]) {
1985           if(s1l>=0) emit_mov(s1l,tl);
1986           else emit_loadreg(rs1[i],tl);
1987           if(th>=0) {
1988             if(s1h>=0) emit_mov(s1h,th);
1989             else emit_loadreg(rs1[i]|64,th);
1990           }
1991         }
1992         else if(rs2[i]) {
1993           if(s2l>=0) {
1994             if(opcode2[i]&2) emit_negs(s2l,tl);
1995             else emit_mov(s2l,tl);
1996           }
1997           else {
1998             emit_loadreg(rs2[i],tl);
1999             if(opcode2[i]&2) emit_negs(tl,tl);
2000           }
2001           if(th>=0) {
2002             #ifdef INVERTED_CARRY
2003             if(s2h>=0) emit_mov(s2h,th);
2004             else emit_loadreg(rs2[i]|64,th);
2005             if(opcode2[i]&2) {
2006               emit_adcimm(-1,th); // x86 has inverted carry flag
2007               emit_not(th,th);
2008             }
2009             #else
2010             if(opcode2[i]&2) {
2011               if(s2h>=0) emit_rscimm(s2h,0,th);
2012               else {
2013                 emit_loadreg(rs2[i]|64,th);
2014                 emit_rscimm(th,0,th);
2015               }
2016             }else{
2017               if(s2h>=0) emit_mov(s2h,th);
2018               else emit_loadreg(rs2[i]|64,th);
2019             }
2020             #endif
2021           }
2022         }
2023         else {
2024           emit_zeroreg(tl);
2025           if(th>=0) emit_zeroreg(th);
2026         }
2027       }
2028     }
2029   }
2030   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2031     if(rt1[i]) {
2032       signed char s1l,s1h,s2l,s2h,t;
2033       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2034       {
2035         t=get_reg(i_regs->regmap,rt1[i]);
2036         //assert(t>=0);
2037         if(t>=0) {
2038           s1l=get_reg(i_regs->regmap,rs1[i]);
2039           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2040           s2l=get_reg(i_regs->regmap,rs2[i]);
2041           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2042           if(rs2[i]==0) // rx<r0
2043           {
2044             assert(s1h>=0);
2045             if(opcode2[i]==0x2a) // SLT
2046               emit_shrimm(s1h,31,t);
2047             else // SLTU (unsigned cannot be less than zero)
2048               emit_zeroreg(t);
2049           }
2050           else if(rs1[i]==0) // r0<rx
2051           {
2052             assert(s2h>=0);
2053             if(opcode2[i]==0x2a) // SLT
2054               emit_set_gz64_32(s2h,s2l,t);
2055             else // SLTU (set if not zero)
2056               emit_set_nz64_32(s2h,s2l,t);
2057           }
2058           else {
2059             assert(s1l>=0);assert(s1h>=0);
2060             assert(s2l>=0);assert(s2h>=0);
2061             if(opcode2[i]==0x2a) // SLT
2062               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2063             else // SLTU
2064               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2065           }
2066         }
2067       } else {
2068         t=get_reg(i_regs->regmap,rt1[i]);
2069         //assert(t>=0);
2070         if(t>=0) {
2071           s1l=get_reg(i_regs->regmap,rs1[i]);
2072           s2l=get_reg(i_regs->regmap,rs2[i]);
2073           if(rs2[i]==0) // rx<r0
2074           {
2075             assert(s1l>=0);
2076             if(opcode2[i]==0x2a) // SLT
2077               emit_shrimm(s1l,31,t);
2078             else // SLTU (unsigned cannot be less than zero)
2079               emit_zeroreg(t);
2080           }
2081           else if(rs1[i]==0) // r0<rx
2082           {
2083             assert(s2l>=0);
2084             if(opcode2[i]==0x2a) // SLT
2085               emit_set_gz32(s2l,t);
2086             else // SLTU (set if not zero)
2087               emit_set_nz32(s2l,t);
2088           }
2089           else{
2090             assert(s1l>=0);assert(s2l>=0);
2091             if(opcode2[i]==0x2a) // SLT
2092               emit_set_if_less32(s1l,s2l,t);
2093             else // SLTU
2094               emit_set_if_carry32(s1l,s2l,t);
2095           }
2096         }
2097       }
2098     }
2099   }
2100   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2101     if(rt1[i]) {
2102       signed char s1l,s1h,s2l,s2h,th,tl;
2103       tl=get_reg(i_regs->regmap,rt1[i]);
2104       th=get_reg(i_regs->regmap,rt1[i]|64);
2105       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2106       {
2107         assert(tl>=0);
2108         if(tl>=0) {
2109           s1l=get_reg(i_regs->regmap,rs1[i]);
2110           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2111           s2l=get_reg(i_regs->regmap,rs2[i]);
2112           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2113           if(rs1[i]&&rs2[i]) {
2114             assert(s1l>=0);assert(s1h>=0);
2115             assert(s2l>=0);assert(s2h>=0);
2116             if(opcode2[i]==0x24) { // AND
2117               emit_and(s1l,s2l,tl);
2118               emit_and(s1h,s2h,th);
2119             } else
2120             if(opcode2[i]==0x25) { // OR
2121               emit_or(s1l,s2l,tl);
2122               emit_or(s1h,s2h,th);
2123             } else
2124             if(opcode2[i]==0x26) { // XOR
2125               emit_xor(s1l,s2l,tl);
2126               emit_xor(s1h,s2h,th);
2127             } else
2128             if(opcode2[i]==0x27) { // NOR
2129               emit_or(s1l,s2l,tl);
2130               emit_or(s1h,s2h,th);
2131               emit_not(tl,tl);
2132               emit_not(th,th);
2133             }
2134           }
2135           else
2136           {
2137             if(opcode2[i]==0x24) { // AND
2138               emit_zeroreg(tl);
2139               emit_zeroreg(th);
2140             } else
2141             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2142               if(rs1[i]){
2143                 if(s1l>=0) emit_mov(s1l,tl);
2144                 else emit_loadreg(rs1[i],tl);
2145                 if(s1h>=0) emit_mov(s1h,th);
2146                 else emit_loadreg(rs1[i]|64,th);
2147               }
2148               else
2149               if(rs2[i]){
2150                 if(s2l>=0) emit_mov(s2l,tl);
2151                 else emit_loadreg(rs2[i],tl);
2152                 if(s2h>=0) emit_mov(s2h,th);
2153                 else emit_loadreg(rs2[i]|64,th);
2154               }
2155               else{
2156                 emit_zeroreg(tl);
2157                 emit_zeroreg(th);
2158               }
2159             } else
2160             if(opcode2[i]==0x27) { // NOR
2161               if(rs1[i]){
2162                 if(s1l>=0) emit_not(s1l,tl);
2163                 else{
2164                   emit_loadreg(rs1[i],tl);
2165                   emit_not(tl,tl);
2166                 }
2167                 if(s1h>=0) emit_not(s1h,th);
2168                 else{
2169                   emit_loadreg(rs1[i]|64,th);
2170                   emit_not(th,th);
2171                 }
2172               }
2173               else
2174               if(rs2[i]){
2175                 if(s2l>=0) emit_not(s2l,tl);
2176                 else{
2177                   emit_loadreg(rs2[i],tl);
2178                   emit_not(tl,tl);
2179                 }
2180                 if(s2h>=0) emit_not(s2h,th);
2181                 else{
2182                   emit_loadreg(rs2[i]|64,th);
2183                   emit_not(th,th);
2184                 }
2185               }
2186               else {
2187                 emit_movimm(-1,tl);
2188                 emit_movimm(-1,th);
2189               }
2190             }
2191           }
2192         }
2193       }
2194       else
2195       {
2196         // 32 bit
2197         if(tl>=0) {
2198           s1l=get_reg(i_regs->regmap,rs1[i]);
2199           s2l=get_reg(i_regs->regmap,rs2[i]);
2200           if(rs1[i]&&rs2[i]) {
2201             assert(s1l>=0);
2202             assert(s2l>=0);
2203             if(opcode2[i]==0x24) { // AND
2204               emit_and(s1l,s2l,tl);
2205             } else
2206             if(opcode2[i]==0x25) { // OR
2207               emit_or(s1l,s2l,tl);
2208             } else
2209             if(opcode2[i]==0x26) { // XOR
2210               emit_xor(s1l,s2l,tl);
2211             } else
2212             if(opcode2[i]==0x27) { // NOR
2213               emit_or(s1l,s2l,tl);
2214               emit_not(tl,tl);
2215             }
2216           }
2217           else
2218           {
2219             if(opcode2[i]==0x24) { // AND
2220               emit_zeroreg(tl);
2221             } else
2222             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2223               if(rs1[i]){
2224                 if(s1l>=0) emit_mov(s1l,tl);
2225                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2226               }
2227               else
2228               if(rs2[i]){
2229                 if(s2l>=0) emit_mov(s2l,tl);
2230                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2231               }
2232               else emit_zeroreg(tl);
2233             } else
2234             if(opcode2[i]==0x27) { // NOR
2235               if(rs1[i]){
2236                 if(s1l>=0) emit_not(s1l,tl);
2237                 else {
2238                   emit_loadreg(rs1[i],tl);
2239                   emit_not(tl,tl);
2240                 }
2241               }
2242               else
2243               if(rs2[i]){
2244                 if(s2l>=0) emit_not(s2l,tl);
2245                 else {
2246                   emit_loadreg(rs2[i],tl);
2247                   emit_not(tl,tl);
2248                 }
2249               }
2250               else emit_movimm(-1,tl);
2251             }
2252           }
2253         }
2254       }
2255     }
2256   }
2257 }
2258
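// Emit code for the immediate ops: LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU
// and ANDI/ORI/XORI. Where the source is a known constant (constmap), the
// result is folded into a single move-immediate.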
2259 void imm16_assemble(int i,struct regstat *i_regs)
2260 {
2261   if (opcode[i]==0x0f) { // LUI
2262     if(rt1[i]) {
2263       signed char t;
2264       t=get_reg(i_regs->regmap,rt1[i]);
2265       //assert(t>=0);
2266       if(t>=0) {
2267         if(!((i_regs->isconst>>t)&1))
2268           emit_movimm(imm[i]<<16,t);
2269       }
2270     }
2271   }
2272   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2273     if(rt1[i]) {
2274       signed char s,t;
2275       t=get_reg(i_regs->regmap,rt1[i]);
2276       s=get_reg(i_regs->regmap,rs1[i]);
2277       if(rs1[i]) {
2278         //assert(t>=0);
2279         //assert(s>=0);
2280         if(t>=0) {
2281           if(!((i_regs->isconst>>t)&1)) {
2282             if(s<0) {
2283               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2284               emit_addimm(t,imm[i],t);
2285             }else{
2286               if(!((i_regs->wasconst>>s)&1))
2287                 emit_addimm(s,imm[i],t);
2288               else
2289                 emit_movimm(constmap[i][s]+imm[i],t);
2290             }
2291           }
2292         }
2293       } else {
2294         if(t>=0) {
2295           if(!((i_regs->isconst>>t)&1))
2296             emit_movimm(imm[i],t);
2297         }
2298       }
2299     }
2300   }
2301   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2302     if(rt1[i]) {
2303       signed char sh,sl,th,tl;
2304       th=get_reg(i_regs->regmap,rt1[i]|64);
2305       tl=get_reg(i_regs->regmap,rt1[i]);
2306       sh=get_reg(i_regs->regmap,rs1[i]|64);
2307       sl=get_reg(i_regs->regmap,rs1[i]);
2308       if(tl>=0) {
2309         if(rs1[i]) {
2310           assert(sh>=0);
2311           assert(sl>=0);
2312           if(th>=0) {
2313             emit_addimm64_32(sh,sl,imm[i],th,tl);
2314           }
2315           else {
2316             emit_addimm(sl,imm[i],tl);
2317           }
2318         } else {
2319           emit_movimm(imm[i],tl);
2320           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2321         }
2322       }
2323     }
2324   }
2325   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2326     if(rt1[i]) {
2327       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2328       signed char sh,sl,t;
2329       t=get_reg(i_regs->regmap,rt1[i]);
2330       sh=get_reg(i_regs->regmap,rs1[i]|64);
2331       sl=get_reg(i_regs->regmap,rs1[i]);
2332       //assert(t>=0);
2333       if(t>=0) {
2334         if(rs1[i]>0) {
2335           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2336           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2337             if(opcode[i]==0x0a) { // SLTI
2338               if(sl<0) {
2339                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2340                 emit_slti32(t,imm[i],t);
2341               }else{
2342                 emit_slti32(sl,imm[i],t);
2343               }
2344             }
2345             else { // SLTIU
2346               if(sl<0) {
2347                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2348                 emit_sltiu32(t,imm[i],t);
2349               }else{
2350                 emit_sltiu32(sl,imm[i],t);
2351               }
2352             }
2353           }else{ // 64-bit
2354             assert(sl>=0);
2355             if(opcode[i]==0x0a) // SLTI
2356               emit_slti64_32(sh,sl,imm[i],t);
2357             else // SLTIU
2358               emit_sltiu64_32(sh,sl,imm[i],t);
2359           }
2360         }else{
2361           // SLTI(U) with r0 is just stupid,
2362           // but examples of it can nonetheless be found
2363           if(opcode[i]==0x0a) // SLTI
2364             if(0<imm[i]) emit_movimm(1,t);
2365             else emit_zeroreg(t);
2366           else // SLTIU
2367           {
2368             if(imm[i]) emit_movimm(1,t);
2369             else emit_zeroreg(t);
2370           }
2371         }
2372       }
2373     }
2374   }
2375   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2376     if(rt1[i]) {
2377       signed char sh,sl,th,tl;
2378       th=get_reg(i_regs->regmap,rt1[i]|64);
2379       tl=get_reg(i_regs->regmap,rt1[i]);
2380       sh=get_reg(i_regs->regmap,rs1[i]|64);
2381       sl=get_reg(i_regs->regmap,rs1[i]);
2382       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2383         if(opcode[i]==0x0c) //ANDI
2384         {
2385           if(rs1[i]) {
2386             if(sl<0) {
2387               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2388               emit_andimm(tl,imm[i],tl);
2389             }else{
2390               if(!((i_regs->wasconst>>sl)&1))
2391                 emit_andimm(sl,imm[i],tl);
2392               else
2393                 emit_movimm(constmap[i][sl]&imm[i],tl);
2394             }
2395           }
2396           else
2397             emit_zeroreg(tl);
2398           if(th>=0) emit_zeroreg(th);
2399         }
2400         else
2401         {
2402           if(rs1[i]) {
2403             if(sl<0) {
2404               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2405             }
2406             if(th>=0) {
2407               if(sh<0) {
2408                 emit_loadreg(rs1[i]|64,th);
2409               }else{
2410                 emit_mov(sh,th);
2411               }
2412             }
2413             if(opcode[i]==0x0d) { // ORI
2414               if(sl<0) {
2415                 emit_orimm(tl,imm[i],tl);
2416               }else{
2417                 if(!((i_regs->wasconst>>sl)&1))
2418                   emit_orimm(sl,imm[i],tl);
2419                 else
2420                   emit_movimm(constmap[i][sl]|imm[i],tl);
2421               }
2422             }
2423             if(opcode[i]==0x0e) { // XORI
2424               if(sl<0) {
2425                 emit_xorimm(tl,imm[i],tl);
2426               }else{
2427                 if(!((i_regs->wasconst>>sl)&1))
2428                   emit_xorimm(sl,imm[i],tl);
2429                 else
2430                   emit_movimm(constmap[i][sl]^imm[i],tl);
2431               }
2432             }
2433           }
2434           else {
2435             emit_movimm(imm[i],tl);
2436             if(th>=0) emit_zeroreg(th);
2437           }
2438         }
2439       }
2440     }
2441   }
2442 }
2443
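// Emit code for shift-by-immediate ops: SLL/SRL/SRA, the doubleword
// DSLL/DSRL/DSRA, and the DSLL32/DSRL32/DSRA32 variants that shift by 32 or more.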
2444 void shiftimm_assemble(int i,struct regstat *i_regs)
2445 {
2446   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2447   {
2448     if(rt1[i]) {
2449       signed char s,t;
2450       t=get_reg(i_regs->regmap,rt1[i]);
2451       s=get_reg(i_regs->regmap,rs1[i]);
2452       //assert(t>=0);
2453       if(t>=0&&!((i_regs->isconst>>t)&1)){
2454         if(rs1[i]==0)
2455         {
2456           emit_zeroreg(t);
2457         }
2458         else
2459         {
2460           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2461           if(imm[i]) {
2462             if(opcode2[i]==0) // SLL
2463             {
2464               emit_shlimm(s<0?t:s,imm[i],t);
2465             }
2466             if(opcode2[i]==2) // SRL
2467             {
2468               emit_shrimm(s<0?t:s,imm[i],t);
2469             }
2470             if(opcode2[i]==3) // SRA
2471             {
2472               emit_sarimm(s<0?t:s,imm[i],t);
2473             }
2474           }else{
2475             // Shift by zero
2476             if(s>=0 && s!=t) emit_mov(s,t);
2477           }
2478         }
2479       }
2480       //emit_storereg(rt1[i],t); //DEBUG
2481     }
2482   }
2483   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2484   {
2485     if(rt1[i]) {
2486       signed char sh,sl,th,tl;
2487       th=get_reg(i_regs->regmap,rt1[i]|64);
2488       tl=get_reg(i_regs->regmap,rt1[i]);
2489       sh=get_reg(i_regs->regmap,rs1[i]|64);
2490       sl=get_reg(i_regs->regmap,rs1[i]);
2491       if(tl>=0) {
2492         if(rs1[i]==0)
2493         {
2494           emit_zeroreg(tl);
2495           if(th>=0) emit_zeroreg(th);
2496         }
2497         else
2498         {
2499           assert(sl>=0);
2500           assert(sh>=0);
2501           if(imm[i]) {
2502             if(opcode2[i]==0x38) // DSLL
2503             {
2504               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2505               emit_shlimm(sl,imm[i],tl);
2506             }
2507             if(opcode2[i]==0x3a) // DSRL
2508             {
2509               emit_shrdimm(sl,sh,imm[i],tl);
2510               if(th>=0) emit_shrimm(sh,imm[i],th);
2511             }
2512             if(opcode2[i]==0x3b) // DSRA
2513             {
2514               emit_shrdimm(sl,sh,imm[i],tl);
2515               if(th>=0) emit_sarimm(sh,imm[i],th);
2516             }
2517           }else{
2518             // Shift by zero
2519             if(sl!=tl) emit_mov(sl,tl);
2520             if(th>=0&&sh!=th) emit_mov(sh,th);
2521           }
2522         }
2523       }
2524     }
2525   }
2526   if(opcode2[i]==0x3c) // DSLL32
2527   {
2528     if(rt1[i]) {
2529       signed char sl,tl,th;
2530       tl=get_reg(i_regs->regmap,rt1[i]);
2531       th=get_reg(i_regs->regmap,rt1[i]|64);
2532       sl=get_reg(i_regs->regmap,rs1[i]);
2533       if(th>=0||tl>=0){
2534         assert(tl>=0);
2535         assert(th>=0);
2536         assert(sl>=0);
2537         emit_mov(sl,th);
2538         emit_zeroreg(tl);
2539         if(imm[i]>32)
2540         {
2541           emit_shlimm(th,imm[i]&31,th);
2542         }
2543       }
2544     }
2545   }
2546   if(opcode2[i]==0x3e) // DSRL32
2547   {
2548     if(rt1[i]) {
2549       signed char sh,tl,th;
2550       tl=get_reg(i_regs->regmap,rt1[i]);
2551       th=get_reg(i_regs->regmap,rt1[i]|64);
2552       sh=get_reg(i_regs->regmap,rs1[i]|64);
2553       if(tl>=0){
2554         assert(sh>=0);
2555         emit_mov(sh,tl);
2556         if(th>=0) emit_zeroreg(th);
2557         if(imm[i]>32)
2558         {
2559           emit_shrimm(tl,imm[i]&31,tl);
2560         }
2561       }
2562     }
2563   }
2564   if(opcode2[i]==0x3f) // DSRA32
2565   {
2566     if(rt1[i]) {
2567       signed char sh,tl;
2568       tl=get_reg(i_regs->regmap,rt1[i]);
2569       sh=get_reg(i_regs->regmap,rs1[i]|64);
2570       if(tl>=0){
2571         assert(sh>=0);
2572         emit_mov(sh,tl);
2573         if(imm[i]>32)
2574         {
2575           emit_sarimm(tl,imm[i]&31,tl);
2576         }
2577       }
2578     }
2579   }
2580 }
2581
2582 #ifndef shift_assemble
2583 void shift_assemble(int i,struct regstat *i_regs)
2584 {
2585   printf("Need shift_assemble for this architecture.\n");
2586   exit(1);
2587 }
2588 #endif
2589
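// Emit code for loads (LB/LH/LW/LBU/LHU/LWU/LD). RAM accesses take the fast
// path; the range check from emit_fastpath_cmp_jump() branches to a LOADx_STUB
// for everything else, and loads from a known non-RAM address are emitted as
// an inline_readstub call.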
2590 void load_assemble(int i,struct regstat *i_regs)
2591 {
2592   int s,th,tl,addr,map=-1;
2593   int offset;
2594   int jaddr=0;
2595   int memtarget=0,c=0;
2596   int fastload_reg_override=0;
2597   u_int hr,reglist=0;
2598   th=get_reg(i_regs->regmap,rt1[i]|64);
2599   tl=get_reg(i_regs->regmap,rt1[i]);
2600   s=get_reg(i_regs->regmap,rs1[i]);
2601   offset=imm[i];
2602   for(hr=0;hr<HOST_REGS;hr++) {
2603     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2604   }
2605   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2606   if(s>=0) {
2607     c=(i_regs->wasconst>>s)&1;
2608     if (c) {
2609       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2610     }
2611   }
2612   //printf("load_assemble: c=%d\n",c);
2613   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2614   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2615   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2616     ||rt1[i]==0) {
2617       // could be FIFO, so the read must still be performed
2618       // (or a dummy read, rt1[i]==0)
2619       assem_debug("(forced read)\n");
2620       tl=get_reg(i_regs->regmap,-1);
2621       assert(tl>=0);
2622   }
2623   if(offset||s<0||c) addr=tl;
2624   else addr=s;
2625   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2626  if(tl>=0) {
2627   //printf("load_assemble: c=%d\n",c);
2628   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2629   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2630   reglist&=~(1<<tl);
2631   if(th>=0) reglist&=~(1<<th);
2632   if(!c) {
2633     #ifdef RAM_OFFSET
2634     map=get_reg(i_regs->regmap,ROREG);
2635     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2636     #endif
2637     #ifdef R29_HACK
2638     // Strmnnrmn's speed hack
2639     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2640     #endif
2641     {
2642       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2643     }
2644   }
2645   else if(ram_offset&&memtarget) {
2646     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2647     fastload_reg_override=HOST_TEMPREG;
2648   }
2649   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2650   if (opcode[i]==0x20) { // LB
2651     if(!c||memtarget) {
2652       if(!dummy) {
2653         #ifdef HOST_IMM_ADDR32
2654         if(c)
2655           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2656         else
2657         #endif
2658         {
2659           //emit_xorimm(addr,3,tl);
2660           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2661           int x=0,a=tl;
2662 #ifdef BIG_ENDIAN_MIPS
2663           if(!c) emit_xorimm(addr,3,tl);
2664           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2665 #else
2666           if(!c) a=addr;
2667 #endif
2668           if(fastload_reg_override) a=fastload_reg_override;
2669
2670           emit_movsbl_indexed_tlb(x,a,map,tl);
2671         }
2672       }
2673       if(jaddr)
2674         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2675     }
2676     else
2677       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2678   }
2679   if (opcode[i]==0x21) { // LH
2680     if(!c||memtarget) {
2681       if(!dummy) {
2682         #ifdef HOST_IMM_ADDR32
2683         if(c)
2684           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2685         else
2686         #endif
2687         {
2688           int x=0,a=tl;
2689 #ifdef BIG_ENDIAN_MIPS
2690           if(!c) emit_xorimm(addr,2,tl);
2691           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2692 #else
2693           if(!c) a=addr;
2694 #endif
2695           if(fastload_reg_override) a=fastload_reg_override;
2696           //#ifdef
2697           //emit_movswl_indexed_tlb(x,tl,map,tl);
2698           //else
2699           if(map>=0) {
2700             emit_movswl_indexed(x,a,tl);
2701           }else{
2702             #if 1 //def RAM_OFFSET
2703             emit_movswl_indexed(x,a,tl);
2704             #else
2705             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2706             #endif
2707           }
2708         }
2709       }
2710       if(jaddr)
2711         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2712     }
2713     else
2714       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2715   }
2716   if (opcode[i]==0x23) { // LW
2717     if(!c||memtarget) {
2718       if(!dummy) {
2719         int a=addr;
2720         if(fastload_reg_override) a=fastload_reg_override;
2721         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2722         #ifdef HOST_IMM_ADDR32
2723         if(c)
2724           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2725         else
2726         #endif
2727         emit_readword_indexed_tlb(0,a,map,tl);
2728       }
2729       if(jaddr)
2730         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2731     }
2732     else
2733       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2734   }
2735   if (opcode[i]==0x24) { // LBU
2736     if(!c||memtarget) {
2737       if(!dummy) {
2738         #ifdef HOST_IMM_ADDR32
2739         if(c)
2740           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2741         else
2742         #endif
2743         {
2744           //emit_xorimm(addr,3,tl);
2745           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2746           int x=0,a=tl;
2747 #ifdef BIG_ENDIAN_MIPS
2748           if(!c) emit_xorimm(addr,3,tl);
2749           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2750 #else
2751           if(!c) a=addr;
2752 #endif
2753           if(fastload_reg_override) a=fastload_reg_override;
2754
2755           emit_movzbl_indexed_tlb(x,a,map,tl);
2756         }
2757       }
2758       if(jaddr)
2759         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2760     }
2761     else
2762       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2763   }
2764   if (opcode[i]==0x25) { // LHU
2765     if(!c||memtarget) {
2766       if(!dummy) {
2767         #ifdef HOST_IMM_ADDR32
2768         if(c)
2769           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2770         else
2771         #endif
2772         {
2773           int x=0,a=tl;
2774 #ifdef BIG_ENDIAN_MIPS
2775           if(!c) emit_xorimm(addr,2,tl);
2776           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2777 #else
2778           if(!c) a=addr;
2779 #endif
2780           if(fastload_reg_override) a=fastload_reg_override;
2781           //#ifdef
2782           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2783           //#else
2784           if(map>=0) {
2785             emit_movzwl_indexed(x,a,tl);
2786           }else{
2787             #if 1 //def RAM_OFFSET
2788             emit_movzwl_indexed(x,a,tl);
2789             #else
2790             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2791             #endif
2792           }
2793         }
2794       }
2795       if(jaddr)
2796         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2797     }
2798     else
2799       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2800   }
2801   if (opcode[i]==0x27) { // LWU
2802     assert(th>=0);
2803     if(!c||memtarget) {
2804       if(!dummy) {
2805         int a=addr;
2806         if(fastload_reg_override) a=fastload_reg_override;
2807         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2808         #ifdef HOST_IMM_ADDR32
2809         if(c)
2810           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2811         else
2812         #endif
2813         emit_readword_indexed_tlb(0,a,map,tl);
2814       }
2815       if(jaddr)
2816         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2817     }
2818     else {
2819       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2820     }
2821     emit_zeroreg(th);
2822   }
2823   if (opcode[i]==0x37) { // LD
2824     if(!c||memtarget) {
2825       if(!dummy) {
2826         int a=addr;
2827         if(fastload_reg_override) a=fastload_reg_override;
2828         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2829         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2830         #ifdef HOST_IMM_ADDR32
2831         if(c)
2832           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2833         else
2834         #endif
2835         emit_readdword_indexed_tlb(0,a,map,th,tl);
2836       }
2837       if(jaddr)
2838         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2839     }
2840     else
2841       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2842   }
2843  }
2844   //emit_storereg(rt1[i],tl); // DEBUG
2845   //if(opcode[i]==0x23)
2846   //if(opcode[i]==0x24)
2847   //if(opcode[i]==0x23||opcode[i]==0x24)
2848   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2849   {
2850     //emit_pusha();
2851     save_regs(0x100f);
2852         emit_readword((int)&last_count,ECX);
2853         #ifdef __i386__
2854         if(get_reg(i_regs->regmap,CCREG)<0)
2855           emit_loadreg(CCREG,HOST_CCREG);
2856         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2857         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2858         emit_writeword(HOST_CCREG,(int)&Count);
2859         #endif
2860         #ifdef __arm__
2861         if(get_reg(i_regs->regmap,CCREG)<0)
2862           emit_loadreg(CCREG,0);
2863         else
2864           emit_mov(HOST_CCREG,0);
2865         emit_add(0,ECX,0);
2866         emit_addimm(0,2*ccadj[i],0);
2867         emit_writeword(0,(int)&Count);
2868         #endif
2869     emit_call((int)memdebug);
2870     //emit_popa();
2871     restore_regs(0x100f);
2872   }*/
2873 }
2874
2875 #ifndef loadlr_assemble
2876 void loadlr_assemble(int i,struct regstat *i_regs)
2877 {
2878   printf("Need loadlr_assemble for this architecture.\n");
2879   exit(1);
2880 }
2881 #endif
2882
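// Emit code for stores (SB/SH/SW/SD): fast-path RAM writes with a stub for the
// slow case, plus an invalid_code check so that writes which hit compiled code
// get the block invalidated (INVCODE_STUB).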
2883 void store_assemble(int i,struct regstat *i_regs)
2884 {
2885   int s,th,tl,map=-1;
2886   int addr,temp;
2887   int offset;
2888   int jaddr=0,type;
2889   int memtarget=0,c=0;
2890   int agr=AGEN1+(i&1);
2891   int faststore_reg_override=0;
2892   u_int hr,reglist=0;
2893   th=get_reg(i_regs->regmap,rs2[i]|64);
2894   tl=get_reg(i_regs->regmap,rs2[i]);
2895   s=get_reg(i_regs->regmap,rs1[i]);
2896   temp=get_reg(i_regs->regmap,agr);
2897   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2898   offset=imm[i];
2899   if(s>=0) {
2900     c=(i_regs->wasconst>>s)&1;
2901     if(c) {
2902       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2903     }
2904   }
2905   assert(tl>=0);
2906   assert(temp>=0);
2907   for(hr=0;hr<HOST_REGS;hr++) {
2908     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2909   }
2910   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2911   if(offset||s<0||c) addr=temp;
2912   else addr=s;
2913   if(!c) {
2914     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2915   }
2916   else if(ram_offset&&memtarget) {
2917     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2918     faststore_reg_override=HOST_TEMPREG;
2919   }
2920
2921   if (opcode[i]==0x28) { // SB
2922     if(!c||memtarget) {
2923       int x=0,a=temp;
2924 #ifdef BIG_ENDIAN_MIPS
2925       if(!c) emit_xorimm(addr,3,temp);
2926       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2927 #else
2928       if(!c) a=addr;
2929 #endif
2930       if(faststore_reg_override) a=faststore_reg_override;
2931       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2932       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2933     }
2934     type=STOREB_STUB;
2935   }
2936   if (opcode[i]==0x29) { // SH
2937     if(!c||memtarget) {
2938       int x=0,a=temp;
2939 #ifdef BIG_ENDIAN_MIPS
2940       if(!c) emit_xorimm(addr,2,temp);
2941       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2942 #else
2943       if(!c) a=addr;
2944 #endif
2945       if(faststore_reg_override) a=faststore_reg_override;
2946       //#ifdef
2947       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2948       //#else
2949       if(map>=0) {
2950         emit_writehword_indexed(tl,x,a);
2951       }else
2952         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2953         emit_writehword_indexed(tl,x,a);
2954     }
2955     type=STOREH_STUB;
2956   }
2957   if (opcode[i]==0x2B) { // SW
2958     if(!c||memtarget) {
2959       int a=addr;
2960       if(faststore_reg_override) a=faststore_reg_override;
2961       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2962       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2963     }
2964     type=STOREW_STUB;
2965   }
2966   if (opcode[i]==0x3F) { // SD
2967     if(!c||memtarget) {
2968       int a=addr;
2969       if(faststore_reg_override) a=faststore_reg_override;
2970       if(rs2[i]) {
2971         assert(th>=0);
2972         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2973         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2974         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2975       }else{
2976         // Store zero
2977         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2978         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2979         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2980       }
2981     }
2982     type=STORED_STUB;
2983   }
2984   if(jaddr) {
2985     // PCSX store handlers don't check invcode again
2986     reglist|=1<<addr;
2987     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2988     jaddr=0;
2989   }
2990   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2991     if(!c||memtarget) {
2992       #ifdef DESTRUCTIVE_SHIFT
2993       // The x86 shift operation is 'destructive'; it overwrites the
2994       // source register, so we need to make a copy first and use that.
2995       addr=temp;
2996       #endif
2997       #if defined(HOST_IMM8)
2998       int ir=get_reg(i_regs->regmap,INVCP);
2999       assert(ir>=0);
3000       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3001       #else
3002       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3003       #endif
3004       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3005       emit_callne(invalidate_addr_reg[addr]);
3006       #else
3007       int jaddr2=(int)out;
3008       emit_jne(0);
3009       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3010       #endif
3011     }
3012   }
3013   u_int addr_val=constmap[i][s]+offset;
3014   if(jaddr) {
3015     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3016   } else if(c&&!memtarget) {
3017     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3018   }
3019   // Basic detection of stores that modify the current block.
3020   // We don't look backwards, as earlier instructions should already be in the MIPS i-cache.
3021   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3022     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3023     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3024     if(i_regs->regmap==regs[i].regmap) {
3025       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3026       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3027       emit_movimm(start+i*4+4,0);
3028       emit_writeword(0,(int)&pcaddr);
3029       emit_jmp((int)do_interrupt);
3030     }
3031   }
3032   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3033   //if(opcode[i]==0x2B || opcode[i]==0x28)
3034   //if(opcode[i]==0x2B || opcode[i]==0x29)
3035   //if(opcode[i]==0x2B)
3036   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3037   {
3038     #ifdef __i386__
3039     emit_pusha();
3040     #endif
3041     #ifdef __arm__
3042     save_regs(0x100f);
3043     #endif
3044         emit_readword((int)&last_count,ECX);
3045         #ifdef __i386__
3046         if(get_reg(i_regs->regmap,CCREG)<0)
3047           emit_loadreg(CCREG,HOST_CCREG);
3048         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3049         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3050         emit_writeword(HOST_CCREG,(int)&Count);
3051         #endif
3052         #ifdef __arm__
3053         if(get_reg(i_regs->regmap,CCREG)<0)
3054           emit_loadreg(CCREG,0);
3055         else
3056           emit_mov(HOST_CCREG,0);
3057         emit_add(0,ECX,0);
3058         emit_addimm(0,2*ccadj[i],0);
3059         emit_writeword(0,(int)&Count);
3060         #endif
3061     emit_call((int)memdebug);
3062     #ifdef __i386__
3063     emit_popa();
3064     #endif
3065     #ifdef __arm__
3066     restore_regs(0x100f);
3067     #endif
3068   }*/
3069 }
3070
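// Emit code for the unaligned stores SWL/SWR/SDL/SDR: branch on the low two
// address bits and write only the bytes each case covers.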
3071 void storelr_assemble(int i,struct regstat *i_regs)
3072 {
3073   int s,th,tl;
3074   int temp;
3075   int temp2=-1;
3076   int offset;
3077   int jaddr=0;
3078   void *case1, *case2, *case3;
3079   void *done0, *done1, *done2;
3080   int memtarget=0,c=0;
3081   int agr=AGEN1+(i&1);
3082   u_int hr,reglist=0;
3083   th=get_reg(i_regs->regmap,rs2[i]|64);
3084   tl=get_reg(i_regs->regmap,rs2[i]);
3085   s=get_reg(i_regs->regmap,rs1[i]);
3086   temp=get_reg(i_regs->regmap,agr);
3087   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3088   offset=imm[i];
3089   if(s>=0) {
3090     c=(i_regs->isconst>>s)&1;
3091     if(c) {
3092       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3093     }
3094   }
3095   assert(tl>=0);
3096   for(hr=0;hr<HOST_REGS;hr++) {
3097     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3098   }
3099   assert(temp>=0);
3100   if(!c) {
3101     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3102     if(!offset&&s!=temp) emit_mov(s,temp);
3103     jaddr=(int)out;
3104     emit_jno(0);
3105   }
3106   else
3107   {
3108     if(!memtarget||!rs1[i]) {
3109       jaddr=(int)out;
3110       emit_jmp(0);
3111     }
3112   }
3113   #ifdef RAM_OFFSET
3114   int map=get_reg(i_regs->regmap,ROREG);
3115   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3116   #else
3117   if((u_int)rdram!=0x80000000)
3118     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3119   #endif
3120
3121   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3122     temp2=get_reg(i_regs->regmap,FTEMP);
3123     if(!rs2[i]) temp2=th=tl;
3124   }
3125
3126 #ifndef BIG_ENDIAN_MIPS
3127     emit_xorimm(temp,3,temp);
3128 #endif
3129   emit_testimm(temp,2);
3130   case2=out;
3131   emit_jne(0);
3132   emit_testimm(temp,1);
3133   case1=out;
3134   emit_jne(0);
3135   // 0
3136   if (opcode[i]==0x2A) { // SWL
3137     emit_writeword_indexed(tl,0,temp);
3138   }
3139   if (opcode[i]==0x2E) { // SWR
3140     emit_writebyte_indexed(tl,3,temp);
3141   }
3142   if (opcode[i]==0x2C) { // SDL
3143     emit_writeword_indexed(th,0,temp);
3144     if(rs2[i]) emit_mov(tl,temp2);
3145   }
3146   if (opcode[i]==0x2D) { // SDR
3147     emit_writebyte_indexed(tl,3,temp);
3148     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3149   }
3150   done0=out;
3151   emit_jmp(0);
3152   // 1
3153   set_jump_target(case1, out);
3154   if (opcode[i]==0x2A) { // SWL
3155     // Write 3 msb into three least significant bytes
3156     if(rs2[i]) emit_rorimm(tl,8,tl);
3157     emit_writehword_indexed(tl,-1,temp);
3158     if(rs2[i]) emit_rorimm(tl,16,tl);
3159     emit_writebyte_indexed(tl,1,temp);
3160     if(rs2[i]) emit_rorimm(tl,8,tl);
3161   }
3162   if (opcode[i]==0x2E) { // SWR
3163     // Write two lsb into two most significant bytes
3164     emit_writehword_indexed(tl,1,temp);
3165   }
3166   if (opcode[i]==0x2C) { // SDL
3167     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3168     // Write 3 msb into three least significant bytes
3169     if(rs2[i]) emit_rorimm(th,8,th);
3170     emit_writehword_indexed(th,-1,temp);
3171     if(rs2[i]) emit_rorimm(th,16,th);
3172     emit_writebyte_indexed(th,1,temp);
3173     if(rs2[i]) emit_rorimm(th,8,th);
3174   }
3175   if (opcode[i]==0x2D) { // SDR
3176     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3177     // Write two lsb into two most significant bytes
3178     emit_writehword_indexed(tl,1,temp);
3179   }
3180   done1=out;
3181   emit_jmp(0);
3182   // 2
3183   set_jump_target(case2, out);
3184   emit_testimm(temp,1);
3185   case3=out;
3186   emit_jne(0);
3187   if (opcode[i]==0x2A) { // SWL
3188     // Write two msb into two least significant bytes
3189     if(rs2[i]) emit_rorimm(tl,16,tl);
3190     emit_writehword_indexed(tl,-2,temp);
3191     if(rs2[i]) emit_rorimm(tl,16,tl);
3192   }
3193   if (opcode[i]==0x2E) { // SWR
3194     // Write 3 lsb into three most significant bytes
3195     emit_writebyte_indexed(tl,-1,temp);
3196     if(rs2[i]) emit_rorimm(tl,8,tl);
3197     emit_writehword_indexed(tl,0,temp);
3198     if(rs2[i]) emit_rorimm(tl,24,tl);
3199   }
3200   if (opcode[i]==0x2C) { // SDL
3201     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3202     // Write two msb into two least significant bytes
3203     if(rs2[i]) emit_rorimm(th,16,th);
3204     emit_writehword_indexed(th,-2,temp);
3205     if(rs2[i]) emit_rorimm(th,16,th);
3206   }
3207   if (opcode[i]==0x2D) { // SDR
3208     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3209     // Write 3 lsb into three most significant bytes
3210     emit_writebyte_indexed(tl,-1,temp);
3211     if(rs2[i]) emit_rorimm(tl,8,tl);
3212     emit_writehword_indexed(tl,0,temp);
3213     if(rs2[i]) emit_rorimm(tl,24,tl);
3214   }
3215   done2=out;
3216   emit_jmp(0);
3217   // 3
3218   set_jump_target(case3, out);
3219   if (opcode[i]==0x2A) { // SWL
3220     // Write msb into least significant byte
3221     if(rs2[i]) emit_rorimm(tl,24,tl);
3222     emit_writebyte_indexed(tl,-3,temp);
3223     if(rs2[i]) emit_rorimm(tl,8,tl);
3224   }
3225   if (opcode[i]==0x2E) { // SWR
3226     // Write entire word
3227     emit_writeword_indexed(tl,-3,temp);
3228   }
3229   if (opcode[i]==0x2C) { // SDL
3230     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3231     // Write msb into least significant byte
3232     if(rs2[i]) emit_rorimm(th,24,th);
3233     emit_writebyte_indexed(th,-3,temp);
3234     if(rs2[i]) emit_rorimm(th,8,th);
3235   }
3236   if (opcode[i]==0x2D) { // SDR
3237     if(rs2[i]) emit_mov(th,temp2);
3238     // Write entire word
3239     emit_writeword_indexed(tl,-3,temp);
3240   }
3241   set_jump_target(done0, out);
3242   set_jump_target(done1, out);
3243   set_jump_target(done2, out);
3244   if (opcode[i]==0x2C) { // SDL
3245     emit_testimm(temp,4);
3246     done0=out;
3247     emit_jne(0);
3248     emit_andimm(temp,~3,temp);
3249     emit_writeword_indexed(temp2,4,temp);
3250     set_jump_target(done0, out);
3251   }
3252   if (opcode[i]==0x2D) { // SDR
3253     emit_testimm(temp,4);
3254     done0=out;
3255     emit_jeq(0);
3256     emit_andimm(temp,~3,temp);
3257     emit_writeword_indexed(temp2,-4,temp);
3258     set_jump_target(done0, out);
3259   }
3260   if(!c||!memtarget)
3261     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3262   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3263     #ifdef RAM_OFFSET
3264     int map=get_reg(i_regs->regmap,ROREG);
3265     if(map<0) map=HOST_TEMPREG;
3266     gen_orig_addr_w(temp,map);
3267     #else
3268     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3269     #endif
3270     #if defined(HOST_IMM8)
3271     int ir=get_reg(i_regs->regmap,INVCP);
3272     assert(ir>=0);
3273     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3274     #else
3275     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3276     #endif
3277     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3278     emit_callne(invalidate_addr_reg[temp]);
3279     #else
3280     int jaddr2=(int)out;
3281     emit_jne(0);
3282     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3283     #endif
3284   }
3285   /*
3286     emit_pusha();
3287     //save_regs(0x100f);
3288         emit_readword((int)&last_count,ECX);
3289         if(get_reg(i_regs->regmap,CCREG)<0)
3290           emit_loadreg(CCREG,HOST_CCREG);
3291         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3292         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3293         emit_writeword(HOST_CCREG,(int)&Count);
3294     emit_call((int)memdebug);
3295     emit_popa();
3296     //restore_regs(0x100f);
3297   */
3298 }
3299
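// LWC1/SWC1: the PSX has no FPU, so COP1 is treated as unusable.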
3300 void c1ls_assemble(int i,struct regstat *i_regs)
3301 {
3302   cop1_unusable(i, i_regs);
3303 }
3304
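// Emit code for LWC2/SWC2 (GTE data register loads/stores). The value is moved
// to/from the GTE register with cop2_get_dreg/cop2_put_dreg, and the memory
// access uses the same fast-path/stub scheme as ordinary loads and stores.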
3305 void c2ls_assemble(int i,struct regstat *i_regs)
3306 {
3307   int s,tl;
3308   int ar;
3309   int offset;
3310   int memtarget=0,c=0;
3311   int jaddr2=0,type;
3312   int agr=AGEN1+(i&1);
3313   int fastio_reg_override=0;
3314   u_int hr,reglist=0;
3315   u_int copr=(source[i]>>16)&0x1f;
3316   s=get_reg(i_regs->regmap,rs1[i]);
3317   tl=get_reg(i_regs->regmap,FTEMP);
3318   offset=imm[i];
3319   assert(rs1[i]>0);
3320   assert(tl>=0);
3321
3322   for(hr=0;hr<HOST_REGS;hr++) {
3323     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3324   }
3325   if(i_regs->regmap[HOST_CCREG]==CCREG)
3326     reglist&=~(1<<HOST_CCREG);
3327
3328   // get the address
3329   if (opcode[i]==0x3a) { // SWC2
3330     ar=get_reg(i_regs->regmap,agr);
3331     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3332     reglist|=1<<ar;
3333   } else { // LWC2
3334     ar=tl;
3335   }
3336   if(s>=0) c=(i_regs->wasconst>>s)&1;
3337   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3338   if (!offset&&!c&&s>=0) ar=s;
3339   assert(ar>=0);
3340
3341   if (opcode[i]==0x3a) { // SWC2
3342     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3343     type=STOREW_STUB;
3344   }
3345   else
3346     type=LOADW_STUB;
3347
3348   if(c&&!memtarget) {
3349     jaddr2=(int)out;
3350     emit_jmp(0); // inline_readstub/inline_writestub?
3351   }
3352   else {
3353     if(!c) {
3354       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3355     }
3356     else if(ram_offset&&memtarget) {
3357       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3358       fastio_reg_override=HOST_TEMPREG;
3359     }
3360     if (opcode[i]==0x32) { // LWC2
3361       #ifdef HOST_IMM_ADDR32
3362       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3363       else
3364       #endif
3365       int a=ar;
3366       if(fastio_reg_override) a=fastio_reg_override;
3367       emit_readword_indexed(0,a,tl);
3368     }
3369     if (opcode[i]==0x3a) { // SWC2
3370       #ifdef DESTRUCTIVE_SHIFT
3371       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3372       #endif
3373       int a=ar;
3374       if(fastio_reg_override) a=fastio_reg_override;
3375       emit_writeword_indexed(tl,0,a);
3376     }
3377   }
3378   if(jaddr2)
3379     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3380   if(opcode[i]==0x3a) // SWC2
3381   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3382 #if defined(HOST_IMM8)
3383     int ir=get_reg(i_regs->regmap,INVCP);
3384     assert(ir>=0);
3385     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3386 #else
3387     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3388 #endif
3389     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3390     emit_callne(invalidate_addr_reg[ar]);
3391     #else
3392     int jaddr3=(int)out;
3393     emit_jne(0);
3394     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3395     #endif
3396   }
3397   if (opcode[i]==0x32) { // LWC2
3398     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3399   }
3400 }
3401
3402 #ifndef multdiv_assemble
3403 void multdiv_assemble(int i,struct regstat *i_regs)
3404 {
3405   printf("Need multdiv_assemble for this architecture.\n");
3406   exit(1);
3407 }
3408 #endif
3409
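// Emit code for MFHI/MFLO/MTHI/MTLO-style moves: copy the source register pair
// (low word and, if allocated, high word) into the destination.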
3410 void mov_assemble(int i,struct regstat *i_regs)
3411 {
3412   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3413   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3414   if(rt1[i]) {
3415     signed char sh,sl,th,tl;
3416     th=get_reg(i_regs->regmap,rt1[i]|64);
3417     tl=get_reg(i_regs->regmap,rt1[i]);
3418     //assert(tl>=0);
3419     if(tl>=0) {
3420       sh=get_reg(i_regs->regmap,rs1[i]|64);
3421       sl=get_reg(i_regs->regmap,rs1[i]);
3422       if(sl>=0) emit_mov(sl,tl);
3423       else emit_loadreg(rs1[i],tl);
3424       if(th>=0) {
3425         if(sh>=0) emit_mov(sh,th);
3426         else emit_loadreg(rs1[i]|64,th);
3427       }
3428     }
3429   }
3430 }
3431
3432 #ifndef fconv_assemble
3433 void fconv_assemble(int i,struct regstat *i_regs)
3434 {
3435   printf("Need fconv_assemble for this architecture.\n");
3436   exit(1);
3437 }
3438 #endif
3439
3440 #if 0
3441 void float_assemble(int i,struct regstat *i_regs)
3442 {
3443   printf("Need float_assemble for this architecture.\n");
3444   exit(1);
3445 }
3446 #endif
3447
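// SYSCALL: flush the PC and the cycle count, then jump to the HLE
// syscall handler.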
3448 void syscall_assemble(int i,struct regstat *i_regs)
3449 {
3450   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3451   assert(ccreg==HOST_CCREG);
3452   assert(!is_delayslot);
3453   (void)ccreg;
3454   emit_movimm(start+i*4,EAX); // Get PC
3455   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3456   emit_jmp((int)jump_syscall_hle); // XXX
3457 }
3458
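// HLECALL: load the PC of the next instruction and the HLE handler indexed
// by the low 26 bits of the opcode (psxNULL for out-of-range indices) into
// the argument registers, update the cycle count and jump to the dispatcher.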
3459 void hlecall_assemble(int i,struct regstat *i_regs)
3460 {
3461   extern void psxNULL();
3462   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3463   assert(ccreg==HOST_CCREG);
3464   assert(!is_delayslot);
3465   (void)ccreg;
3466   emit_movimm(start+i*4+4,0); // Get PC
3467   uint32_t hleCode = source[i] & 0x03ffffff;
3468   if (hleCode >= (sizeof(psxHLEt) / sizeof(psxHLEt[0])))
3469     emit_movimm((int)psxNULL,1);
3470   else
3471     emit_movimm((int)psxHLEt[hleCode],1);
3472   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3473   emit_jmp((int)jump_hlecall);
3474 }
3475
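// INTCALL: flush the PC and cycle count and jump to jump_intcall
// (fall back to the interpreter for this instruction).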
3476 void intcall_assemble(int i,struct regstat *i_regs)
3477 {
3478   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3479   assert(ccreg==HOST_CCREG);
3480   assert(!is_delayslot);
3481   (void)ccreg;
3482   emit_movimm(start+i*4,0); // Get PC
3483   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3484   emit_jmp((int)jump_intcall);
3485 }
3486
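// Assemble the instruction in a branch delay slot.  Dispatches to the
// per-type assembler with is_delayslot set; branches are not valid here.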
3487 void ds_assemble(int i,struct regstat *i_regs)
3488 {
3489   speculate_register_values(i);
3490   is_delayslot=1;
3491   switch(itype[i]) {
3492     case ALU:
3493       alu_assemble(i,i_regs);break;
3494     case IMM16:
3495       imm16_assemble(i,i_regs);break;
3496     case SHIFT:
3497       shift_assemble(i,i_regs);break;
3498     case SHIFTIMM:
3499       shiftimm_assemble(i,i_regs);break;
3500     case LOAD:
3501       load_assemble(i,i_regs);break;
3502     case LOADLR:
3503       loadlr_assemble(i,i_regs);break;
3504     case STORE:
3505       store_assemble(i,i_regs);break;
3506     case STORELR:
3507       storelr_assemble(i,i_regs);break;
3508     case COP0:
3509       cop0_assemble(i,i_regs);break;
3510     case COP1:
3511       cop1_assemble(i,i_regs);break;
3512     case C1LS:
3513       c1ls_assemble(i,i_regs);break;
3514     case COP2:
3515       cop2_assemble(i,i_regs);break;
3516     case C2LS:
3517       c2ls_assemble(i,i_regs);break;
3518     case C2OP:
3519       c2op_assemble(i,i_regs);break;
3520     case FCONV:
3521       fconv_assemble(i,i_regs);break;
3522     case FLOAT:
3523       float_assemble(i,i_regs);break;
3524     case FCOMP:
3525       fcomp_assemble(i,i_regs);break;
3526     case MULTDIV:
3527       multdiv_assemble(i,i_regs);break;
3528     case MOV:
3529       mov_assemble(i,i_regs);break;
3530     case SYSCALL:
3531     case HLECALL:
3532     case INTCALL:
3533     case SPAN:
3534     case UJUMP:
3535     case RJUMP:
3536     case CJUMP:
3537     case SJUMP:
3538     case FJUMP:
3539       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3540   }
3541   is_delayslot=0;
3542 }
3543
3544 // Is the branch target a valid internal jump?
3545 int internal_branch(uint64_t i_is32,int addr)
3546 {
3547   if(addr&1) return 0; // Indirect (register) jump
3548   if(addr>=start && addr<start+slen*4-4)
3549   {
3550     //int t=(addr-start)>>2;
3551     // Delay slots are not valid branch targets
3552     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3553     // 64 -> 32 bit transition requires a recompile
3554     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3555     {
3556       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3557       else printf("optimizable: yes\n");
3558     }*/
3559     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3560     return 1;
3561   }
3562   return 0;
3563 }
3564
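// Write back dirty values for MIPS registers that lose their host register
// between the old (pre) and new (entry) register maps, sign-extending the
// upper half of known-32-bit values when it is still needed, then move
// values that merely change host registers with a register-to-register copy.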
3565 #ifndef wb_invalidate
3566 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3567   uint64_t u,uint64_t uu)
3568 {
3569   int hr;
3570   for(hr=0;hr<HOST_REGS;hr++) {
3571     if(hr!=EXCLUDE_REG) {
3572       if(pre[hr]!=entry[hr]) {
3573         if(pre[hr]>=0) {
3574           if((dirty>>hr)&1) {
3575             if(get_reg(entry,pre[hr])<0) {
3576               if(pre[hr]<64) {
3577                 if(!((u>>pre[hr])&1)) {
3578                   emit_storereg(pre[hr],hr);
3579                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3580                     emit_sarimm(hr,31,hr);
3581                     emit_storereg(pre[hr]|64,hr);
3582                   }
3583                 }
3584               }else{
3585                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3586                   emit_storereg(pre[hr],hr);
3587                 }
3588               }
3589             }
3590           }
3591         }
3592       }
3593     }
3594   }
3595   // Move from one register to another (no writeback)
3596   for(hr=0;hr<HOST_REGS;hr++) {
3597     if(hr!=EXCLUDE_REG) {
3598       if(pre[hr]!=entry[hr]) {
3599         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3600           int nr;
3601           if((nr=get_reg(entry,pre[hr]))>=0) {
3602             emit_mov(hr,nr);
3603           }
3604         }
3605       }
3606     }
3607   }
3608 }
3609 #endif
3610
3611 // Load the specified registers
3612 // This only loads the registers given as arguments because
3613 // we don't want to load things that will be overwritten
3614 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3615 {
3616   int hr;
3617   // Load 32-bit regs
3618   for(hr=0;hr<HOST_REGS;hr++) {
3619     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3620       if(entry[hr]!=regmap[hr]) {
3621         if(regmap[hr]==rs1||regmap[hr]==rs2)
3622         {
3623           if(regmap[hr]==0) {
3624             emit_zeroreg(hr);
3625           }
3626           else
3627           {
3628             emit_loadreg(regmap[hr],hr);
3629           }
3630         }
3631       }
3632     }
3633   }
3634   // Load 64-bit regs
3635   for(hr=0;hr<HOST_REGS;hr++) {
3636     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3637       if(entry[hr]!=regmap[hr]) {
3638         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3639         {
3640           assert(regmap[hr]!=64);
3641           if((is32>>(regmap[hr]&63))&1) {
3642             int lr=get_reg(regmap,regmap[hr]-64);
3643             if(lr>=0)
3644               emit_sarimm(lr,31,hr);
3645             else
3646               emit_loadreg(regmap[hr],hr);
3647           }
3648           else
3649           {
3650             emit_loadreg(regmap[hr],hr);
3651           }
3652         }
3653       }
3654     }
3655   }
3656 }
3657
3658 // Load registers prior to the start of a loop
3659 // so that they are not loaded within the loop
3660 static void loop_preload(signed char pre[],signed char entry[])
3661 {
3662   int hr;
3663   for(hr=0;hr<HOST_REGS;hr++) {
3664     if(hr!=EXCLUDE_REG) {
3665       if(pre[hr]!=entry[hr]) {
3666         if(entry[hr]>=0) {
3667           if(get_reg(pre,entry[hr])<0) {
3668             assem_debug("loop preload:\n");
3669             //printf("loop preload: %d\n",hr);
3670             if(entry[hr]==0) {
3671               emit_zeroreg(hr);
3672             }
3673             else if(entry[hr]<TEMPREG)
3674             {
3675               emit_loadreg(entry[hr],hr);
3676             }
3677             else if(entry[hr]-64<TEMPREG)
3678             {
3679               emit_loadreg(entry[hr],hr);
3680             }
3681           }
3682         }
3683       }
3684     }
3685   }
3686 }
3687
3688 // Generate address for load/store instruction
3689 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
3690 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3691 {
3692   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3693     int ra=-1;
3694     int agr=AGEN1+(i&1);
3695     if(itype[i]==LOAD) {
3696       ra=get_reg(i_regs->regmap,rt1[i]);
3697       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3698       assert(ra>=0);
3699     }
3700     if(itype[i]==LOADLR) {
3701       ra=get_reg(i_regs->regmap,FTEMP);
3702     }
3703     if(itype[i]==STORE||itype[i]==STORELR) {
3704       ra=get_reg(i_regs->regmap,agr);
3705       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3706     }
3707     if(itype[i]==C1LS||itype[i]==C2LS) {
3708       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3709         ra=get_reg(i_regs->regmap,FTEMP);
3710       else { // SWC1/SDC1/SWC2/SDC2
3711         ra=get_reg(i_regs->regmap,agr);
3712         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3713       }
3714     }
3715     int rs=get_reg(i_regs->regmap,rs1[i]);
3716     if(ra>=0) {
3717       int offset=imm[i];
3718       int c=(i_regs->wasconst>>rs)&1;
3719       if(rs1[i]==0) {
3720         // Using r0 as a base address
3721         if(!entry||entry[ra]!=agr) {
3722           if (opcode[i]==0x22||opcode[i]==0x26) {
3723             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3724           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3725             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3726           }else{
3727             emit_movimm(offset,ra);
3728           }
3729         } // else did it in the previous cycle
3730       }
3731       else if(rs<0) {
3732         if(!entry||entry[ra]!=rs1[i])
3733           emit_loadreg(rs1[i],ra);
3734         //if(!entry||entry[ra]!=rs1[i])
3735         //  printf("poor load scheduling!\n");
3736       }
3737       else if(c) {
3738         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3739           if(!entry||entry[ra]!=agr) {
3740             if (opcode[i]==0x22||opcode[i]==0x26) {
3741               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3742             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3743               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3744             }else{
3745               #ifdef HOST_IMM_ADDR32
3746               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3747               #endif
3748               emit_movimm(constmap[i][rs]+offset,ra);
3749               regs[i].loadedconst|=1<<ra;
3750             }
3751           } // else did it in the previous cycle
3752         } // else load_consts already did it
3753       }
3754       if(offset&&!c&&rs1[i]) {
3755         if(rs>=0) {
3756           emit_addimm(rs,offset,ra);
3757         }else{
3758           emit_addimm(ra,offset,ra);
3759         }
3760       }
3761     }
3762   }
3763   // Preload constants for next instruction
3764   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3765     int agr,ra;
3766     // Actual address
3767     agr=AGEN1+((i+1)&1);
3768     ra=get_reg(i_regs->regmap,agr);
3769     if(ra>=0) {
3770       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3771       int offset=imm[i+1];
3772       int c=(regs[i+1].wasconst>>rs)&1;
3773       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3774         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3775           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3776         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3777           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3778         }else{
3779           #ifdef HOST_IMM_ADDR32
3780           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3781           #endif
3782           emit_movimm(constmap[i+1][rs]+offset,ra);
3783           regs[i+1].loadedconst|=1<<ra;
3784         }
3785       }
3786       else if(rs1[i+1]==0) {
3787         // Using r0 as a base address
3788         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3789           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3790         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3791           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3792         }else{
3793           emit_movimm(offset,ra);
3794         }
3795       }
3796     }
3797   }
3798 }
3799
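// Determine the value a host register should finally hold: follow the
// constant forward while it stays in the same register (no remap, no branch
// target), folding a following load's immediate into it when the constant
// only serves as that load's address.  Returns 0 if the value turns out to
// be unneeded.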
3800 static int get_final_value(int hr, int i, int *value)
3801 {
3802   int reg=regs[i].regmap[hr];
3803   while(i<slen-1) {
3804     if(regs[i+1].regmap[hr]!=reg) break;
3805     if(!((regs[i+1].isconst>>hr)&1)) break;
3806     if(bt[i+1]) break;
3807     i++;
3808   }
3809   if(i<slen-1) {
3810     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3811       *value=constmap[i][hr];
3812       return 1;
3813     }
3814     if(!bt[i+1]) {
3815       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3816         // Load in delay slot, out-of-order execution
3817         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3818         {
3819           // Precompute load address
3820           *value=constmap[i][hr]+imm[i+2];
3821           return 1;
3822         }
3823       }
3824       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3825       {
3826         // Precompute load address
3827         *value=constmap[i][hr]+imm[i+1];
3828         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3829         return 1;
3830       }
3831     }
3832   }
3833   *value=constmap[i][hr];
3834   //printf("c=%x\n",(int)constmap[i][hr]);
3835   if(i==slen-1) return 1;
3836   if(reg<64) {
3837     return !((unneeded_reg[i+1]>>reg)&1);
3838   }else{
3839     return !((unneeded_reg_upper[i+1]>>reg)&1);
3840   }
3841 }
3842
3843 // Load registers with known constants
3844 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3845 {
3846   int hr,hr2;
3847   // propagate loaded constant flags
3848   if(i==0||bt[i])
3849     regs[i].loadedconst=0;
3850   else {
3851     for(hr=0;hr<HOST_REGS;hr++) {
3852       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3853          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3854       {
3855         regs[i].loadedconst|=1<<hr;
3856       }
3857     }
3858   }
3859   // Load 32-bit regs
3860   for(hr=0;hr<HOST_REGS;hr++) {
3861     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3862       //if(entry[hr]!=regmap[hr]) {
3863       if(!((regs[i].loadedconst>>hr)&1)) {
3864         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3865           int value,similar=0;
3866           if(get_final_value(hr,i,&value)) {
3867             // see if some other register has similar value
3868             for(hr2=0;hr2<HOST_REGS;hr2++) {
3869               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3870                 if(is_similar_value(value,constmap[i][hr2])) {
3871                   similar=1;
3872                   break;
3873                 }
3874               }
3875             }
3876             if(similar) {
3877               int value2;
3878               if(get_final_value(hr2,i,&value2)) // is this needed?
3879                 emit_movimm_from(value2,hr2,value,hr);
3880               else
3881                 emit_movimm(value,hr);
3882             }
3883             else if(value==0) {
3884               emit_zeroreg(hr);
3885             }
3886             else {
3887               emit_movimm(value,hr);
3888             }
3889           }
3890           regs[i].loadedconst|=1<<hr;
3891         }
3892       }
3893     }
3894   }
3895   // Load 64-bit regs
3896   for(hr=0;hr<HOST_REGS;hr++) {
3897     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3898       //if(entry[hr]!=regmap[hr]) {
3899       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3900         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3901           if((is32>>(regmap[hr]&63))&1) {
3902             int lr=get_reg(regmap,regmap[hr]-64);
3903             assert(lr>=0);
3904             emit_sarimm(lr,31,hr);
3905           }
3906           else
3907           {
3908             int value;
3909             if(get_final_value(hr,i,&value)) {
3910               if(value==0) {
3911                 emit_zeroreg(hr);
3912               }
3913               else {
3914                 emit_movimm(value,hr);
3915               }
3916             }
3917           }
3918         }
3919       }
3920     }
3921   }
3922 }
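// Materialize the known constants for every mapped, dirty host register:
// 32-bit values directly, upper 64-bit halves either by sign-extending the
// low half (when the value is known to be 32-bit) or from the constant map.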
3923 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3924 {
3925   int hr;
3926   // Load 32-bit regs
3927   for(hr=0;hr<HOST_REGS;hr++) {
3928     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3929       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3930         int value=constmap[i][hr];
3931         if(value==0) {
3932           emit_zeroreg(hr);
3933         }
3934         else {
3935           emit_movimm(value,hr);
3936         }
3937       }
3938     }
3939   }
3940   // Load 64-bit regs
3941   for(hr=0;hr<HOST_REGS;hr++) {
3942     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3943       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3944         if((is32>>(regmap[hr]&63))&1) {
3945           int lr=get_reg(regmap,regmap[hr]-64);
3946           assert(lr>=0);
3947           emit_sarimm(lr,31,hr);
3948         }
3949         else
3950         {
3951           int value=constmap[i][hr];
3952           if(value==0) {
3953             emit_zeroreg(hr);
3954           }
3955           else {
3956             emit_movimm(value,hr);
3957           }
3958         }
3959       }
3960     }
3961   }
3962 }
3963
3964 // Write out all dirty registers (except cycle count)
3965 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3966 {
3967   int hr;
3968   for(hr=0;hr<HOST_REGS;hr++) {
3969     if(hr!=EXCLUDE_REG) {
3970       if(i_regmap[hr]>0) {
3971         if(i_regmap[hr]!=CCREG) {
3972           if((i_dirty>>hr)&1) {
3973             if(i_regmap[hr]<64) {
3974               emit_storereg(i_regmap[hr],hr);
3975             }else{
3976               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3977                 emit_storereg(i_regmap[hr],hr);
3978               }
3979             }
3980           }
3981         }
3982       }
3983     }
3984   }
3985 }
3986 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3987 // This writes the registers not written by store_regs_bt
3988 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3989 {
3990   int hr;
3991   int t=(addr-start)>>2;
3992   for(hr=0;hr<HOST_REGS;hr++) {
3993     if(hr!=EXCLUDE_REG) {
3994       if(i_regmap[hr]>0) {
3995         if(i_regmap[hr]!=CCREG) {
3996           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3997             if((i_dirty>>hr)&1) {
3998               if(i_regmap[hr]<64) {
3999                 emit_storereg(i_regmap[hr],hr);
4000               }else{
4001                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4002                   emit_storereg(i_regmap[hr],hr);
4003                 }
4004               }
4005             }
4006           }
4007         }
4008       }
4009     }
4010   }
4011 }
4012
4013 // Load all registers (except cycle count)
4014 void load_all_regs(signed char i_regmap[])
4015 {
4016   int hr;
4017   for(hr=0;hr<HOST_REGS;hr++) {
4018     if(hr!=EXCLUDE_REG) {
4019       if(i_regmap[hr]==0) {
4020         emit_zeroreg(hr);
4021       }
4022       else
4023       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4024       {
4025         emit_loadreg(i_regmap[hr],hr);
4026       }
4027     }
4028   }
4029 }
4030
4031 // Load all current registers also needed by next instruction
4032 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4033 {
4034   int hr;
4035   for(hr=0;hr<HOST_REGS;hr++) {
4036     if(hr!=EXCLUDE_REG) {
4037       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4038         if(i_regmap[hr]==0) {
4039           emit_zeroreg(hr);
4040         }
4041         else
4042         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4043         {
4044           emit_loadreg(i_regmap[hr],hr);
4045         }
4046       }
4047     }
4048   }
4049 }
4050
4051 // Load all regs, storing cycle count if necessary
4052 void load_regs_entry(int t)
4053 {
4054   int hr;
4055   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4056   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4057   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4058     emit_storereg(CCREG,HOST_CCREG);
4059   }
4060   // Load 32-bit regs
4061   for(hr=0;hr<HOST_REGS;hr++) {
4062     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4063       if(regs[t].regmap_entry[hr]==0) {
4064         emit_zeroreg(hr);
4065       }
4066       else if(regs[t].regmap_entry[hr]!=CCREG)
4067       {
4068         emit_loadreg(regs[t].regmap_entry[hr],hr);
4069       }
4070     }
4071   }
4072   // Load 64-bit regs
4073   for(hr=0;hr<HOST_REGS;hr++) {
4074     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4075       assert(regs[t].regmap_entry[hr]!=64);
4076       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4077         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4078         if(lr<0) {
4079           emit_loadreg(regs[t].regmap_entry[hr],hr);
4080         }
4081         else
4082         {
4083           emit_sarimm(lr,31,hr);
4084         }
4085       }
4086       else
4087       {
4088         emit_loadreg(regs[t].regmap_entry[hr],hr);
4089       }
4090     }
4091   }
4092 }
4093
4094 // Store dirty registers prior to branch
4095 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4096 {
4097   if(internal_branch(i_is32,addr))
4098   {
4099     int t=(addr-start)>>2;
4100     int hr;
4101     for(hr=0;hr<HOST_REGS;hr++) {
4102       if(hr!=EXCLUDE_REG) {
4103         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4104           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4105             if((i_dirty>>hr)&1) {
4106               if(i_regmap[hr]<64) {
4107                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4108                   emit_storereg(i_regmap[hr],hr);
4109                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4110                     #ifdef DESTRUCTIVE_WRITEBACK
4111                     emit_sarimm(hr,31,hr);
4112                     emit_storereg(i_regmap[hr]|64,hr);
4113                     #else
4114                     emit_sarimm(hr,31,HOST_TEMPREG);
4115                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4116                     #endif
4117                   }
4118                 }
4119               }else{
4120                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4121                   emit_storereg(i_regmap[hr],hr);
4122                 }
4123               }
4124             }
4125           }
4126         }
4127       }
4128     }
4129   }
4130   else
4131   {
4132     // Branch out of this block, write out all dirty regs
4133     wb_dirtys(i_regmap,i_is32,i_dirty);
4134   }
4135 }
4136
4137 // Load all needed registers for branch target
4138 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4139 {
4140   //if(addr>=start && addr<(start+slen*4))
4141   if(internal_branch(i_is32,addr))
4142   {
4143     int t=(addr-start)>>2;
4144     int hr;
4145     // Store the cycle count before loading something else
4146     if(i_regmap[HOST_CCREG]!=CCREG) {
4147       assert(i_regmap[HOST_CCREG]==-1);
4148     }
4149     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4150       emit_storereg(CCREG,HOST_CCREG);
4151     }
4152     // Load 32-bit regs
4153     for(hr=0;hr<HOST_REGS;hr++) {
4154       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4155         #ifdef DESTRUCTIVE_WRITEBACK
4156         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4157         #else
4158         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4159         #endif
4160           if(regs[t].regmap_entry[hr]==0) {
4161             emit_zeroreg(hr);
4162           }
4163           else if(regs[t].regmap_entry[hr]!=CCREG)
4164           {
4165             emit_loadreg(regs[t].regmap_entry[hr],hr);
4166           }
4167         }
4168       }
4169     }
4170     // Load 64-bit regs
4171     for(hr=0;hr<HOST_REGS;hr++) {
4172       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4173         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4174           assert(regs[t].regmap_entry[hr]!=64);
4175           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4176             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4177             if(lr<0) {
4178               emit_loadreg(regs[t].regmap_entry[hr],hr);
4179             }
4180             else
4181             {
4182               emit_sarimm(lr,31,hr);
4183             }
4184           }
4185           else
4186           {
4187             emit_loadreg(regs[t].regmap_entry[hr],hr);
4188           }
4189         }
4190         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4191           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4192           assert(lr>=0);
4193           emit_sarimm(lr,31,hr);
4194         }
4195       }
4196     }
4197   }
4198 }
4199
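// Check whether the current register mapping and dirty state are compatible
// with the branch target's expected entry state, so the branch can be linked
// directly without extra writeback or reload code.  Targets outside the
// block only match if nothing (except the cycle count) is dirty; delay-slot
// targets never match.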
4200 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4201 {
4202   if(addr>=start && addr<start+slen*4-4)
4203   {
4204     int t=(addr-start)>>2;
4205     int hr;
4206     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4207     for(hr=0;hr<HOST_REGS;hr++)
4208     {
4209       if(hr!=EXCLUDE_REG)
4210       {
4211         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4212         {
4213           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4214           {
4215             return 0;
4216           }
4217           else
4218           if((i_dirty>>hr)&1)
4219           {
4220             if(i_regmap[hr]<TEMPREG)
4221             {
4222               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4223                 return 0;
4224             }
4225             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4226             {
4227               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4228                 return 0;
4229             }
4230           }
4231         }
4232         else // Same register but is it 32-bit or dirty?
4233         if(i_regmap[hr]>=0)
4234         {
4235           if(!((regs[t].dirty>>hr)&1))
4236           {
4237             if((i_dirty>>hr)&1)
4238             {
4239               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4240               {
4241                 //printf("%x: dirty no match\n",addr);
4242                 return 0;
4243               }
4244             }
4245           }
4246           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4247           {
4248             //printf("%x: is32 no match\n",addr);
4249             return 0;
4250           }
4251         }
4252       }
4253     }
4254     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4255     // Delay slots are not valid branch targets
4256     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4257     // Delay slots require additional processing, so do not match
4258     if(is_ds[t]) return 0;
4259   }
4260   else
4261   {
4262     int hr;
4263     for(hr=0;hr<HOST_REGS;hr++)
4264     {
4265       if(hr!=EXCLUDE_REG)
4266       {
4267         if(i_regmap[hr]>=0)
4268         {
4269           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4270           {
4271             if((i_dirty>>hr)&1)
4272             {
4273               return 0;
4274             }
4275           }
4276         }
4277       }
4278     }
4279   }
4280   return 1;
4281 }
4282
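// Debug-build hook: save the allocated host registers, record the PC in
// pcaddr and call do_insn_cmp (per-instruction state comparison), then
// restore the registers.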
4283 #ifdef DRC_DBG
4284 static void drc_dbg_emit_do_cmp(int i)
4285 {
4286   extern void do_insn_cmp();
4287   extern int cycle;
4288   u_int hr,reglist=0;
4289
4290   for(hr=0;hr<HOST_REGS;hr++)
4291     if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
4292   save_regs(reglist);
4293   emit_movimm(start+i*4,0);
4294   emit_writeword(0,(int)&pcaddr);
4295   emit_call((int)do_insn_cmp);
4296   //emit_readword((int)&cycle,0);
4297   //emit_addimm(0,2,0);
4298   //emit_writeword(0,(int)&cycle);
4299   restore_regs(reglist);
4300 }
4301 #else
4302 #define drc_dbg_emit_do_cmp(x)
4303 #endif
4304
4305 // Used when a branch jumps into the delay slot of another branch
4306 void ds_assemble_entry(int i)
4307 {
4308   int t=(ba[i]-start)>>2;
4309   if (!instr_addr[t])
4310     instr_addr[t] = out;
4311   assem_debug("Assemble delay slot at %x\n",ba[i]);
4312   assem_debug("<->\n");
4313   drc_dbg_emit_do_cmp(t);
4314   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4315     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4316   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4317   address_generation(t,&regs[t],regs[t].regmap_entry);
4318   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4319     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4320   cop1_usable=0;
4321   is_delayslot=0;
4322   switch(itype[t]) {
4323     case ALU:
4324       alu_assemble(t,&regs[t]);break;
4325     case IMM16:
4326       imm16_assemble(t,&regs[t]);break;
4327     case SHIFT:
4328       shift_assemble(t,&regs[t]);break;
4329     case SHIFTIMM:
4330       shiftimm_assemble(t,&regs[t]);break;
4331     case LOAD:
4332       load_assemble(t,&regs[t]);break;
4333     case LOADLR:
4334       loadlr_assemble(t,&regs[t]);break;
4335     case STORE:
4336       store_assemble(t,&regs[t]);break;
4337     case STORELR:
4338       storelr_assemble(t,&regs[t]);break;
4339     case COP0:
4340       cop0_assemble(t,&regs[t]);break;
4341     case COP1:
4342       cop1_assemble(t,&regs[t]);break;
4343     case C1LS:
4344       c1ls_assemble(t,&regs[t]);break;
4345     case COP2:
4346       cop2_assemble(t,&regs[t]);break;
4347     case C2LS:
4348       c2ls_assemble(t,&regs[t]);break;
4349     case C2OP:
4350       c2op_assemble(t,&regs[t]);break;
4351     case FCONV:
4352       fconv_assemble(t,&regs[t]);break;
4353     case FLOAT:
4354       float_assemble(t,&regs[t]);break;
4355     case FCOMP:
4356       fcomp_assemble(t,&regs[t]);break;
4357     case MULTDIV:
4358       multdiv_assemble(t,&regs[t]);break;
4359     case MOV:
4360       mov_assemble(t,&regs[t]);break;
4361     case SYSCALL:
4362     case HLECALL:
4363     case INTCALL:
4364     case SPAN:
4365     case UJUMP:
4366     case RJUMP:
4367     case CJUMP:
4368     case SJUMP:
4369     case FJUMP:
4370       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4371   }
4372   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4373   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4374   if(internal_branch(regs[t].is32,ba[i]+4))
4375     assem_debug("branch: internal\n");
4376   else
4377     assem_debug("branch: external\n");
4378   assert(internal_branch(regs[t].is32,ba[i]+4));
4379   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4380   emit_jmp(0);
4381 }
4382
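// Emit the cycle counter update and expiry check before a branch.  *adj
// receives the cycle adjustment already applied at an internal target
// (-1 for a branch into a delay slot); a branch to itself over a nop is
// treated as an idle loop, and a CC_STUB is registered for the case where
// the count runs out.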
4383 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4384 {
4385   int count;
4386   int jaddr;
4387   int idle=0;
4388   int t=0;
4389   if(itype[i]==RJUMP)
4390   {
4391     *adj=0;
4392   }
4393   //if(ba[i]>=start && ba[i]<(start+slen*4))
4394   if(internal_branch(branch_regs[i].is32,ba[i]))
4395   {
4396     t=(ba[i]-start)>>2;
4397     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4398     else *adj=ccadj[t];
4399   }
4400   else
4401   {
4402     *adj=0;
4403   }
4404   count=ccadj[i];
4405   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4406     // Idle loop
4407     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4408     idle=(int)out;
4409     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4410     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4411     jaddr=(int)out;
4412     emit_jmp(0);
4413   }
4414   else if(*adj==0||invert) {
4415     int cycles=CLOCK_ADJUST(count+2);
4416     // faster loop HACK
4417     if (t&&*adj) {
4418       int rel=t-i;
4419       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4420         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4421     }
4422     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4423     jaddr=(int)out;
4424     emit_jns(0);
4425   }
4426   else
4427   {
4428     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4429     jaddr=(int)out;
4430     emit_jns(0);
4431   }
4432   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4433 }
4434
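// Out-of-line stub taken when the cycle count expires at a branch: write
// back dirty registers, store the address to resume at (recomputing the
// branch direction when it is not yet known), call cc_interrupt, then
// reload registers and jump back into the compiled code.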
4435 void do_ccstub(int n)
4436 {
4437   literal_pool(256);
4438   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4439   set_jump_target(stubs[n][1], out);
4440   int i=stubs[n][4];
4441   if(stubs[n][6]==NULLDS) {
4442     // Delay slot instruction is nullified ("likely" branch)
4443     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4444   }
4445   else if(stubs[n][6]!=TAKEN) {
4446     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4447   }
4448   else {
4449     if(internal_branch(branch_regs[i].is32,ba[i]))
4450       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4451   }
4452   if(stubs[n][5]!=-1)
4453   {
4454     // Save PC as return address
4455     emit_movimm(stubs[n][5],EAX);
4456     emit_writeword(EAX,(int)&pcaddr);
4457   }
4458   else
4459   {
4460     // Return address depends on which way the branch goes
4461     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4462     {
4463       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4464       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4465       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4466       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4467       if(rs1[i]==0)
4468       {
4469         s1l=s2l;s1h=s2h;
4470         s2l=s2h=-1;
4471       }
4472       else if(rs2[i]==0)
4473       {
4474         s2l=s2h=-1;
4475       }
4476       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4477         s1h=s2h=-1;
4478       }
4479       assert(s1l>=0);
4480       #ifdef DESTRUCTIVE_WRITEBACK
4481       if(rs1[i]) {
4482         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4483           emit_loadreg(rs1[i],s1l);
4484       }
4485       else {
4486         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4487           emit_loadreg(rs2[i],s1l);
4488       }
4489       if(s2l>=0)
4490         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4491           emit_loadreg(rs2[i],s2l);
4492       #endif
4493       int hr=0;
4494       int addr=-1,alt=-1,ntaddr=-1;
4495       while(hr<HOST_REGS)
4496       {
4497         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4498            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4499            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4500         {
4501           addr=hr++;break;
4502         }
4503         hr++;
4504       }
4505       while(hr<HOST_REGS)
4506       {
4507         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4508            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4509            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4510         {
4511           alt=hr++;break;
4512         }
4513         hr++;
4514       }
4515       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4516       {
4517         while(hr<HOST_REGS)
4518         {
4519           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4520              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4521              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4522           {
4523             ntaddr=hr;break;
4524           }
4525           hr++;
4526         }
4527         assert(hr<HOST_REGS);
4528       }
4529       if((opcode[i]&0x2f)==4) // BEQ
4530       {
4531         #ifdef HAVE_CMOV_IMM
4532         if(s1h<0) {
4533           if(s2l>=0) emit_cmp(s1l,s2l);
4534           else emit_test(s1l,s1l);
4535           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4536         }
4537         else
4538         #endif
4539         {
4540           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4541           if(s1h>=0) {
4542             if(s2h>=0) emit_cmp(s1h,s2h);
4543             else emit_test(s1h,s1h);
4544             emit_cmovne_reg(alt,addr);
4545           }
4546           if(s2l>=0) emit_cmp(s1l,s2l);
4547           else emit_test(s1l,s1l);
4548           emit_cmovne_reg(alt,addr);
4549         }
4550       }
4551       if((opcode[i]&0x2f)==5) // BNE
4552       {
4553         #ifdef HAVE_CMOV_IMM
4554         if(s1h<0) {
4555           if(s2l>=0) emit_cmp(s1l,s2l);
4556           else emit_test(s1l,s1l);
4557           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4558         }
4559         else
4560         #endif
4561         {
4562           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4563           if(s1h>=0) {
4564             if(s2h>=0) emit_cmp(s1h,s2h);
4565             else emit_test(s1h,s1h);
4566             emit_cmovne_reg(alt,addr);
4567           }
4568           if(s2l>=0) emit_cmp(s1l,s2l);
4569           else emit_test(s1l,s1l);
4570           emit_cmovne_reg(alt,addr);
4571         }
4572       }
4573       if((opcode[i]&0x2f)==6) // BLEZ
4574       {
4575         //emit_movimm(ba[i],alt);
4576         //emit_movimm(start+i*4+8,addr);
4577         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4578         emit_cmpimm(s1l,1);
4579         if(s1h>=0) emit_mov(addr,ntaddr);
4580         emit_cmovl_reg(alt,addr);
4581         if(s1h>=0) {
4582           emit_test(s1h,s1h);
4583           emit_cmovne_reg(ntaddr,addr);
4584           emit_cmovs_reg(alt,addr);
4585         }
4586       }
4587       if((opcode[i]&0x2f)==7) // BGTZ
4588       {
4589         //emit_movimm(ba[i],addr);
4590         //emit_movimm(start+i*4+8,ntaddr);
4591         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4592         emit_cmpimm(s1l,1);
4593         if(s1h>=0) emit_mov(addr,alt);
4594         emit_cmovl_reg(ntaddr,addr);
4595         if(s1h>=0) {
4596           emit_test(s1h,s1h);
4597           emit_cmovne_reg(alt,addr);
4598           emit_cmovs_reg(ntaddr,addr);
4599         }
4600       }
4601       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4602       {
4603         //emit_movimm(ba[i],alt);
4604         //emit_movimm(start+i*4+8,addr);
4605         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4606         if(s1h>=0) emit_test(s1h,s1h);
4607         else emit_test(s1l,s1l);
4608         emit_cmovs_reg(alt,addr);
4609       }
4610       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4611       {
4612         //emit_movimm(ba[i],addr);
4613         //emit_movimm(start+i*4+8,alt);
4614         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4615         if(s1h>=0) emit_test(s1h,s1h);
4616         else emit_test(s1l,s1l);
4617         emit_cmovs_reg(alt,addr);
4618       }
4619       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4620         if(source[i]&0x10000) // BC1T
4621         {
4622           //emit_movimm(ba[i],alt);
4623           //emit_movimm(start+i*4+8,addr);
4624           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4625           emit_testimm(s1l,0x800000);
4626           emit_cmovne_reg(alt,addr);
4627         }
4628         else // BC1F
4629         {
4630           //emit_movimm(ba[i],addr);
4631           //emit_movimm(start+i*4+8,alt);
4632           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4633           emit_testimm(s1l,0x800000);
4634           emit_cmovne_reg(alt,addr);
4635         }
4636       }
4637       emit_writeword(addr,(int)&pcaddr);
4638     }
4639     else
4640     if(itype[i]==RJUMP)
4641     {
4642       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4643       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4644         r=get_reg(branch_regs[i].regmap,RTEMP);
4645       }
4646       emit_writeword(r,(int)&pcaddr);
4647     }
4648     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4649   }
4650   // Update cycle count
4651   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4652   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4653   emit_call((int)cc_interrupt);
4654   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4655   if(stubs[n][6]==TAKEN) {
4656     if(internal_branch(branch_regs[i].is32,ba[i]))
4657       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4658     else if(itype[i]==RJUMP) {
4659       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4660         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4661       else
4662         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4663     }
4664   }else if(stubs[n][6]==NOTTAKEN) {
4665     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4666     else load_all_regs(branch_regs[i].regmap);
4667   }else if(stubs[n][6]==NULLDS) {
4668     // Delay slot instruction is nullified ("likely" branch)
4669     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4670     else load_all_regs(regs[i].regmap);
4671   }else{
4672     load_all_regs(branch_regs[i].regmap);
4673   }
4674   emit_jmp(stubs[n][2]); // return address
4675
4676   /* This works but uses a lot of memory...
4677   emit_readword((int)&last_count,ECX);
4678   emit_add(HOST_CCREG,ECX,EAX);
4679   emit_writeword(EAX,(int)&Count);
4680   emit_call((int)gen_interupt);
4681   emit_readword((int)&Count,HOST_CCREG);
4682   emit_readword((int)&next_interupt,EAX);
4683   emit_readword((int)&pending_exception,EBX);
4684   emit_writeword(EAX,(int)&last_count);
4685   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4686   emit_test(EBX,EBX);
4687   int jne_instr=(int)out;
4688   emit_jne(0);
4689   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4690   load_all_regs(branch_regs[i].regmap);
4691   emit_jmp(stubs[n][2]); // return address
4692   set_jump_target(jne_instr,(int)out);
4693   emit_readword((int)&pcaddr,EAX);
4694   // Call get_addr_ht instead of doing the hash table here.
4695   // This code is executed infrequently and takes up a lot of space
4696   // so smaller is better.
4697   emit_storereg(CCREG,HOST_CCREG);
4698   emit_pushreg(EAX);
4699   emit_call((int)get_addr_ht);
4700   emit_loadreg(CCREG,HOST_CCREG);
4701   emit_addimm(ESP,4,ESP);
4702   emit_jmpreg(EAX);*/
4703 }
4704
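// Remember a jump site and its target so the branch can be patched once
// the whole block has been assembled.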
4705 static void add_to_linker(int addr,int target,int ext)
4706 {
4707   link_addr[linkcount][0]=addr;
4708   link_addr[linkcount][1]=target;
4709   link_addr[linkcount][2]=ext;
4710   linkcount++;
4711 }
4712
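// Write the JAL return address (branch PC + 8) into the host register
// holding $31, going through the mini hash table insert when that
// optimization is enabled.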
4713 static void ujump_assemble_write_ra(int i)
4714 {
4715   int rt;
4716   unsigned int return_address;
4717   rt=get_reg(branch_regs[i].regmap,31);
4718   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4719   //assert(rt>=0);
4720   return_address=start+i*4+8;
4721   if(rt>=0) {
4722     #ifdef USE_MINI_HT
4723     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4724       int temp=-1; // note: must be ds-safe
4725       #ifdef HOST_TEMPREG
4726       temp=HOST_TEMPREG;
4727       #endif
4728       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4729       else emit_movimm(return_address,rt);
4730     }
4731     else
4732     #endif
4733     {
4734       #ifdef REG_PREFETCH
4735       if(temp>=0)
4736       {
4737         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4738       }
4739       #endif
4740       emit_movimm(return_address,rt); // PC into link register
4741       #ifdef IMM_PREFETCH
4742       emit_prefetch(hash_table_get(return_address));
4743       #endif
4744     }
4745   }
4746 }
4747
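// Unconditional jump (J/JAL): assemble the delay slot, write $31 when
// needed (before the slot if the slot reads it), write back and reload
// registers for the target, adjust the cycle count, then either fall into
// the target's delay-slot entry or emit a jump for the linker to patch.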
4748 void ujump_assemble(int i,struct regstat *i_regs)
4749 {
4750   int ra_done=0;
4751   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4752   address_generation(i+1,i_regs,regs[i].regmap_entry);
4753   #ifdef REG_PREFETCH
4754   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4755   if(rt1[i]==31&&temp>=0)
4756   {
4757     signed char *i_regmap=i_regs->regmap;
4758     int return_address=start+i*4+8;
4759     if(get_reg(branch_regs[i].regmap,31)>0)
4760     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4761   }
4762   #endif
4763   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4764     ujump_assemble_write_ra(i); // writeback ra for DS
4765     ra_done=1;
4766   }
4767   ds_assemble(i+1,i_regs);
4768   uint64_t bc_unneeded=branch_regs[i].u;
4769   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4770   bc_unneeded|=1|(1LL<<rt1[i]);
4771   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4772   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4773                 bc_unneeded,bc_unneeded_upper);
4774   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4775   if(!ra_done&&rt1[i]==31)
4776     ujump_assemble_write_ra(i);
4777   int cc,adj;
4778   cc=get_reg(branch_regs[i].regmap,CCREG);
4779   assert(cc==HOST_CCREG);
4780   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4781   #ifdef REG_PREFETCH
4782   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4783   #endif
4784   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4785   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4786   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4787   if(internal_branch(branch_regs[i].is32,ba[i]))
4788     assem_debug("branch: internal\n");
4789   else
4790     assem_debug("branch: external\n");
4791   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4792     ds_assemble_entry(i);
4793   }
4794   else {
4795     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4796     emit_jmp(0);
4797   }
4798 }
4799
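// Write the JALR return address (branch PC + 8) into the destination
// register.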
4800 static void rjump_assemble_write_ra(int i)
4801 {
4802   int rt,return_address;
4803   assert(rt1[i+1]!=rt1[i]);
4804   assert(rt2[i+1]!=rt1[i]);
4805   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4806   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4807   assert(rt>=0);
4808   return_address=start+i*4+8;
4809   #ifdef REG_PREFETCH
4810   if(temp>=0)
4811   {
4812     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4813   }
4814   #endif
4815   emit_movimm(return_address,rt); // PC into link register
4816   #ifdef IMM_PREFETCH
4817   emit_prefetch(hash_table_get(return_address));
4818   #endif
4819 }
4820
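// Register-indirect jump (JR/JALR): copy the target address to RTEMP if the
// delay slot overwrites it, assemble the delay slot, write the link
// register, update the cycle count (with a CC_STUB for expiry) and dispatch
// through jump_vaddr_reg, or through the mini hash table for jr $31 when
// USE_MINI_HT is defined.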
4821 void rjump_assemble(int i,struct regstat *i_regs)
4822 {
4823   int temp;
4824   int rs,cc;
4825   int ra_done=0;
4826   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4827   assert(rs>=0);
4828   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4829     // Delay slot abuse, make a copy of the branch address register
4830     temp=get_reg(branch_regs[i].regmap,RTEMP);
4831     assert(temp>=0);
4832     assert(regs[i].regmap[temp]==RTEMP);
4833     emit_mov(rs,temp);
4834     rs=temp;
4835   }
4836   address_generation(i+1,i_regs,regs[i].regmap_entry);
4837   #ifdef REG_PREFETCH
4838   if(rt1[i]==31)
4839   {
4840     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4841       signed char *i_regmap=i_regs->regmap;
4842       int return_address=start+i*4+8;
4843       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table_get(return_address),temp);
4844     }
4845   }
4846   #endif
4847   #ifdef USE_MINI_HT
4848   if(rs1[i]==31) {
4849     int rh=get_reg(regs[i].regmap,RHASH);
4850     if(rh>=0) do_preload_rhash(rh);
4851   }
4852   #endif
4853   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4854     rjump_assemble_write_ra(i);
4855     ra_done=1;
4856   }
4857   ds_assemble(i+1,i_regs);
4858   uint64_t bc_unneeded=branch_regs[i].u;
4859   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4860   bc_unneeded|=1|(1LL<<rt1[i]);
4861   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4862   bc_unneeded&=~(1LL<<rs1[i]);
4863   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4864                 bc_unneeded,bc_unneeded_upper);
4865   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4866   if(!ra_done&&rt1[i]!=0)
4867     rjump_assemble_write_ra(i);
4868   cc=get_reg(branch_regs[i].regmap,CCREG);
4869   assert(cc==HOST_CCREG);
4870   (void)cc;
4871   #ifdef USE_MINI_HT
4872   int rh=get_reg(branch_regs[i].regmap,RHASH);
4873   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4874   if(rs1[i]==31) {
4875     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4876     do_preload_rhtbl(ht);
4877     do_rhash(rs,rh);
4878   }
4879   #endif
4880   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4881   #ifdef DESTRUCTIVE_WRITEBACK
4882   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4883     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4884       emit_loadreg(rs1[i],rs);
4885     }
4886   }
4887   #endif
4888   #ifdef REG_PREFETCH
4889   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4890   #endif
4891   #ifdef USE_MINI_HT
4892   if(rs1[i]==31) {
4893     do_miniht_load(ht,rh);
4894   }
4895   #endif
4896   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4897   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4898   //assert(adj==0);
4899   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4900   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4901   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4902     // special case for RFE
4903     emit_jmp(0);
4904   else
4905     emit_jns(0);
4906   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4907   #ifdef USE_MINI_HT
4908   if(rs1[i]==31) {
4909     do_miniht_jump(rs,rh,ht);
4910   }
4911   else
4912   #endif
4913   {
4914     //if(rs!=EAX) emit_mov(rs,EAX);
4915     //emit_jmp((int)jump_vaddr_eax);
4916     emit_jmp(jump_vaddr_reg[rs]);
4917   }
4918   /* Check hash table
4919   temp=!rs;
4920   emit_mov(rs,temp);
4921   emit_shrimm(rs,16,rs);
4922   emit_xor(temp,rs,rs);
4923   emit_movzwl_reg(rs,rs);
4924   emit_shlimm(rs,4,rs);
4925   emit_cmpmem_indexed((int)hash_table,rs,temp);
4926   emit_jne((int)out+14);
4927   emit_readword_indexed((int)hash_table+4,rs,rs);
4928   emit_jmpreg(rs);
4929   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4930   emit_addimm_no_flags(8,rs);
4931   emit_jeq((int)out-17);
4932   // No hit on hash table, call compiler
4933   emit_pushreg(temp);
4934 //DEBUG >
4935 #ifdef DEBUG_CYCLE_COUNT
4936   emit_readword((int)&last_count,ECX);
4937   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4938   emit_readword((int)&next_interupt,ECX);
4939   emit_writeword(HOST_CCREG,(int)&Count);
4940   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4941   emit_writeword(ECX,(int)&last_count);
4942 #endif
4943 //DEBUG <
4944   emit_storereg(CCREG,HOST_CCREG);
4945   emit_call((int)get_addr);
4946   emit_loadreg(CCREG,HOST_CCREG);
4947   emit_addimm(ESP,4,ESP);
4948   emit_jmpreg(EAX);*/
4949   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4950   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4951   #endif
4952 }
4953
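// Conditional branch (BEQ/BNE/BLEZ/BGTZ).  If the register state matches
// the target's entry state the branch is emitted directly, otherwise the
// test is inverted so the not-taken path falls through.  rs1==rs2==0
// degenerates into an unconditional branch (or a nop for BNE/BGTZ), and
// for 64-bit values the upper halves are compared first.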
4954 void cjump_assemble(int i,struct regstat *i_regs)
4955 {
4956   signed char *i_regmap=i_regs->regmap;
4957   int cc;
4958   int match;
4959   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4960   assem_debug("match=%d\n",match);
4961   int s1h,s1l,s2h,s2l;
4962   int prev_cop1_usable=cop1_usable;
4963   int unconditional=0,nop=0;
4964   int only32=0;
4965   int invert=0;
4966   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4967   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4968   if(!match) invert=1;
4969   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4970   if(i>(ba[i]-start)>>2) invert=1;
4971   #endif
4972
4973   if(ooo[i]) {
4974     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4975     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4976     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4977     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4978   }
4979   else {
4980     s1l=get_reg(i_regmap,rs1[i]);
4981     s1h=get_reg(i_regmap,rs1[i]|64);
4982     s2l=get_reg(i_regmap,rs2[i]);
4983     s2h=get_reg(i_regmap,rs2[i]|64);
4984   }
4985   if(rs1[i]==0&&rs2[i]==0)
4986   {
4987     if(opcode[i]&1) nop=1;
4988     else unconditional=1;
4989     //assert(opcode[i]!=5);
4990     //assert(opcode[i]!=7);
4991     //assert(opcode[i]!=0x15);
4992     //assert(opcode[i]!=0x17);
4993   }
4994   else if(rs1[i]==0)
4995   {
4996     s1l=s2l;s1h=s2h;
4997     s2l=s2h=-1;
4998     only32=(regs[i].was32>>rs2[i])&1;
4999   }
5000   else if(rs2[i]==0)
5001   {
5002     s2l=s2h=-1;
5003     only32=(regs[i].was32>>rs1[i])&1;
5004   }
5005   else {
5006     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5007   }
5008
5009   if(ooo[i]) {
5010     // Out of order execution (delay slot first)
5011     //printf("OOOE\n");
5012     address_generation(i+1,i_regs,regs[i].regmap_entry);
5013     ds_assemble(i+1,i_regs);
5014     int adj;
5015     uint64_t bc_unneeded=branch_regs[i].u;
5016     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5017     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5018     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5019     bc_unneeded|=1;
5020     bc_unneeded_upper|=1;
5021     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5022                   bc_unneeded,bc_unneeded_upper);
5023     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5024     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5025     cc=get_reg(branch_regs[i].regmap,CCREG);
5026     assert(cc==HOST_CCREG);
5027     if(unconditional)
5028       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5029     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5030     //assem_debug("cycle count (adj)\n");
5031     if(unconditional) {
5032       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5033       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5034         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5035         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5036         if(internal)
5037           assem_debug("branch: internal\n");
5038         else
5039           assem_debug("branch: external\n");
5040         if(internal&&is_ds[(ba[i]-start)>>2]) {
5041           ds_assemble_entry(i);
5042         }
5043         else {
5044           add_to_linker((int)out,ba[i],internal);
5045           emit_jmp(0);
5046         }
5047         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5048         if(((u_int)out)&7) emit_addnop(0);
5049         #endif
5050       }
5051     }
5052     else if(nop) {
5053       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5054       int jaddr=(int)out;
5055       emit_jns(0);
5056       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5057     }
5058     else {
5059       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5060       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5061       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5062       if(!only32)
5063       {
5064         assert(s1h>=0);
5065         if(opcode[i]==4) // BEQ
5066         {
5067           if(s2h>=0) emit_cmp(s1h,s2h);
5068           else emit_test(s1h,s1h);
5069           nottaken1=out;
5070           emit_jne(1);
5071         }
5072         if(opcode[i]==5) // BNE
5073         {
5074           if(s2h>=0) emit_cmp(s1h,s2h);
5075           else emit_test(s1h,s1h);
5076           if(invert) taken=out;
5077           else add_to_linker((int)out,ba[i],internal);
5078           emit_jne(0);
5079         }
5080         if(opcode[i]==6) // BLEZ
5081         {
5082           emit_test(s1h,s1h);
5083           if(invert) taken=out;
5084           else add_to_linker((int)out,ba[i],internal);
5085           emit_js(0);
5086           nottaken1=out;
5087           emit_jne(1);
5088         }
5089         if(opcode[i]==7) // BGTZ
5090         {
5091           emit_test(s1h,s1h);
5092           nottaken1=out;
5093           emit_js(1);
5094           if(invert) taken=out;
5095           else add_to_linker((int)out,ba[i],internal);
5096           emit_jne(0);
5097         }
5098       } // if(!only32)
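      /* The !only32 block above handles 64-bit operands by testing the upper
       * words (s1h/s2h) first; the low-word compares below finish the job.
       * Roughly, for a 64-bit BEQ:
       *   upper words differ -> not taken (nottaken1)
       *   upper words equal  -> fall through to the 32-bit compare below
       * For BNE a difference in either half already takes the branch, and
       * BLEZ/BGTZ look at the sign of the upper word first.  On the PSX all
       * registers should be 32-bit, so this path is presumably dead code
       * inherited from the 64-bit origins of this dynarec. */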
5099
5100       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5101       assert(s1l>=0);
5102       if(opcode[i]==4) // BEQ
5103       {
5104         if(s2l>=0) emit_cmp(s1l,s2l);
5105         else emit_test(s1l,s1l);
5106         if(invert){
5107           nottaken=out;
5108           emit_jne(1);
5109         }else{
5110           add_to_linker((int)out,ba[i],internal);
5111           emit_jeq(0);
5112         }
5113       }
5114       if(opcode[i]==5) // BNE
5115       {
5116         if(s2l>=0) emit_cmp(s1l,s2l);
5117         else emit_test(s1l,s1l);
5118         if(invert){
5119           nottaken=out;
5120           emit_jeq(1);
5121         }else{
5122           add_to_linker((int)out,ba[i],internal);
5123           emit_jne(0);
5124         }
5125       }
5126       if(opcode[i]==6) // BLEZ
5127       {
5128         emit_cmpimm(s1l,1);
5129         if(invert){
5130           nottaken=out;
5131           emit_jge(1);
5132         }else{
5133           add_to_linker((int)out,ba[i],internal);
5134           emit_jl(0);
5135         }
5136       }
5137       if(opcode[i]==7) // BGTZ
5138       {
5139         emit_cmpimm(s1l,1);
5140         if(invert){
5141           nottaken=out;
5142           emit_jl(1);
5143         }else{
5144           add_to_linker((int)out,ba[i],internal);
5145           emit_jge(0);
5146         }
5147       }
5148       if(invert) {
5149         if(taken) set_jump_target(taken, out);
5150         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5151         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5152           if(adj) {
5153             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5154             add_to_linker((int)out,ba[i],internal);
5155           }else{
5156             emit_addnop(13);
5157             add_to_linker((int)out,ba[i],internal*2);
5158           }
5159           emit_jmp(0);
5160         }else
5161         #endif
5162         {
5163           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5164           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5165           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5166           if(internal)
5167             assem_debug("branch: internal\n");
5168           else
5169             assem_debug("branch: external\n");
5170           if(internal&&is_ds[(ba[i]-start)>>2]) {
5171             ds_assemble_entry(i);
5172           }
5173           else {
5174             add_to_linker((int)out,ba[i],internal);
5175             emit_jmp(0);
5176           }
5177         }
5178         set_jump_target(nottaken, out);
5179       }
5180
5181       if(nottaken1) set_jump_target(nottaken1, out);
5182       if(adj) {
5183         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5184       }
5185     } // (!unconditional)
5186   } // if(ooo)
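  /* In-order variant (sketch of the flow below): the comparison is emitted
   * first, recording placeholder jumps (taken/nottaken/nottaken1) via 'out'.
   * The delay slot is then assembled on the taken path and, unless this is a
   * "likely" branch, assembled again on the fall-through path; the recorded
   * placeholders are patched afterwards with set_jump_target(). */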
5187   else
5188   {
5189     // In-order execution (branch first)
5190     //if(likely[i]) printf("IOL\n");
5191     //else
5192     //printf("IOE\n");
5193     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
5194     if(!unconditional&&!nop) {
5195       if(!only32)
5196       {
5197         assert(s1h>=0);
5198         if((opcode[i]&0x2f)==4) // BEQ
5199         {
5200           if(s2h>=0) emit_cmp(s1h,s2h);
5201           else emit_test(s1h,s1h);
5202           nottaken1=out;
5203           emit_jne(2);
5204         }
5205         if((opcode[i]&0x2f)==5) // BNE
5206         {
5207           if(s2h>=0) emit_cmp(s1h,s2h);
5208           else emit_test(s1h,s1h);
5209           taken=out;
5210           emit_jne(1);
5211         }
5212         if((opcode[i]&0x2f)==6) // BLEZ
5213         {
5214           emit_test(s1h,s1h);
5215           taken=out;
5216           emit_js(1);
5217           nottaken1=out;
5218           emit_jne(2);
5219         }
5220         if((opcode[i]&0x2f)==7) // BGTZ
5221         {
5222           emit_test(s1h,s1h);
5223           nottaken1=out;
5224           emit_js(2);
5225           taken=out;
5226           emit_jne(1);
5227         }
5228       } // if(!only32)
5229
5230       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5231       assert(s1l>=0);
5232       if((opcode[i]&0x2f)==4) // BEQ
5233       {
5234         if(s2l>=0) emit_cmp(s1l,s2l);
5235         else emit_test(s1l,s1l);
5236         nottaken=out;
5237         emit_jne(2);
5238       }
5239       if((opcode[i]&0x2f)==5) // BNE
5240       {
5241         if(s2l>=0) emit_cmp(s1l,s2l);
5242         else emit_test(s1l,s1l);
5243         nottaken=out;
5244         emit_jeq(2);
5245       }
5246       if((opcode[i]&0x2f)==6) // BLEZ
5247       {
5248         emit_cmpimm(s1l,1);
5249         nottaken=out;
5250         emit_jge(2);
5251       }
5252       if((opcode[i]&0x2f)==7) // BGTZ
5253       {
5254         emit_cmpimm(s1l,1);
5255         nottaken=out;
5256         emit_jl(2);
5257       }
5258     } // if(!unconditional)
5259     int adj;
5260     uint64_t ds_unneeded=branch_regs[i].u;
5261     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5262     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5263     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5264     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5265     ds_unneeded|=1;
5266     ds_unneeded_upper|=1;
5267     // branch taken
5268     if(!nop) {
5269       if(taken) set_jump_target(taken, out);
5270       assem_debug("1:\n");
5271       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5272                     ds_unneeded,ds_unneeded_upper);
5273       // load regs
5274       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5275       address_generation(i+1,&branch_regs[i],0);
5276       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5277       ds_assemble(i+1,&branch_regs[i]);
5278       cc=get_reg(branch_regs[i].regmap,CCREG);
5279       if(cc==-1) {
5280         emit_loadreg(CCREG,cc=HOST_CCREG);
5281         // CHECK: Is the following instruction (fall thru) allocated ok?
5282       }
5283       assert(cc==HOST_CCREG);
5284       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5285       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5286       assem_debug("cycle count (adj)\n");
5287       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5288       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5289       if(internal)
5290         assem_debug("branch: internal\n");
5291       else
5292         assem_debug("branch: external\n");
5293       if(internal&&is_ds[(ba[i]-start)>>2]) {
5294         ds_assemble_entry(i);
5295       }
5296       else {
5297         add_to_linker((int)out,ba[i],internal);
5298         emit_jmp(0);
5299       }
5300     }
5301     // branch not taken
5302     cop1_usable=prev_cop1_usable;
5303     if(!unconditional) {
5304       if(nottaken1) set_jump_target(nottaken1, out);
5305       set_jump_target(nottaken, out);
5306       assem_debug("2:\n");
5307       if(!likely[i]) {
5308         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5309                       ds_unneeded,ds_unneeded_upper);
5310         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5311         address_generation(i+1,&branch_regs[i],0);
5312         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5313         ds_assemble(i+1,&branch_regs[i]);
5314       }
5315       cc=get_reg(branch_regs[i].regmap,CCREG);
5316       if(cc==-1&&!likely[i]) {
5317         // Cycle count isn't in a register, temporarily load it then write it out
5318         emit_loadreg(CCREG,HOST_CCREG);
5319         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5320         int jaddr=(int)out;
5321         emit_jns(0);
5322         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5323         emit_storereg(CCREG,HOST_CCREG);
5324       }
5325       else{
5326         cc=get_reg(i_regmap,CCREG);
5327         assert(cc==HOST_CCREG);
5328         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5329         int jaddr=(int)out;
5330         emit_jns(0);
5331         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5332       }
5333     }
5334   }
5335 }
5336
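// Assembles the REGIMM branches (BLTZ/BGEZ/BLTZAL/BGEZAL and the "likely"
// forms).  These compare a single register against zero, so only rs1 is read.
// opcode2[i] is taken here to be the rt field of the instruction, i.e. roughly:
//   0x00 BLTZ    0x01 BGEZ    0x02 BLTZL    0x03 BGEZL
//   0x10 BLTZAL  0x11 BGEZAL  0x12 BLTZALL  0x13 BGEZALL
// The *AL variants also write the return address to $ra whether or not the
// branch is taken (see the rt1[i]==31 handling below).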
5337 void sjump_assemble(int i,struct regstat *i_regs)
5338 {
5339   signed char *i_regmap=i_regs->regmap;
5340   int cc;
5341   int match;
5342   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5343   assem_debug("smatch=%d\n",match);
5344   int s1h,s1l;
5345   int prev_cop1_usable=cop1_usable;
5346   int unconditional=0,nevertaken=0;
5347   int only32=0;
5348   int invert=0;
5349   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5350   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5351   if(!match) invert=1;
5352   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5353   if(i>(ba[i]-start)>>2) invert=1;
5354   #endif
5355
5356   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5357   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5358
5359   if(ooo[i]) {
5360     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5361     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5362   }
5363   else {
5364     s1l=get_reg(i_regmap,rs1[i]);
5365     s1h=get_reg(i_regmap,rs1[i]|64);
5366   }
5367   if(rs1[i]==0)
5368   {
5369     if(opcode2[i]&1) unconditional=1;
5370     else nevertaken=1;
5371     // These are never taken (r0 is never less than zero)
5372     //assert(opcode2[i]!=0);
5373     //assert(opcode2[i]!=2);
5374     //assert(opcode2[i]!=0x10);
5375     //assert(opcode2[i]!=0x12);
5376   }
5377   else {
5378     only32=(regs[i].was32>>rs1[i])&1;
5379   }
5380
5381   if(ooo[i]) {
5382     // Out of order execution (delay slot first)
5383     //printf("OOOE\n");
5384     address_generation(i+1,i_regs,regs[i].regmap_entry);
5385     ds_assemble(i+1,i_regs);
5386     int adj;
5387     uint64_t bc_unneeded=branch_regs[i].u;
5388     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5389     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5390     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5391     bc_unneeded|=1;
5392     bc_unneeded_upper|=1;
5393     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5394                   bc_unneeded,bc_unneeded_upper);
5395     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5396     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5397     if(rt1[i]==31) {
5398       int rt,return_address;
5399       rt=get_reg(branch_regs[i].regmap,31);
5400       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5401       if(rt>=0) {
5402         // Save the PC even if the branch is not taken
5403         return_address=start+i*4+8;
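        // return_address = branch PC + 8, i.e. the instruction after the delay slot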
5404         emit_movimm(return_address,rt); // PC into link register
5405         #ifdef IMM_PREFETCH
5406         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
5407         #endif
5408       }
5409     }
5410     cc=get_reg(branch_regs[i].regmap,CCREG);
5411     assert(cc==HOST_CCREG);
5412     if(unconditional)
5413       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5414     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5415     assem_debug("cycle count (adj)\n");
5416     if(unconditional) {
5417       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5418       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5419         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5420         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5421         if(internal)
5422           assem_debug("branch: internal\n");
5423         else
5424           assem_debug("branch: external\n");
5425         if(internal&&is_ds[(ba[i]-start)>>2]) {
5426           ds_assemble_entry(i);
5427         }
5428         else {
5429           add_to_linker((int)out,ba[i],internal);
5430           emit_jmp(0);
5431         }
5432         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5433         if(((u_int)out)&7) emit_addnop(0);
5434         #endif
5435       }
5436     }
5437     else if(nevertaken) {
5438       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5439       int jaddr=(int)out;
5440       emit_jns(0);
5441       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5442     }
5443     else {
5444       void *nottaken = NULL;
5445       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5446       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5447       if(!only32)
5448       {
5449         assert(s1h>=0);
5450         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5451         {
5452           emit_test(s1h,s1h);
5453           if(invert){
5454             nottaken=out;
5455             emit_jns(1);
5456           }else{
5457             add_to_linker((int)out,ba[i],internal);
5458             emit_js(0);
5459           }
5460         }
5461         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5462         {
5463           emit_test(s1h,s1h);
5464           if(invert){
5465             nottaken=out;
5466             emit_js(1);
5467           }else{
5468             add_to_linker((int)out,ba[i],internal);
5469             emit_jns(0);
5470           }
5471         }
5472       } // if(!only32)
5473       else
5474       {
5475         assert(s1l>=0);
5476         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5477         {
5478           emit_test(s1l,s1l);
5479           if(invert){
5480             nottaken=out;
5481             emit_jns(1);
5482           }else{
5483             add_to_linker((int)out,ba[i],internal);
5484             emit_js(0);
5485           }
5486         }
5487         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5488         {
5489           emit_test(s1l,s1l);
5490           if(invert){
5491             nottaken=out;
5492             emit_js(1);
5493           }else{
5494             add_to_linker((int)out,ba[i],internal);
5495             emit_jns(0);
5496           }
5497         }
5498       } // else (only32)
5499
5500       if(invert) {
5501         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5502         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5503           if(adj) {
5504             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5505             add_to_linker((int)out,ba[i],internal);
5506           }else{
5507             emit_addnop(13);
5508             add_to_linker((int)out,ba[i],internal*2);
5509           }
5510           emit_jmp(0);
5511         }else
5512         #endif
5513         {
5514           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5515           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5516           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5517           if(internal)
5518             assem_debug("branch: internal\n");
5519           else
5520             assem_debug("branch: external\n");
5521           if(internal&&is_ds[(ba[i]-start)>>2]) {
5522             ds_assemble_entry(i);
5523           }
5524           else {
5525             add_to_linker((int)out,ba[i],internal);
5526             emit_jmp(0);
5527           }
5528         }
5529         set_jump_target(nottaken, out);
5530       }
5531
5532       if(adj) {
5533         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5534       }
5535     } // (!unconditional)
5536   } // if(ooo)
5537   else
5538   {
5539     // In-order execution (branch first)
5540     //printf("IOE\n");
5541     void *nottaken = NULL;
5542     if(rt1[i]==31) {
5543       int rt,return_address;
5544       rt=get_reg(branch_regs[i].regmap,31);
5545       if(rt>=0) {
5546         // Save the PC even if the branch is not taken
5547         return_address=start+i*4+8;
5548         emit_movimm(return_address,rt); // PC into link register
5549         #ifdef IMM_PREFETCH
5550         emit_prefetch(hash_table_get(return_address));
5551         #endif
5552       }
5553     }
5554     if(!unconditional) {
5555       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5556       if(!only32)
5557       {
5558         assert(s1h>=0);
5559         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5560         {
5561           emit_test(s1h,s1h);
5562           nottaken=out;
5563           emit_jns(1);
5564         }
5565         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5566         {
5567           emit_test(s1h,s1h);
5568           nottaken=out;
5569           emit_js(1);
5570         }
5571       } // if(!only32)
5572       else
5573       {
5574         assert(s1l>=0);
5575         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5576         {
5577           emit_test(s1l,s1l);
5578           nottaken=out;
5579           emit_jns(1);
5580         }
5581         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5582         {
5583           emit_test(s1l,s1l);
5584           nottaken=out;
5585           emit_js(1);
5586         }
5587       }
5588     } // if(!unconditional)
5589     int adj;
5590     uint64_t ds_unneeded=branch_regs[i].u;
5591     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5592     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5593     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5594     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5595     ds_unneeded|=1;
5596     ds_unneeded_upper|=1;
5597     // branch taken
5598     if(!nevertaken) {
5599       //assem_debug("1:\n");
5600       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5601                     ds_unneeded,ds_unneeded_upper);
5602       // load regs
5603       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5604       address_generation(i+1,&branch_regs[i],0);
5605       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5606       ds_assemble(i+1,&branch_regs[i]);
5607       cc=get_reg(branch_regs[i].regmap,CCREG);
5608       if(cc==-1) {
5609         emit_loadreg(CCREG,cc=HOST_CCREG);
5610         // CHECK: Is the following instruction (fall thru) allocated ok?
5611       }
5612       assert(cc==HOST_CCREG);
5613       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5614       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5615       assem_debug("cycle count (adj)\n");
5616       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5617       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5618       if(internal)
5619         assem_debug("branch: internal\n");
5620       else
5621         assem_debug("branch: external\n");
5622       if(internal&&is_ds[(ba[i]-start)>>2]) {
5623         ds_assemble_entry(i);
5624       }
5625       else {
5626         add_to_linker((int)out,ba[i],internal);
5627         emit_jmp(0);
5628       }
5629     }
5630     // branch not taken
5631     cop1_usable=prev_cop1_usable;
5632     if(!unconditional) {
5633       set_jump_target(nottaken, out);
5634       assem_debug("1:\n");
5635       if(!likely[i]) {
5636         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5637                       ds_unneeded,ds_unneeded_upper);
5638         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5639         address_generation(i+1,&branch_regs[i],0);
5640         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5641         ds_assemble(i+1,&branch_regs[i]);
5642       }
5643       cc=get_reg(branch_regs[i].regmap,CCREG);
5644       if(cc==-1&&!likely[i]) {
5645         // Cycle count isn't in a register, temporarily load it then write it out
5646         emit_loadreg(CCREG,HOST_CCREG);
5647         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5648         int jaddr=(int)out;
5649         emit_jns(0);
5650         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5651         emit_storereg(CCREG,HOST_CCREG);
5652       }
5653       else{
5654         cc=get_reg(i_regmap,CCREG);
5655         assert(cc==HOST_CCREG);
5656         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5657         int jaddr=(int)out;
5658         emit_jns(0);
5659         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5660       }
5661     }
5662   }
5663 }
5664
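// Assembles the COP1 condition branches BC1F/BC1T (and the likely forms).
// These test the FP condition flag rather than a GPR; the dynarec appears to
// keep it cached via FSREG, and bit 23 (0x800000) matches the condition bit of
// FCR31, hence the emit_testimm(fs,0x800000) below.  On the PSX, which has no
// FPU, this path is presumably never reached and survives from the N64 code.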
5665 void fjump_assemble(int i,struct regstat *i_regs)
5666 {
5667   signed char *i_regmap=i_regs->regmap;
5668   int cc;
5669   int match;
5670   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5671   assem_debug("fmatch=%d\n",match);
5672   int fs,cs;
5673   int eaddr;
5674   int invert=0;
5675   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5676   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5677   if(!match) invert=1;
5678   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5679   if(i>(ba[i]-start)>>2) invert=1;
5680   #endif
5681
5682   if(ooo[i]) {
5683     fs=get_reg(branch_regs[i].regmap,FSREG);
5684     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5685   }
5686   else {
5687     fs=get_reg(i_regmap,FSREG);
5688   }
5689
5690   // Check cop1 unusable
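  // (If Status bit 29 -- the CU1 "coprocessor 1 usable" flag, mask 0x20000000 --
  //  is clear, branch out to an FP_STUB that raises the coprocessor-unusable
  //  exception; setting cop1_usable avoids re-emitting the check in this block.)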
5691   if(!cop1_usable) {
5692     cs=get_reg(i_regmap,CSREG);
5693     assert(cs>=0);
5694     emit_testimm(cs,0x20000000);
5695     eaddr=(int)out;
5696     emit_jeq(0);
5697     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5698     cop1_usable=1;
5699   }
5700
5701   if(ooo[i]) {
5702     // Out of order execution (delay slot first)
5703     //printf("OOOE\n");
5704     ds_assemble(i+1,i_regs);
5705     int adj;
5706     uint64_t bc_unneeded=branch_regs[i].u;
5707     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5708     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5709     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5710     bc_unneeded|=1;
5711     bc_unneeded_upper|=1;
5712     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5713                   bc_unneeded,bc_unneeded_upper);
5714     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5715     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5716     cc=get_reg(branch_regs[i].regmap,CCREG);
5717     assert(cc==HOST_CCREG);
5718     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5719     assem_debug("cycle count (adj)\n");
5720     if(1) {
5721       void *nottaken = NULL;
5722       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5723       if(1) {
5724         assert(fs>=0);
5725         emit_testimm(fs,0x800000);
5726         if(source[i]&0x10000) // BC1T
5727         {
5728           if(invert){
5729             nottaken=out;
5730             emit_jeq(1);
5731           }else{
5732             add_to_linker((int)out,ba[i],internal);
5733             emit_jne(0);
5734           }
5735         }
5736         else // BC1F
5737         {
5738           if(invert){
5739             nottaken=out;
5740             emit_jne(1);
5741           }else{
5742             add_to_linker((int)out,ba[i],internal);
5743             emit_jeq(0);
5744           }
5745         }
5746       } // if(1)
5747
5748       if(invert) {
5749         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5750         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5751         else if(match) emit_addnop(13);
5752         #endif
5753         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5754         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5755         if(internal)
5756           assem_debug("branch: internal\n");
5757         else
5758           assem_debug("branch: external\n");
5759         if(internal&&is_ds[(ba[i]-start)>>2]) {
5760           ds_assemble_entry(i);
5761         }
5762         else {
5763           add_to_linker((int)out,ba[i],internal);
5764           emit_jmp(0);
5765         }
5766         set_jump_target(nottaken, out);
5767       }
5768
5769       if(adj) {
5770         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5771       }
5772     } // (!unconditional)
5773   } // if(ooo)
5774   else
5775   {
5776     // In-order execution (branch first)
5777     //printf("IOE\n");
5778     void *nottaken = NULL;
5779     if(1) {
5780       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5781       if(1) {
5782         assert(fs>=0);
5783         emit_testimm(fs,0x800000);
5784         if(source[i]&0x10000) // BC1T
5785         {
5786           nottaken=out;
5787           emit_jeq(1);
5788         }
5789         else // BC1F
5790         {
5791           nottaken=out;
5792           emit_jne(1);
5793         }
5794       }
5795     } // if(!unconditional)
5796     int adj;
5797     uint64_t ds_unneeded=branch_regs[i].u;
5798     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5799     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5800     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5801     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5802     ds_unneeded|=1;
5803     ds_unneeded_upper|=1;
5804     // branch taken
5805     //assem_debug("1:\n");
5806     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5807                   ds_unneeded,ds_unneeded_upper);
5808     // load regs
5809     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5810     address_generation(i+1,&branch_regs[i],0);
5811     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5812     ds_assemble(i+1,&branch_regs[i]);
5813     cc=get_reg(branch_regs[i].regmap,CCREG);
5814     if(cc==-1) {
5815       emit_loadreg(CCREG,cc=HOST_CCREG);
5816       // CHECK: Is the following instruction (fall thru) allocated ok?
5817     }
5818     assert(cc==HOST_CCREG);
5819     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5820     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5821     assem_debug("cycle count (adj)\n");
5822     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5823     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5824     if(internal)
5825       assem_debug("branch: internal\n");
5826     else
5827       assem_debug("branch: external\n");
5828     if(internal&&is_ds[(ba[i]-start)>>2]) {
5829       ds_assemble_entry(i);
5830     }
5831     else {
5832       add_to_linker((int)out,ba[i],internal);
5833       emit_jmp(0);
5834     }
5835
5836     // branch not taken
5837     if(1) { // <- FIXME (don't need this)
5838       set_jump_target(nottaken, out);
5839       assem_debug("1:\n");
5840       if(!likely[i]) {
5841         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5842                       ds_unneeded,ds_unneeded_upper);
5843         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5844         address_generation(i+1,&branch_regs[i],0);
5845         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5846         ds_assemble(i+1,&branch_regs[i]);
5847       }
5848       cc=get_reg(branch_regs[i].regmap,CCREG);
5849       if(cc==-1&&!likely[i]) {
5850         // Cycle count isn't in a register, temporarily load it then write it out
5851         emit_loadreg(CCREG,HOST_CCREG);
5852         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5853         int jaddr=(int)out;
5854         emit_jns(0);
5855         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5856         emit_storereg(CCREG,HOST_CCREG);
5857       }
5858       else{
5859         cc=get_reg(i_regmap,CCREG);
5860         assert(cc==HOST_CCREG);
5861         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5862         int jaddr=(int)out;
5863         emit_jns(0);
5864         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5865       }
5866     }
5867   }
5868 }
5869
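// Handles a branch that is the last instruction of its page, so the delay slot
// falls on the next (possibly not yet compiled) page.  Instead of emitting
// taken/not-taken jumps, this variant appears to materialize both candidate
// target addresses in host registers, pick one with conditional moves, and
// leave the result in HOST_BTREG for the separately assembled delay slot
// (pagespan_ds below) to dispatch on.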
5870 static void pagespan_assemble(int i,struct regstat *i_regs)
5871 {
5872   int s1l=get_reg(i_regs->regmap,rs1[i]);
5873   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5874   int s2l=get_reg(i_regs->regmap,rs2[i]);
5875   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5876   void *taken = NULL;
5877   void *nottaken = NULL;
5878   int unconditional=0;
5879   if(rs1[i]==0)
5880   {
5881     s1l=s2l;s1h=s2h;
5882     s2l=s2h=-1;
5883   }
5884   else if(rs2[i]==0)
5885   {
5886     s2l=s2h=-1;
5887   }
5888   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5889     s1h=s2h=-1;
5890   }
5891   int hr=0;
5892   int addr=-1,alt=-1,ntaddr=-1;
5893   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5894   else {
5895     while(hr<HOST_REGS)
5896     {
5897       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5898          (i_regs->regmap[hr]&63)!=rs1[i] &&
5899          (i_regs->regmap[hr]&63)!=rs2[i] )
5900       {
5901         addr=hr++;break;
5902       }
5903       hr++;
5904     }
5905   }
5906   while(hr<HOST_REGS)
5907   {
5908     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5909        (i_regs->regmap[hr]&63)!=rs1[i] &&
5910        (i_regs->regmap[hr]&63)!=rs2[i] )
5911     {
5912       alt=hr++;break;
5913     }
5914     hr++;
5915   }
5916   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5917   {
5918     while(hr<HOST_REGS)
5919     {
5920       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5921          (i_regs->regmap[hr]&63)!=rs1[i] &&
5922          (i_regs->regmap[hr]&63)!=rs2[i] )
5923       {
5924         ntaddr=hr;break;
5925       }
5926       hr++;
5927     }
5928   }
5929   assert(hr<HOST_REGS);
5930   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5931     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5932   }
5933   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5934   if(opcode[i]==2) // J
5935   {
5936     unconditional=1;
5937   }
5938   if(opcode[i]==3) // JAL
5939   {
5940     // TODO: mini_ht
5941     int rt=get_reg(i_regs->regmap,31);
5942     emit_movimm(start+i*4+8,rt);
5943     unconditional=1;
5944   }
5945   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5946   {
5947     emit_mov(s1l,addr);
5948     if(opcode2[i]==9) // JALR
5949     {
5950       int rt=get_reg(i_regs->regmap,rt1[i]);
5951       emit_movimm(start+i*4+8,rt);
5952     }
5953   }
5954   if((opcode[i]&0x3f)==4) // BEQ
5955   {
5956     if(rs1[i]==rs2[i])
5957     {
5958       unconditional=1;
5959     }
5960     else
5961     #ifdef HAVE_CMOV_IMM
5962     if(s1h<0) {
5963       if(s2l>=0) emit_cmp(s1l,s2l);
5964       else emit_test(s1l,s1l);
5965       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5966     }
5967     else
5968     #endif
5969     {
5970       assert(s1l>=0);
5971       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5972       if(s1h>=0) {
5973         if(s2h>=0) emit_cmp(s1h,s2h);
5974         else emit_test(s1h,s1h);
5975         emit_cmovne_reg(alt,addr);
5976       }
5977       if(s2l>=0) emit_cmp(s1l,s2l);
5978       else emit_test(s1l,s1l);
5979       emit_cmovne_reg(alt,addr);
5980     }
5981   }
5982   if((opcode[i]&0x3f)==5) // BNE
5983   {
5984     #ifdef HAVE_CMOV_IMM
5985     if(s1h<0) {
5986       if(s2l>=0) emit_cmp(s1l,s2l);
5987       else emit_test(s1l,s1l);
5988       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5989     }
5990     else
5991     #endif
5992     {
5993       assert(s1l>=0);
5994       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5995       if(s1h>=0) {
5996         if(s2h>=0) emit_cmp(s1h,s2h);
5997         else emit_test(s1h,s1h);
5998         emit_cmovne_reg(alt,addr);
5999       }
6000       if(s2l>=0) emit_cmp(s1l,s2l);
6001       else emit_test(s1l,s1l);
6002       emit_cmovne_reg(alt,addr);
6003     }
6004   }
6005   if((opcode[i]&0x3f)==0x14) // BEQL
6006   {
6007     if(s1h>=0) {
6008       if(s2h>=0) emit_cmp(s1h,s2h);
6009       else emit_test(s1h,s1h);
6010       nottaken=out;
6011       emit_jne(0);
6012     }
6013     if(s2l>=0) emit_cmp(s1l,s2l);
6014     else emit_test(s1l,s1l);
6015     if(nottaken) set_jump_target(nottaken, out);
6016     nottaken=out;
6017     emit_jne(0);
6018   }
6019   if((opcode[i]&0x3f)==0x15) // BNEL
6020   {
6021     if(s1h>=0) {
6022       if(s2h>=0) emit_cmp(s1h,s2h);
6023       else emit_test(s1h,s1h);
6024       taken=out;
6025       emit_jne(0);
6026     }
6027     if(s2l>=0) emit_cmp(s1l,s2l);
6028     else emit_test(s1l,s1l);
6029     nottaken=out;
6030     emit_jeq(0);
6031     if(taken) set_jump_target(taken, out);
6032   }
6033   if((opcode[i]&0x3f)==6) // BLEZ
6034   {
6035     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6036     emit_cmpimm(s1l,1);
6037     if(s1h>=0) emit_mov(addr,ntaddr);
6038     emit_cmovl_reg(alt,addr);
6039     if(s1h>=0) {
6040       emit_test(s1h,s1h);
6041       emit_cmovne_reg(ntaddr,addr);
6042       emit_cmovs_reg(alt,addr);
6043     }
6044   }
6045   if((opcode[i]&0x3f)==7) // BGTZ
6046   {
6047     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6048     emit_cmpimm(s1l,1);
6049     if(s1h>=0) emit_mov(addr,alt);
6050     emit_cmovl_reg(ntaddr,addr);
6051     if(s1h>=0) {
6052       emit_test(s1h,s1h);
6053       emit_cmovne_reg(alt,addr);
6054       emit_cmovs_reg(ntaddr,addr);
6055     }
6056   }
6057   if((opcode[i]&0x3f)==0x16) // BLEZL
6058   {
6059     assert((opcode[i]&0x3f)!=0x16);
6060   }
6061   if((opcode[i]&0x3f)==0x17) // BGTZL
6062   {
6063     assert((opcode[i]&0x3f)!=0x17);
6064   }
6065   assert(opcode[i]!=1); // BLTZ/BGEZ
6066
6067   //FIXME: Check CSREG
6068   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6069     if((source[i]&0x30000)==0) // BC1F
6070     {
6071       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6072       emit_testimm(s1l,0x800000);
6073       emit_cmovne_reg(alt,addr);
6074     }
6075     if((source[i]&0x30000)==0x10000) // BC1T
6076     {
6077       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6078       emit_testimm(s1l,0x800000);
6079       emit_cmovne_reg(alt,addr);
6080     }
6081     if((source[i]&0x30000)==0x20000) // BC1FL
6082     {
6083       emit_testimm(s1l,0x800000);
6084       nottaken=out;
6085       emit_jne(0);
6086     }
6087     if((source[i]&0x30000)==0x30000) // BC1TL
6088     {
6089       emit_testimm(s1l,0x800000);
6090       nottaken=out;
6091       emit_jeq(0);
6092     }
6093   }
6094
6095   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6096   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6097   if(likely[i]||unconditional)
6098   {
6099     emit_movimm(ba[i],HOST_BTREG);
6100   }
6101   else if(addr!=HOST_BTREG)
6102   {
6103     emit_mov(addr,HOST_BTREG);
6104   }
6105   void *branch_addr=out;
6106   emit_jmp(0);
6107   int target_addr=start+i*4+5;
6108   void *stub=out;
6109   void *compiled_target_addr=check_addr(target_addr);
6110   emit_extjump_ds((int)branch_addr,target_addr);
6111   if(compiled_target_addr) {
6112     set_jump_target(branch_addr, compiled_target_addr);
6113     add_link(target_addr,stub);
6114   }
6115   else set_jump_target(branch_addr, stub);
6116   if(likely[i]) {
6117     // Not-taken path
6118     set_jump_target(nottaken, out);
6119     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6120     void *branch_addr=out;
6121     emit_jmp(0);
6122     int target_addr=start+i*4+8;
6123     void *stub=out;
6124     void *compiled_target_addr=check_addr(target_addr);
6125     emit_extjump_ds((int)branch_addr,target_addr);
6126     if(compiled_target_addr) {
6127       set_jump_target(branch_addr, compiled_target_addr);
6128       add_link(target_addr,stub);
6129     }
6130     else set_jump_target(branch_addr, stub);
6131   }
6132 }
6133
6134 // Assemble the delay slot for the above
6135 static void pagespan_ds()
6136 {
6137   assem_debug("initial delay slot:\n");
6138   u_int vaddr=start+1;
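  // start+1: the low bit seems to tag this as the page-spanning delay-slot
  // entry rather than a normal block start; pagespan_assemble above links to
  // start+i*4+5, i.e. the same +1 convention for the next page.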
6139   u_int page=get_page(vaddr);
6140   u_int vpage=get_vpage(vaddr);
6141   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6142   do_dirty_stub_ds();
6143   ll_add(jump_in+page,vaddr,(void *)out);
6144   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6145   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6146     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6147   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6148     emit_writeword(HOST_BTREG,(int)&branch_target);
6149   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6150   address_generation(0,&regs[0],regs[0].regmap_entry);
6151   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6152     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6153   cop1_usable=0;
6154   is_delayslot=0;
6155   switch(itype[0]) {
6156     case ALU:
6157       alu_assemble(0,&regs[0]);break;
6158     case IMM16:
6159       imm16_assemble(0,&regs[0]);break;
6160     case SHIFT:
6161       shift_assemble(0,&regs[0]);break;
6162     case SHIFTIMM:
6163       shiftimm_assemble(0,&regs[0]);break;
6164     case LOAD:
6165       load_assemble(0,&regs[0]);break;
6166     case LOADLR:
6167       loadlr_assemble(0,&regs[0]);break;
6168     case STORE:
6169       store_assemble(0,&regs[0]);break;
6170     case STORELR:
6171       storelr_assemble(0,&regs[0]);break;
6172     case COP0:
6173       cop0_assemble(0,&regs[0]);break;
6174     case COP1:
6175       cop1_assemble(0,&regs[0]);break;
6176     case C1LS:
6177       c1ls_assemble(0,&regs[0]);break;
6178     case COP2:
6179       cop2_assemble(0,&regs[0]);break;
6180     case C2LS:
6181       c2ls_assemble(0,&regs[0]);break;
6182     case C2OP:
6183       c2op_assemble(0,&regs[0]);break;
6184     case FCONV:
6185       fconv_assemble(0,&regs[0]);break;
6186     case FLOAT:
6187       float_assemble(0,&regs[0]);break;
6188     case FCOMP:
6189       fcomp_assemble(0,&regs[0]);break;
6190     case MULTDIV:
6191       multdiv_assemble(0,&regs[0]);break;
6192     case MOV:
6193       mov_assemble(0,&regs[0]);break;
6194     case SYSCALL:
6195     case HLECALL:
6196     case INTCALL:
6197     case SPAN:
6198     case UJUMP:
6199     case RJUMP:
6200     case CJUMP:
6201     case SJUMP:
6202     case FJUMP:
6203       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6204   }
6205   int btaddr=get_reg(regs[0].regmap,BTREG);
6206   if(btaddr<0) {
6207     btaddr=get_reg(regs[0].regmap,-1);
6208     emit_readword((int)&branch_target,btaddr);
6209   }
6210   assert(btaddr!=HOST_CCREG);
6211   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6212 #ifdef HOST_IMM8
6213   emit_movimm(start+4,HOST_TEMPREG);
6214   emit_cmp(btaddr,HOST_TEMPREG);
6215 #else
6216   emit_cmpimm(btaddr,start+4);
6217 #endif
6218   void *branch = out;
6219   emit_jeq(0);
6220   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6221   emit_jmp(jump_vaddr_reg[btaddr]);
6222   set_jump_target(branch, out);
6223   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6224   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6225 }
6226
6227 // Basic liveness analysis for MIPS registers
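// Walks the block backwards computing, per instruction, a bitmask of MIPS
// registers whose current value will never be read again ("unneeded"), plus a
// matching mask for the upper 32-bit halves and one for GTE registers.  The
// per-instruction update is essentially the standard liveness recurrence, e.g.
// for "addu t0,a0,a1":  u |= bit(t0)              (old value of t0 is dead)
//                       u &= ~bit(a0) & ~bit(a1)  (sources are needed)
// Branches merge the masks of the target and/or fall-through path, with a
// depth-limited recursion for backward branches.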
6228 void unneeded_registers(int istart,int iend,int r)
6229 {
6230   int i;
6231   uint64_t u,uu,gte_u,b,bu,gte_bu;
6232   uint64_t temp_u,temp_uu,temp_gte_u=0;
6233   uint64_t tdep;
6234   uint64_t gte_u_unknown=0;
6235   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6236     gte_u_unknown=~0ll;
6237   if(iend==slen-1) {
6238     u=1;uu=1;
6239     gte_u=gte_u_unknown;
6240   }else{
6241     u=unneeded_reg[iend+1];
6242     uu=unneeded_reg_upper[iend+1];
6243     u=1;uu=1;
6244     gte_u=gte_unneeded[iend+1];
6245   }
6246
6247   for (i=iend;i>=istart;i--)
6248   {
6249     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6250     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6251     {
6252       // If subroutine call, flag return address as a possible branch target
6253       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6254
6255       if(ba[i]<start || ba[i]>=(start+slen*4))
6256       {
6257         // Branch out of this block, flush all regs
6258         u=1;
6259         uu=1;
6260         gte_u=gte_u_unknown;
6261         /* Hexagon hack
6262         if(itype[i]==UJUMP&&rt1[i]==31)
6263         {
6264           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6265         }
6266         if(itype[i]==RJUMP&&rs1[i]==31)
6267         {
6268           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6269         }
6270         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6271           if(itype[i]==UJUMP&&rt1[i]==31)
6272           {
6273             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6274             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6275           }
6276           if(itype[i]==RJUMP&&rs1[i]==31)
6277           {
6278             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6279             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6280           }
6281         }*/
6282         branch_unneeded_reg[i]=u;
6283         branch_unneeded_reg_upper[i]=uu;
6284         // Merge in delay slot
6285         tdep=(~uu>>rt1[i+1])&1;
6286         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6287         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6288         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6289         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6290         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6291         u|=1;uu|=1;
6292         gte_u|=gte_rt[i+1];
6293         gte_u&=~gte_rs[i+1];
6294         // If branch is "likely" (and conditional)
6295         // then we skip the delay slot on the fall-thru path
6296         if(likely[i]) {
6297           if(i<slen-1) {
6298             u&=unneeded_reg[i+2];
6299             uu&=unneeded_reg_upper[i+2];
6300             gte_u&=gte_unneeded[i+2];
6301           }
6302           else
6303           {
6304             u=1;
6305             uu=1;
6306             gte_u=gte_u_unknown;
6307           }
6308         }
6309       }
6310       else
6311       {
6312         // Internal branch, flag target
6313         bt[(ba[i]-start)>>2]=1;
6314         if(ba[i]<=start+i*4) {
6315           // Backward branch
6316           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6317           {
6318             // Unconditional branch
6319             temp_u=1;temp_uu=1;
6320             temp_gte_u=0;
6321           } else {
6322             // Conditional branch (not taken case)
6323             temp_u=unneeded_reg[i+2];
6324             temp_uu=unneeded_reg_upper[i+2];
6325             temp_gte_u&=gte_unneeded[i+2];
6326           }
6327           // Merge in delay slot
6328           tdep=(~temp_uu>>rt1[i+1])&1;
6329           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6330           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6331           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6332           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6333           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6334           temp_u|=1;temp_uu|=1;
6335           temp_gte_u|=gte_rt[i+1];
6336           temp_gte_u&=~gte_rs[i+1];
6337           // If branch is "likely" (and conditional)
6338           // then we skip the delay slot on the fall-thru path
6339           if(likely[i]) {
6340             if(i<slen-1) {
6341               temp_u&=unneeded_reg[i+2];
6342               temp_uu&=unneeded_reg_upper[i+2];
6343               temp_gte_u&=gte_unneeded[i+2];
6344             }
6345             else
6346             {
6347               temp_u=1;
6348               temp_uu=1;
6349               temp_gte_u=gte_u_unknown;
6350             }
6351           }
6352           tdep=(~temp_uu>>rt1[i])&1;
6353           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6354           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6355           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6356           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6357           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6358           temp_u|=1;temp_uu|=1;
6359           temp_gte_u|=gte_rt[i];
6360           temp_gte_u&=~gte_rs[i];
6361           unneeded_reg[i]=temp_u;
6362           unneeded_reg_upper[i]=temp_uu;
6363           gte_unneeded[i]=temp_gte_u;
6364           // Only go three levels deep.  This recursion can take an
6365           // excessive amount of time if there are a lot of nested loops.
6366           if(r<2) {
6367             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6368           }else{
6369             unneeded_reg[(ba[i]-start)>>2]=1;
6370             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6371             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6372           }
6373         } /*else*/ if(1) {
6374           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6375           {
6376             // Unconditional branch
6377             u=unneeded_reg[(ba[i]-start)>>2];
6378             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6379             gte_u=gte_unneeded[(ba[i]-start)>>2];
6380             branch_unneeded_reg[i]=u;
6381             branch_unneeded_reg_upper[i]=uu;
6382         //u=1;
6383         //uu=1;
6384         //branch_unneeded_reg[i]=u;
6385         //branch_unneeded_reg_upper[i]=uu;
6386             // Merge in delay slot
6387             tdep=(~uu>>rt1[i+1])&1;
6388             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6389             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6390             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6391             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6392             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6393             u|=1;uu|=1;
6394             gte_u|=gte_rt[i+1];
6395             gte_u&=~gte_rs[i+1];
6396           } else {
6397             // Conditional branch
6398             b=unneeded_reg[(ba[i]-start)>>2];
6399             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6400             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6401             branch_unneeded_reg[i]=b;
6402             branch_unneeded_reg_upper[i]=bu;
6403         //b=1;
6404         //bu=1;
6405         //branch_unneeded_reg[i]=b;
6406         //branch_unneeded_reg_upper[i]=bu;
6407             // Branch delay slot
6408             tdep=(~uu>>rt1[i+1])&1;
6409             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6410             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6411             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6412             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6413             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6414             b|=1;bu|=1;
6415             gte_bu|=gte_rt[i+1];
6416             gte_bu&=~gte_rs[i+1];
6417             // If branch is "likely" then we skip the
6418             // delay slot on the fall-thru path
6419             if(likely[i]) {
6420               u=b;
6421               uu=bu;
6422               gte_u=gte_bu;
6423               if(i<slen-1) {
6424                 u&=unneeded_reg[i+2];
6425                 uu&=unneeded_reg_upper[i+2];
6426                 gte_u&=gte_unneeded[i+2];
6427         //u=1;
6428         //uu=1;
6429               }
6430             } else {
6431               u&=b;
6432               uu&=bu;
6433               gte_u&=gte_bu;
6434         //u=1;
6435         //uu=1;
6436             }
6437             if(i<slen-1) {
6438               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6439               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6440         //branch_unneeded_reg[i]=1;
6441         //branch_unneeded_reg_upper[i]=1;
6442             } else {
6443               branch_unneeded_reg[i]=1;
6444               branch_unneeded_reg_upper[i]=1;
6445             }
6446           }
6447         }
6448       }
6449     }
6450     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6451     {
6452       // SYSCALL instruction (software interrupt)
6453       u=1;
6454       uu=1;
6455     }
6456     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6457     {
6458       // ERET instruction (return from interrupt)
6459       u=1;
6460       uu=1;
6461     }
6462     //u=uu=1; // DEBUG
6463     tdep=(~uu>>rt1[i])&1;
6464     // Written registers are unneeded
6465     u|=1LL<<rt1[i];
6466     u|=1LL<<rt2[i];
6467     uu|=1LL<<rt1[i];
6468     uu|=1LL<<rt2[i];
6469     gte_u|=gte_rt[i];
6470     // Accessed registers are needed
6471     u&=~(1LL<<rs1[i]);
6472     u&=~(1LL<<rs2[i]);
6473     uu&=~(1LL<<us1[i]);
6474     uu&=~(1LL<<us2[i]);
6475     gte_u&=~gte_rs[i];
6476     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6477       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6478     // Source-target dependencies
6479     uu&=~(tdep<<dep1[i]);
6480     uu&=~(tdep<<dep2[i]);
6481     // R0 is always unneeded
6482     u|=1;uu|=1;
6483     // Save it
6484     unneeded_reg[i]=u;
6485     unneeded_reg_upper[i]=uu;
6486     gte_unneeded[i]=gte_u;
6487     /*
6488     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6489     printf("U:");
6490     int r;
6491     for(r=1;r<=CCREG;r++) {
6492       if((unneeded_reg[i]>>r)&1) {
6493         if(r==HIREG) printf(" HI");
6494         else if(r==LOREG) printf(" LO");
6495         else printf(" r%d",r);
6496       }
6497     }
6498     printf(" UU:");
6499     for(r=1;r<=CCREG;r++) {
6500       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6501         if(r==HIREG) printf(" HI");
6502         else if(r==LOREG) printf(" LO");
6503         else printf(" r%d",r);
6504       }
6505     }
6506     printf("\n");*/
6507   }
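  // Presumably related to dropping 64-bit register support: the PSX CPU has
  // only 32-bit GPRs, so the upper-half masks computed above are overridden and
  // everything "upper" is marked unneeded.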
6508   for (i=iend;i>=istart;i--)
6509   {
6510     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6511   }
6512 }
6513
6514 // Write back dirty registers as soon as we will no longer modify them,
6515 // so that we don't end up with lots of writes at the branches.
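// Another backwards pass, this time over host registers.  As best as can be
// read from the code below, will_dirty and wont_dirty track, looking forward
// from each point, which host registers will or will not still be modified;
// the 'wr' pass then uses both masks to adjust regs[].dirty so that writebacks
// happen as early as possible instead of piling up at branch boundaries.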
6516 void clean_registers(int istart,int iend,int wr)
6517 {
6518   int i;
6519   int r;
6520   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6521   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6522   if(iend==slen-1) {
6523     will_dirty_i=will_dirty_next=0;
6524     wont_dirty_i=wont_dirty_next=0;
6525   }else{
6526     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6527     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6528   }
6529   for (i=iend;i>=istart;i--)
6530   {
6531     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6532     {
6533       if(ba[i]<start || ba[i]>=(start+slen*4))
6534       {
6535         // Branch out of this block, flush all regs
6536         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6537         {
6538           // Unconditional branch
6539           will_dirty_i=0;
6540           wont_dirty_i=0;
6541           // Merge in delay slot (will dirty)
6542           for(r=0;r<HOST_REGS;r++) {
6543             if(r!=EXCLUDE_REG) {
6544               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6545               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6546               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6547               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6548               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6549               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6550               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6551               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6552               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6553               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6554               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6555               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6556               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6557               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6558             }
6559           }
6560         }
6561         else
6562         {
6563           // Conditional branch
6564           will_dirty_i=0;
6565           wont_dirty_i=wont_dirty_next;
6566           // Merge in delay slot (will dirty)
6567           for(r=0;r<HOST_REGS;r++) {
6568             if(r!=EXCLUDE_REG) {
6569               if(!likely[i]) {
6570                 // Might not dirty if likely branch is not taken
6571                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6572                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6573                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6574                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6575                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6576                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6577                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6578                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6579                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6580                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6581                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6582                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6583                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6584                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6585               }
6586             }
6587           }
6588         }
6589         // Merge in delay slot (wont dirty)
6590         for(r=0;r<HOST_REGS;r++) {
6591           if(r!=EXCLUDE_REG) {
6592             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6593             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6594             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6595             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6596             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6597             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6598             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6599             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6600             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6601             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6602           }
6603         }
6604         if(wr) {
6605           #ifndef DESTRUCTIVE_WRITEBACK
6606           branch_regs[i].dirty&=wont_dirty_i;
6607           #endif
6608           branch_regs[i].dirty|=will_dirty_i;
6609         }
6610       }
6611       else
6612       {
6613         // Internal branch
6614         if(ba[i]<=start+i*4) {
6615           // Backward branch
6616           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6617           {
6618             // Unconditional branch
6619             temp_will_dirty=0;
6620             temp_wont_dirty=0;
6621             // Merge in delay slot (will dirty)
6622             for(r=0;r<HOST_REGS;r++) {
6623               if(r!=EXCLUDE_REG) {
6624                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6625                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6626                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6627                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6628                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6629                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6630                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6631                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6632                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6633                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6634                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6635                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6636                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6637                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6638               }
6639             }
6640           } else {
6641             // Conditional branch (not taken case)
6642             temp_will_dirty=will_dirty_next;
6643             temp_wont_dirty=wont_dirty_next;
6644             // Merge in delay slot (will dirty)
6645             for(r=0;r<HOST_REGS;r++) {
6646               if(r!=EXCLUDE_REG) {
6647                 if(!likely[i]) {
6648                   // Will not dirty if likely branch is not taken
6649                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6650                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6651                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6652                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6653                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6654                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6655                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6656                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6657                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6658                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6659                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6660                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6661                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6662                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6663                 }
6664               }
6665             }
6666           }
6667           // Merge in delay slot (wont dirty)
6668           for(r=0;r<HOST_REGS;r++) {
6669             if(r!=EXCLUDE_REG) {
6670               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6671               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6672               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6673               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6674               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6675               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6676               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6677               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6678               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6679               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6680             }
6681           }
6682           // Deal with changed mappings
6683           if(i<iend) {
6684             for(r=0;r<HOST_REGS;r++) {
6685               if(r!=EXCLUDE_REG) {
6686                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6687                   temp_will_dirty&=~(1<<r);
6688                   temp_wont_dirty&=~(1<<r);
6689                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6690                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6691                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6692                   } else {
6693                     temp_will_dirty|=1<<r;
6694                     temp_wont_dirty|=1<<r;
6695                   }
6696                 }
6697               }
6698             }
6699           }
6700           if(wr) {
6701             will_dirty[i]=temp_will_dirty;
6702             wont_dirty[i]=temp_wont_dirty;
6703             clean_registers((ba[i]-start)>>2,i-1,0);
6704           }else{
6705             // Limit recursion.  It can take an excessive amount
6706             // of time if there are a lot of nested loops.
6707             will_dirty[(ba[i]-start)>>2]=0;
6708             wont_dirty[(ba[i]-start)>>2]=-1;
6709           }
6710         }
6711         /*else*/ if(1)
6712         {
6713           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6714           {
6715             // Unconditional branch
6716             will_dirty_i=0;
6717             wont_dirty_i=0;
6718           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6719             for(r=0;r<HOST_REGS;r++) {
6720               if(r!=EXCLUDE_REG) {
6721                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6722                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6723                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6724                 }
6725                 if(branch_regs[i].regmap[r]>=0) {
6726                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6727                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6728                 }
6729               }
6730             }
6731           //}
6732             // Merge in delay slot
6733             for(r=0;r<HOST_REGS;r++) {
6734               if(r!=EXCLUDE_REG) {
6735                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6736                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6737                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6738                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6739                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6740                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6741                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6742                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6743                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6744                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6745                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6746                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6747                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6748                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6749               }
6750             }
6751           } else {
6752             // Conditional branch
6753             will_dirty_i=will_dirty_next;
6754             wont_dirty_i=wont_dirty_next;
6755           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6756             for(r=0;r<HOST_REGS;r++) {
6757               if(r!=EXCLUDE_REG) {
6758                 signed char target_reg=branch_regs[i].regmap[r];
6759                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6760                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6761                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6762                 }
6763                 else if(target_reg>=0) {
6764                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6765                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6766                 }
6767                 // Treat delay slot as part of branch too
6768                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6769                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6770                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6771                 }
6772                 else
6773                 {
6774                   will_dirty[i+1]&=~(1<<r);
6775                 }*/
6776               }
6777             }
6778           //}
6779             // Merge in delay slot
6780             for(r=0;r<HOST_REGS;r++) {
6781               if(r!=EXCLUDE_REG) {
6782                 if(!likely[i]) {
6783                   // Might not dirty if likely branch is not taken
6784                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6785                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6786                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6787                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6788                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6789                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6790                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6791                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6792                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6793                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6794                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6795                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6796                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6797                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6798                 }
6799               }
6800             }
6801           }
6802           // Merge in delay slot (won't dirty)
6803           for(r=0;r<HOST_REGS;r++) {
6804             if(r!=EXCLUDE_REG) {
6805               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6806               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6807               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6808               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6809               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6810               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6811               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6812               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6813               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6814               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6815             }
6816           }
6817           if(wr) {
6818             #ifndef DESTRUCTIVE_WRITEBACK
6819             branch_regs[i].dirty&=wont_dirty_i;
6820             #endif
6821             branch_regs[i].dirty|=will_dirty_i;
6822           }
6823         }
6824       }
6825     }
6826     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6827     {
6828       // SYSCALL instruction (software interrupt)
6829       will_dirty_i=0;
6830       wont_dirty_i=0;
6831     }
6832     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6833     {
6834       // ERET instruction (return from interrupt)
6835       will_dirty_i=0;
6836       wont_dirty_i=0;
6837     }
6838     will_dirty_next=will_dirty_i;
6839     wont_dirty_next=wont_dirty_i;
6840     for(r=0;r<HOST_REGS;r++) {
6841       if(r!=EXCLUDE_REG) {
6842         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6843         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6844         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6845         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6846         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6847         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6848         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6849         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6850         if(i>istart) {
6851           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6852           {
6853             // Don't store a register immediately after writing it;
6854             // doing so may prevent dual-issue.
6855             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6856             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6857           }
6858         }
6859       }
6860     }
6861     // Save it
6862     will_dirty[i]=will_dirty_i;
6863     wont_dirty[i]=wont_dirty_i;
6864     // Mark registers that won't be dirtied as not dirty
6865     if(wr) {
6866       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6867       for(r=0;r<HOST_REGS;r++) {
6868         if((will_dirty_i>>r)&1) {
6869           printf(" r%d",r);
6870         }
6871       }
6872       printf("\n");*/
6873
6874       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6875         regs[i].dirty|=will_dirty_i;
6876         #ifndef DESTRUCTIVE_WRITEBACK
6877         regs[i].dirty&=wont_dirty_i;
6878         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6879         {
6880           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6881             for(r=0;r<HOST_REGS;r++) {
6882               if(r!=EXCLUDE_REG) {
6883                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6884                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6885                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6886               }
6887             }
6888           }
6889         }
6890         else
6891         {
6892           if(i<iend) {
6893             for(r=0;r<HOST_REGS;r++) {
6894               if(r!=EXCLUDE_REG) {
6895                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6896                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6897                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6898               }
6899             }
6900           }
6901         }
6902         #endif
6903       //}
6904     }
6905     // Deal with changed mappings
6906     temp_will_dirty=will_dirty_i;
6907     temp_wont_dirty=wont_dirty_i;
6908     for(r=0;r<HOST_REGS;r++) {
6909       if(r!=EXCLUDE_REG) {
6910         int nr;
6911         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6912           if(wr) {
6913             #ifndef DESTRUCTIVE_WRITEBACK
6914             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6915             #endif
6916             regs[i].wasdirty|=will_dirty_i&(1<<r);
6917           }
6918         }
6919         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6920           // Register moved to a different register
6921           will_dirty_i&=~(1<<r);
6922           wont_dirty_i&=~(1<<r);
6923           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6924           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6925           if(wr) {
6926             #ifndef DESTRUCTIVE_WRITEBACK
6927             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6928             #endif
6929             regs[i].wasdirty|=will_dirty_i&(1<<r);
6930           }
6931         }
6932         else {
6933           will_dirty_i&=~(1<<r);
6934           wont_dirty_i&=~(1<<r);
6935           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6936             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6937             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6938           } else {
6939             wont_dirty_i|=1<<r;
6940             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6941           }
6942         }
6943       }
6944     }
6945   }
6946 }
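
/* Reader's note (rough summary, not authoritative): will_dirty_i and
 * wont_dirty_i are bitmasks with one bit per host register (1<<r for
 * r<HOST_REGS), built while walking the block backwards and, for backward
 * internal branches, recursing into the branch target.  They are recorded
 * per instruction in will_dirty[]/wont_dirty[] and, when the wr argument is
 * nonzero, folded into regs[i].dirty / regs[i].wasdirty so that pass 6
 * ("optimize clean/dirty state") can avoid register writebacks that later
 * code makes redundant.  The wr==0 calls only compute the masks and take
 * the "Limit recursion" shortcut above instead of recursing further. */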
6947
6948 #ifdef DISASM
6949   /* disassembly */
6950 void disassemble_inst(int i)
6951 {
6952     if (bt[i]) printf("*"); else printf(" ");
6953     switch(itype[i]) {
6954       case UJUMP:
6955         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6956       case CJUMP:
6957         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6958       case SJUMP:
6959         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6960       case FJUMP:
6961         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6962       case RJUMP:
6963         if (opcode[i]==0x9&&rt1[i]!=31)
6964           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6965         else
6966           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6967         break;
6968       case SPAN:
6969         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6970       case IMM16:
6971         if(opcode[i]==0xf) //LUI
6972           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6973         else
6974           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6975         break;
6976       case LOAD:
6977       case LOADLR:
6978         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6979         break;
6980       case STORE:
6981       case STORELR:
6982         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6983         break;
6984       case ALU:
6985       case SHIFT:
6986         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6987         break;
6988       case MULTDIV:
6989         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6990         break;
6991       case SHIFTIMM:
6992         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6993         break;
6994       case MOV:
6995         if((opcode2[i]&0x1d)==0x10)
6996           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6997         else if((opcode2[i]&0x1d)==0x11)
6998           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6999         else
7000           printf (" %x: %s\n",start+i*4,insn[i]);
7001         break;
7002       case COP0:
7003         if(opcode2[i]==0)
7004           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7005         else if(opcode2[i]==4)
7006           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7007         else printf (" %x: %s\n",start+i*4,insn[i]);
7008         break;
7009       case COP1:
7010         if(opcode2[i]<3)
7011           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7012         else if(opcode2[i]>3)
7013           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7014         else printf (" %x: %s\n",start+i*4,insn[i]);
7015         break;
7016       case COP2:
7017         if(opcode2[i]<3)
7018           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7019         else if(opcode2[i]>3)
7020           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7021         else printf (" %x: %s\n",start+i*4,insn[i]);
7022         break;
7023       case C1LS:
7024         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7025         break;
7026       case C2LS:
7027         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7028         break;
7029       case INTCALL:
7030         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7031         break;
7032       default:
7033         //printf (" %s %8x\n",insn[i],source[i]);
7034         printf (" %x: %s\n",start+i*4,insn[i]);
7035     }
7036 }
7037 #else
7038 static void disassemble_inst(int i) {}
7039 #endif // DISASM
7040
7041 #define DRC_TEST_VAL 0x74657374
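
// 0x74657374 is ASCII "test".  new_dynarec_test() below emits a tiny stub
// that simply returns this value -- on the ARM target it amounts roughly to
// "mov r0, #DRC_TEST_VAL; bx lr" (host reg 0 = return value, reg 14 = lr) --
// and calls it, to verify that freshly written translation cache memory can
// actually be executed.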
7042
7043 static int new_dynarec_test(void)
7044 {
7045   int (*testfunc)(void) = (void *)out;
7046   void *beginning;
7047   int ret;
7048
7049   beginning = start_block();
7050   emit_movimm(DRC_TEST_VAL,0); // test
7051   emit_jmpreg(14);
7052   literal_pool(0);
7053   end_block(beginning);
7054   SysPrintf("testing if we can run recompiled code..\n");
7055   ret = testfunc();
7056   if (ret == DRC_TEST_VAL)
7057     SysPrintf("test passed.\n");
7058   else
7059     SysPrintf("test failed: %08x\n", ret);
7060   out=(u_char *)BASE_ADDR;
7061   return ret == DRC_TEST_VAL;
7062 }
7063
7064 // clear the state completely, instead of just marking
7065 // things invalid like invalidate_all_pages() does
7066 void new_dynarec_clear_full()
7067 {
7068   int n;
7069   out=(u_char *)BASE_ADDR;
7070   memset(invalid_code,1,sizeof(invalid_code));
7071   memset(hash_table,0xff,sizeof(hash_table));
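  // (the 0xff fill makes every cached vaddr 0xffffffff, which can never
  //  equal a word-aligned PC, so all lookups miss until code is recompiled)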
7072   memset(mini_ht,-1,sizeof(mini_ht));
7073   memset(restore_candidate,0,sizeof(restore_candidate));
7074   memset(shadow,0,sizeof(shadow));
7075   copy=shadow;
7076   expirep=16384; // Expiry pointer, +2 blocks
7077   pending_exception=0;
7078   literalcount=0;
7079   stop_after_jal=0;
7080   inv_code_start=inv_code_end=~0;
7081   // clear the block lookup lists (jump_in/jump_out/jump_dirty)
7082   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7083   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7084   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7085 }
7086
7087 void new_dynarec_init()
7088 {
7089   SysPrintf("Init new dynarec\n");
7090
7091   // allocate/prepare a buffer for translation cache
7092   // see assem_arm.h for some explanation
7093 #if   defined(BASE_ADDR_FIXED)
7094   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7095             PROT_READ | PROT_WRITE | PROT_EXEC,
7096             MAP_PRIVATE | MAP_ANONYMOUS,
7097             -1, 0) != translation_cache) {
7098     SysPrintf("mmap() failed: %s\n", strerror(errno));
7099     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7100     abort();
7101   }
7102 #elif defined(BASE_ADDR_DYNAMIC)
7103   #ifdef VITA
7104   sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7105   if (sceBlock < 0)
7106     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7107   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7108   if (ret < 0)
7109     SysPrintf("sceKernelGetMemBlockBase failed\n");
7110   #else
7111   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7112             PROT_READ | PROT_WRITE | PROT_EXEC,
7113             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7114   if (translation_cache == MAP_FAILED) {
7115     SysPrintf("mmap() failed: %s\n", strerror(errno));
7116     abort();
7117   }
7118   #endif
7119 #else
7120   #ifndef NO_WRITE_EXEC
7121   // not all systems allow execute in data segment by default
7122   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7123     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7124   #endif
7125 #endif
7126   out=(u_char *)BASE_ADDR;
7127   cycle_multiplier=200;
7128   new_dynarec_clear_full();
7129 #ifdef HOST_IMM8
7130   // Copy this into local area so we don't have to put it in every literal pool
7131   invc_ptr=invalid_code;
7132 #endif
7133   arch_init();
7134   new_dynarec_test();
7135 #ifndef RAM_FIXED
7136   ram_offset=(u_int)rdram-0x80000000;
7137 #endif
7138   if (ram_offset!=0)
7139     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
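  // Note: generated code reaches guest RAM as host_addr = guest_addr +
  // ram_offset; if rdram happens to sit at host address 0x80000000 the
  // offset is zero and presumably no per-access adjustment is needed,
  // hence the warning above when it is nonzero.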
7140 }
7141
7142 void new_dynarec_cleanup()
7143 {
7144   int n;
7145 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7146   #ifdef VITA
7147   sceKernelFreeMemBlock(sceBlock);
7148   sceBlock = -1;
7149   #else
7150   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7151     SysPrintf("munmap() failed\n");
7152   #endif
7153 #endif
7154   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7155   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7156   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7157   #ifdef ROM_COPY
7158   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7159   #endif
7160 }
7161
7162 static u_int *get_source_start(u_int addr, u_int *limit)
7163 {
7164   if (addr < 0x00200000 ||
7165     (0xa0000000 <= addr && addr < 0xa0200000)) {
7166     // used for BIOS calls mostly?
7167     *limit = (addr&0xa0000000)|0x00200000;
7168     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7169   }
7170   else if (!Config.HLE && (
7171     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7172     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7173     // BIOS
7174     *limit = (addr & 0xfff00000) | 0x80000;
7175     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7176   }
7177   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7178     *limit = (addr & 0x80600000) + 0x00200000;
7179     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7180   }
7181   return NULL;
7182 }
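
/* Illustrative examples of the mapping above: a typical game entry point
 * such as 0x80010000 resolves to rdram+0x10000 with *limit = 0x80200000
 * (end of the 2MB RAM window), while a BIOS address like 0xbfc00180
 * resolves to psxR+0x180 with *limit = 0xbfc80000 (512KB ROM). */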
7183
7184 static u_int scan_for_ret(u_int addr)
7185 {
7186   u_int limit = 0;
7187   u_int *mem;
7188
7189   mem = get_source_start(addr, &limit);
7190   if (mem == NULL)
7191     return addr;
7192
7193   if (limit > addr + 0x1000)
7194     limit = addr + 0x1000;
7195   for (; addr < limit; addr += 4, mem++) {
7196     if (*mem == 0x03e00008) // jr $ra
7197       return addr + 8;
7198   }
7199   return addr;
7200 }
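
/* Note: 0x03e00008 encodes "jr $ra" (SPECIAL, rs=31, funct=0x08), the usual
 * MIPS function return; addr+8 is returned so the range also covers the
 * return's delay slot, and the 0x1000-byte cap keeps a missing return from
 * scanning far past the block. */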
7201
7202 struct savestate_block {
7203   uint32_t addr;
7204   uint32_t regflags;
7205 };
7206
7207 static int addr_cmp(const void *p1_, const void *p2_)
7208 {
7209   const struct savestate_block *p1 = p1_, *p2 = p2_;
7210   return p1->addr - p2->addr;
7211 }
7212
7213 int new_dynarec_save_blocks(void *save, int size)
7214 {
7215   struct savestate_block *blocks = save;
7216   int maxcount = size / sizeof(blocks[0]);
7217   struct savestate_block tmp_blocks[1024];
7218   struct ll_entry *head;
7219   int p, s, d, o, bcnt;
7220   u_int addr;
7221
7222   o = 0;
7223   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7224     bcnt = 0;
7225     for (head = jump_in[p]; head != NULL; head = head->next) {
7226       tmp_blocks[bcnt].addr = head->vaddr;
7227       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7228       bcnt++;
7229     }
7230     if (bcnt < 1)
7231       continue;
7232     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7233
7234     addr = tmp_blocks[0].addr;
7235     for (s = d = 0; s < bcnt; s++) {
7236       if (tmp_blocks[s].addr < addr)
7237         continue;
7238       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7239         tmp_blocks[d++] = tmp_blocks[s];
7240       addr = scan_for_ret(tmp_blocks[s].addr);
7241     }
7242
7243     if (o + d > maxcount)
7244       d = maxcount - o;
7245     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7246     o += d;
7247   }
7248
7249   return o * sizeof(blocks[0]);
7250 }
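
/* Rough reading of the loop above: each jump_in bucket is sorted by start
 * address, exact duplicates are dropped, and blocks that begin inside the
 * body of an already-kept block (before the "jr $ra" located by
 * scan_for_ret) are skipped, since recompiling the containing block covers
 * them anyway.  The output is truncated to the caller's buffer and the
 * number of bytes written is returned. */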
7251
7252 void new_dynarec_load_blocks(const void *save, int size)
7253 {
7254   const struct savestate_block *blocks = save;
7255   int count = size / sizeof(blocks[0]);
7256   u_int regs_save[32];
7257   uint32_t f;
7258   int i, b;
7259
7260   get_addr(psxRegs.pc);
7261
7262   // change GPRs for speculation to at least partially work..
7263   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7264   for (i = 1; i < 32; i++)
7265     psxRegs.GPR.r[i] = 0x80000000;
7266
7267   for (b = 0; b < count; b++) {
7268     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7269       if (f & 1)
7270         psxRegs.GPR.r[i] = 0x1f800000;
7271     }
7272
7273     get_addr(blocks[b].addr);
7274
7275     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7276       if (f & 1)
7277         psxRegs.GPR.r[i] = 0x80000000;
7278     }
7279   }
7280
7281   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7282 }
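
/* Reader's note: this precompiles the saved blocks right after a savestate
 * load.  The GPRs are temporarily faked so the recompiler speculates the
 * same way it did when each block was first compiled: 0x80000000 looks like
 * a RAM pointer, and registers flagged in regflags are set to 0x1f800000
 * (the scratchpad base) -- compare the state_rflags scan at the top of
 * new_recompile_block() below.  The real GPR values are restored afterwards. */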
7283
7284 int new_recompile_block(int addr)
7285 {
7286   u_int pagelimit = 0;
7287   u_int state_rflags = 0;
7288   int i;
7289
7290   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7291   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7292   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7293   //if(debug)
7294   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7295   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7296   /*if(Count>=312978186) {
7297     rlist();
7298   }*/
7299   //rlist();
7300
7301   // this is just for speculation
7302   for (i = 1; i < 32; i++) {
7303     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7304       state_rflags |= 1 << i;
7305   }
7306
7307   start = (u_int)addr&~3;
7308   //assert(((u_int)addr&1)==0);
7309   new_dynarec_did_compile=1;
7310   if (Config.HLE && start == 0x80001000) // hlecall
7311   {
7312     // XXX: is this enough? Maybe check hleSoftCall?
7313     void *beginning=start_block();
7314     u_int page=get_page(start);
7315
7316     invalid_code[start>>12]=0;
7317     emit_movimm(start,0);
7318     emit_writeword(0,(int)&pcaddr);
7319     emit_jmp((int)new_dyna_leave);
7320     literal_pool(0);
7321     end_block(beginning);
7322     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7323     return 0;
7324   }
7325
7326   source = get_source_start(start, &pagelimit);
7327   if (source == NULL) {
7328     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7329     exit(1);
7330   }
7331
7332   /* Pass 1: disassemble */
7333   /* Pass 2: register dependencies, branch targets */
7334   /* Pass 3: register allocation */
7335   /* Pass 4: branch dependencies */
7336   /* Pass 5: pre-alloc */
7337   /* Pass 6: optimize clean/dirty state */
7338   /* Pass 7: flag 32-bit registers */
7339   /* Pass 8: assembly */
7340   /* Pass 9: linker */
7341   /* Pass 10: garbage collection / free memory */
7342
7343   int j;
7344   int done=0;
7345   unsigned int type,op,op2;
7346
7347   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7348
7349   /* Pass 1 disassembly */
7350
7351   for(i=0;!done;i++) {
7352     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7353     minimum_free_regs[i]=0;
7354     opcode[i]=op=source[i]>>26;
7355     switch(op)
7356     {
7357       case 0x00: strcpy(insn[i],"special"); type=NI;
7358         op2=source[i]&0x3f;
7359         switch(op2)
7360         {
7361           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7362           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7363           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7364           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7365           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7366           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7367           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7368           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7369           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7370           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7371           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7372           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7373           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7374           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7375           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7376           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7377           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7378           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7379           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7380           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7381           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7382           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7383           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7384           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7385           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7386           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7387           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7388           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7389           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7390           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7391           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7392           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7393           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7394           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7395           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7396 #if 0
7397           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7398           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7399           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7400           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7401           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7402           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7403           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7404           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7405           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7406           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7407           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7408           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7409           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7410           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7411           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7412           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7413           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7414 #endif
7415         }
7416         break;
7417       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7418         op2=(source[i]>>16)&0x1f;
7419         switch(op2)
7420         {
7421           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7422           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7423           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7424           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7425           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7426           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7427           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7428           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7429           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7430           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7431           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7432           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7433           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7434           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7435         }
7436         break;
7437       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7438       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7439       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7440       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7441       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7442       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7443       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7444       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7445       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7446       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7447       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7448       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7449       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7450       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7451       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7452         op2=(source[i]>>21)&0x1f;
7453         switch(op2)
7454         {
7455           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7456           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7457           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7458           switch(source[i]&0x3f)
7459           {
7460             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7461             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7462             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7463             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7464             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7465             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7466           }
7467         }
7468         break;
7469       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7470         op2=(source[i]>>21)&0x1f;
7471         switch(op2)
7472         {
7473           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7474           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7475           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7476           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7477           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7478           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7479           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7480           switch((source[i]>>16)&0x3)
7481           {
7482             case 0x00: strcpy(insn[i],"BC1F"); break;
7483             case 0x01: strcpy(insn[i],"BC1T"); break;
7484             case 0x02: strcpy(insn[i],"BC1FL"); break;
7485             case 0x03: strcpy(insn[i],"BC1TL"); break;
7486           }
7487           break;
7488           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7489           switch(source[i]&0x3f)
7490           {
7491             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7492             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7493             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7494             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7495             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7496             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7497             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7498             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7499             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7500             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7501             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7502             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7503             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7504             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7505             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7506             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7507             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7508             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7509             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7510             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7511             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7512             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7513             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7514             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7515             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7516             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7517             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7518             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7519             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7520             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7521             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7522             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7523             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7524             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7525             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7526           }
7527           break;
7528           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7529           switch(source[i]&0x3f)
7530           {
7531             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7532             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7533             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7534             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7535             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7536             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7537             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7538             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7539             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7540             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7541             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7542             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7543             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7544             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7545             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7546             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7547             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7548             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7549             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7550             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7551             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7552             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7553             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7554             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7555             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7556             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7557             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7558             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7559             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7560             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7561             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7562             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7563             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7564             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7565             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7566           }
7567           break;
7568           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7569           switch(source[i]&0x3f)
7570           {
7571             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7572             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7573           }
7574           break;
7575           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7576           switch(source[i]&0x3f)
7577           {
7578             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7579             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7580           }
7581           break;
7582         }
7583         break;
7584 #if 0
7585       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7586       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7587       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7588       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7589       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7590       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7591       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7592       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7593 #endif
7594       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7595       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7596       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7597       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7598       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7599       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7600       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7601 #if 0
7602       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7603 #endif
7604       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7605       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7606       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7607       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7608 #if 0
7609       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7610       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7611 #endif
7612       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7613       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7614       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7615       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7616 #if 0
7617       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7618       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7619       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7620 #endif
7621       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7622       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7623 #if 0
7624       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7625       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7626       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7627 #endif
7628       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7629         op2=(source[i]>>21)&0x1f;
7630         //if (op2 & 0x10) {
7631         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7632           if (gte_handlers[source[i]&0x3f]!=NULL) {
7633             if (gte_regnames[source[i]&0x3f]!=NULL)
7634               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7635             else
7636               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7637             type=C2OP;
7638           }
7639         }
7640         else switch(op2)
7641         {
7642           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7643           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7644           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7645           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7646         }
7647         break;
7648       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7649       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7650       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7651       default: strcpy(insn[i],"???"); type=NI;
7652         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7653         break;
7654     }
7655     itype[i]=type;
7656     opcode2[i]=op2;
7657     /* Get registers/immediates */
7658     lt1[i]=0;
7659     us1[i]=0;
7660     us2[i]=0;
7661     dep1[i]=0;
7662     dep2[i]=0;
7663     gte_rs[i]=gte_rt[i]=0;
7664     switch(type) {
7665       case LOAD:
7666         rs1[i]=(source[i]>>21)&0x1f;
7667         rs2[i]=0;
7668         rt1[i]=(source[i]>>16)&0x1f;
7669         rt2[i]=0;
7670         imm[i]=(short)source[i];
7671         break;
7672       case STORE:
7673       case STORELR:
7674         rs1[i]=(source[i]>>21)&0x1f;
7675         rs2[i]=(source[i]>>16)&0x1f;
7676         rt1[i]=0;
7677         rt2[i]=0;
7678         imm[i]=(short)source[i];
7679         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7680         break;
7681       case LOADLR:
7682         // LWL/LWR only load part of the register,
7683         // therefore the target register must be treated as a source too
7684         rs1[i]=(source[i]>>21)&0x1f;
7685         rs2[i]=(source[i]>>16)&0x1f;
7686         rt1[i]=(source[i]>>16)&0x1f;
7687         rt2[i]=0;
7688         imm[i]=(short)source[i];
7689         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7690         if(op==0x26) dep1[i]=rt1[i]; // LWR
7691         break;
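        /* Example (little-endian R3000A): an unaligned 32-bit load is the
         * pair "lwr rt,0(base); lwl rt,3(base)".  Each half replaces only
         * some bytes of rt, so the previous contents of rt are an input --
         * hence the rt field is also recorded as a source (rs2) above and
         * LWR sets dep1. */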
7692       case IMM16:
7693         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7694         else rs1[i]=(source[i]>>21)&0x1f;
7695         rs2[i]=0;
7696         rt1[i]=(source[i]>>16)&0x1f;
7697         rt2[i]=0;
7698         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7699           imm[i]=(unsigned short)source[i];
7700         }else{
7701           imm[i]=(short)source[i];
7702         }
7703         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7704         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7705         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7706         break;
7707       case UJUMP:
7708         rs1[i]=0;
7709         rs2[i]=0;
7710         rt1[i]=0;
7711         rt2[i]=0;
7712         // The JAL instruction writes to r31.
7713         if (op&1) {
7714           rt1[i]=31;
7715         }
7716         rs2[i]=CCREG;
7717         break;
7718       case RJUMP:
7719         rs1[i]=(source[i]>>21)&0x1f;
7720         rs2[i]=0;
7721         rt1[i]=0;
7722         rt2[i]=0;
7723         // The JALR instruction writes to rd.
7724         if (op2&1) {
7725           rt1[i]=(source[i]>>11)&0x1f;
7726         }
7727         rs2[i]=CCREG;
7728         break;
7729       case CJUMP:
7730         rs1[i]=(source[i]>>21)&0x1f;
7731         rs2[i]=(source[i]>>16)&0x1f;
7732         rt1[i]=0;
7733         rt2[i]=0;
7734         if(op&2) { // BGTZ/BLEZ
7735           rs2[i]=0;
7736         }
7737         us1[i]=rs1[i];
7738         us2[i]=rs2[i];
7739         likely[i]=op>>4;
7740         break;
7741       case SJUMP:
7742         rs1[i]=(source[i]>>21)&0x1f;
7743         rs2[i]=CCREG;
7744         rt1[i]=0;
7745         rt2[i]=0;
7746         us1[i]=rs1[i];
7747         if(op2&0x10) { // BxxAL
7748           rt1[i]=31;
7749           // NOTE: If the branch is not taken, r31 is still overwritten
7750         }
7751         likely[i]=(op2&2)>>1;
7752         break;
7753       case FJUMP:
7754         rs1[i]=FSREG;
7755         rs2[i]=CSREG;
7756         rt1[i]=0;
7757         rt2[i]=0;
7758         likely[i]=((source[i])>>17)&1;
7759         break;
7760       case ALU:
7761         rs1[i]=(source[i]>>21)&0x1f; // source
7762         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7763         rt1[i]=(source[i]>>11)&0x1f; // destination
7764         rt2[i]=0;
7765         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7766           us1[i]=rs1[i];us2[i]=rs2[i];
7767         }
7768         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7769           dep1[i]=rs1[i];dep2[i]=rs2[i];
7770         }
7771         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7772           dep1[i]=rs1[i];dep2[i]=rs2[i];
7773         }
7774         break;
7775       case MULTDIV:
7776         rs1[i]=(source[i]>>21)&0x1f; // source
7777         rs2[i]=(source[i]>>16)&0x1f; // divisor
7778         rt1[i]=HIREG;
7779         rt2[i]=LOREG;
7780         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7781           us1[i]=rs1[i];us2[i]=rs2[i];
7782         }
7783         break;
7784       case MOV:
7785         rs1[i]=0;
7786         rs2[i]=0;
7787         rt1[i]=0;
7788         rt2[i]=0;
7789         if(op2==0x10) rs1[i]=HIREG; // MFHI
7790         if(op2==0x11) rt1[i]=HIREG; // MTHI
7791         if(op2==0x12) rs1[i]=LOREG; // MFLO
7792         if(op2==0x13) rt1[i]=LOREG; // MTLO
7793         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7794         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7795         dep1[i]=rs1[i];
7796         break;
7797       case SHIFT:
7798         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7799         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7800         rt1[i]=(source[i]>>11)&0x1f; // destination
7801         rt2[i]=0;
7802         // DSLLV/DSRLV/DSRAV are 64-bit
7803         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7804         break;
7805       case SHIFTIMM:
7806         rs1[i]=(source[i]>>16)&0x1f;
7807         rs2[i]=0;
7808         rt1[i]=(source[i]>>11)&0x1f;
7809         rt2[i]=0;
7810         imm[i]=(source[i]>>6)&0x1f;
7811         // DSxx32 instructions
7812         if(op2>=0x3c) imm[i]|=0x20;
7813         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7814         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7815         break;
7816       case COP0:
7817         rs1[i]=0;
7818         rs2[i]=0;
7819         rt1[i]=0;
7820         rt2[i]=0;
7821         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7822         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7823         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7824         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7825         break;
7826       case COP1:
7827         rs1[i]=0;
7828         rs2[i]=0;
7829         rt1[i]=0;
7830         rt2[i]=0;
7831         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7832         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7833         if(op2==5) us1[i]=rs1[i]; // DMTC1
7834         rs2[i]=CSREG;
7835         break;
7836       case COP2:
7837         rs1[i]=0;
7838         rs2[i]=0;
7839         rt1[i]=0;
7840         rt2[i]=0;
7841         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7842         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7843         rs2[i]=CSREG;
7844         int gr=(source[i]>>11)&0x1F;
7845         switch(op2)
7846         {
7847           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7848           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7849           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7850           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7851         }
7852         break;
7853       case C1LS:
7854         rs1[i]=(source[i]>>21)&0x1F;
7855         rs2[i]=CSREG;
7856         rt1[i]=0;
7857         rt2[i]=0;
7858         imm[i]=(short)source[i];
7859         break;
7860       case C2LS:
7861         rs1[i]=(source[i]>>21)&0x1F;
7862         rs2[i]=0;
7863         rt1[i]=0;
7864         rt2[i]=0;
7865         imm[i]=(short)source[i];
7866         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7867         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7868         break;
7869       case C2OP:
7870         rs1[i]=0;
7871         rs2[i]=0;
7872         rt1[i]=0;
7873         rt2[i]=0;
7874         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7875         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7876         gte_rt[i]|=1ll<<63; // every op changes flags
7877         if((source[i]&0x3f)==GTE_MVMVA) {
7878           int v = (source[i] >> 15) & 3;
7879           gte_rs[i]&=~0xe3fll;
7880           if(v==3) gte_rs[i]|=0xe00ll;
7881           else gte_rs[i]|=3ll<<(v*2);
7882         }
7883         break;
7884       case FLOAT:
7885       case FCONV:
7886         rs1[i]=0;
7887         rs2[i]=CSREG;
7888         rt1[i]=0;
7889         rt2[i]=0;
7890         break;
7891       case FCOMP:
7892         rs1[i]=FSREG;
7893         rs2[i]=CSREG;
7894         rt1[i]=FSREG;
7895         rt2[i]=0;
7896         break;
7897       case SYSCALL:
7898       case HLECALL:
7899       case INTCALL:
7900         rs1[i]=CCREG;
7901         rs2[i]=0;
7902         rt1[i]=0;
7903         rt2[i]=0;
7904         break;
7905       default:
7906         rs1[i]=0;
7907         rs2[i]=0;
7908         rt1[i]=0;
7909         rt2[i]=0;
7910     }
7911     /* Calculate branch target addresses */
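    // J/JAL: target = (address of the delay slot & 0xF0000000) | (26-bit instr_index << 2)
    // conditional branches: target = address of the delay slot + (sign-extended 16-bit offset << 2)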
7912     if(type==UJUMP)
7913       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7914     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7915       ba[i]=start+i*4+8; // Ignore never taken branch
7916     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7917       ba[i]=start+i*4+8; // Ignore never taken branch
7918     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7919       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7920     else ba[i]=-1;
7921     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7922       int do_in_intrp=0;
7923       // branch in delay slot?
7924       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7925         // not handled: turn the first branch into an interpreter call so this sequence runs in the interpreter if hit
7926         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7927         do_in_intrp=1;
7928       }
7929       // basic load delay detection
7930       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7931         int t=(ba[i-1]-start)/4;
7932         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7933           // jump target wants DS result - potential load delay effect
7934           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7935           do_in_intrp=1;
7936           bt[t+1]=1; // expected return from interpreter
7937         }
7938         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7939               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7940           // v0 overwrite like this is a sign of trouble, bail out
7941           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7942           do_in_intrp=1;
7943         }
7944       }
7945       if(do_in_intrp) {
7946         rs1[i-1]=CCREG;
7947         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7948         ba[i-1]=-1;
7949         itype[i-1]=INTCALL;
7950         done=2;
7951         i--; // don't compile the DS
7952       }
7953     }
7954     /* Is this the end of the block? */
7955     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7956       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7957         done=2;
7958       }
7959       else {
7960         if(stop_after_jal) done=1;
7961         // Stop on BREAK
7962         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7963       }
7964       // Don't recompile stuff that's already compiled
7965       if(check_addr(start+i*4+4)) done=1;
7966       // Don't get too close to the limit
7967       if(i>MAXBLOCK/2) done=1;
7968     }
7969     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7970     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7971     if(done==2) {
7972       // Does the block continue due to a branch?
7973       for(j=i-1;j>=0;j--)
7974       {
7975         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7976         if(ba[j]==start+i*4+4) done=j=0;
7977         if(ba[j]==start+i*4+8) done=j=0;
7978       }
7979     }
7980     //assert(i<MAXBLOCK-1);
7981     if(start+i*4==pagelimit-4) done=1;
7982     assert(start+i*4<pagelimit);
7983     if (i==MAXBLOCK-1) done=1;
7984     // Stop if we're compiling junk
7985     if(itype[i]==NI&&opcode[i]==0x11) {
7986       done=stop_after_jal=1;
7987       SysPrintf("Disabled speculative precompilation\n");
7988     }
7989   }
7990   slen=i;
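  // If the last decoded instruction is a branch sitting right at the page
  // limit, its delay slot falls on the next page; mark it SPAN so it gets
  // the special page-crossing treatment.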
7991   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7992     if(start+i*4==pagelimit) {
7993       itype[i-1]=SPAN;
7994     }
7995   }
7996   assert(slen>0);
7997
7998   /* Pass 2 - Register dependencies and branch targets */
7999
8000   unneeded_registers(0,slen-1,0);
8001
8002   /* Pass 3 - Register allocation */
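  // Forward scan over the block: for each instruction work out the incoming
  // register map (regmap_entry), allocate host registers for the op (and, for
  // branches, its delay slot), and record the result in regs[i]/branch_regs[].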
8003
8004   struct regstat current; // Current register allocations/status
8005   current.is32=1;
8006   current.dirty=0;
8007   current.u=unneeded_reg[0];
8008   current.uu=unneeded_reg_upper[0];
8009   clear_all_regs(current.regmap);
8010   alloc_reg(&current,0,CCREG);
8011   dirty_reg(&current,CCREG);
8012   current.isconst=0;
8013   current.wasconst=0;
8014   current.waswritten=0;
8015   int ds=0;
8016   int cc=0;
8017   int hr=-1;
8018
8019   if((u_int)addr&1) {
8020     // First instruction is delay slot
8021     cc=-1;
8022     bt[1]=1;
8023     ds=1;
8024     unneeded_reg[0]=1;
8025     unneeded_reg_upper[0]=1;
8026     current.regmap[HOST_BTREG]=BTREG;
8027   }
8028
8029   for(i=0;i<slen;i++)
8030   {
8031     if(bt[i])
8032     {
8033       int hr;
8034       for(hr=0;hr<HOST_REGS;hr++)
8035       {
8036         // Is this really necessary?
8037         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8038       }
8039       current.isconst=0;
8040       current.waswritten=0;
8041     }
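    // On the fall-through path of a BNE/BNEL against $zero the compared
    // register must be zero, so it is known to be 32-bit and its upper-half
    // host register can be released.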
8042     if(i>1)
8043     {
8044       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8045       {
8046         if(rs1[i-2]==0||rs2[i-2]==0)
8047         {
8048           if(rs1[i-2]) {
8049             current.is32|=1LL<<rs1[i-2];
8050             int hr=get_reg(current.regmap,rs1[i-2]|64);
8051             if(hr>=0) current.regmap[hr]=-1;
8052           }
8053           if(rs2[i-2]) {
8054             current.is32|=1LL<<rs2[i-2];
8055             int hr=get_reg(current.regmap,rs2[i-2]|64);
8056             if(hr>=0) current.regmap[hr]=-1;
8057           }
8058         }
8059       }
8060     }
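    // PSX: the R3000A is a 32-bit CPU, so every register is treated as 32-bit.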
8061     current.is32=-1LL;
8062
8063     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8064     regs[i].wasconst=current.isconst;
8065     regs[i].was32=current.is32;
8066     regs[i].wasdirty=current.dirty;
8067     regs[i].loadedconst=0;
8068     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8069       if(i+1<slen) {
8070         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8071         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8072         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8073         current.u|=1;
8074         current.uu|=1;
8075       } else {
8076         current.u=1;
8077         current.uu=1;
8078       }
8079     } else {
8080       if(i+1<slen) {
8081         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8082         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8083         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8084         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8085         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8086         current.u|=1;
8087         current.uu|=1;
8088       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8089     }
8090     is_ds[i]=ds;
8091     if(ds) {
8092       ds=0; // Skip delay slot, already allocated as part of branch
8093       // ...but we need to alloc it in case something jumps here
8094       if(i+1<slen) {
8095         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8096         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8097       }else{
8098         current.u=branch_unneeded_reg[i-1];
8099         current.uu=branch_unneeded_reg_upper[i-1];
8100       }
8101       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8102       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8103       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8104       current.u|=1;
8105       current.uu|=1;
8106       struct regstat temp;
8107       memcpy(&temp,&current,sizeof(current));
8108       temp.wasdirty=temp.dirty;
8109       temp.was32=temp.is32;
8110       // TODO: Take into account unconditional branches, as below
8111       delayslot_alloc(&temp,i);
8112       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8113       regs[i].wasdirty=temp.wasdirty;
8114       regs[i].was32=temp.was32;
8115       regs[i].dirty=temp.dirty;
8116       regs[i].is32=temp.is32;
8117       regs[i].isconst=0;
8118       regs[i].wasconst=0;
8119       current.isconst=0;
8120       // Create entry (branch target) regmap
8121       for(hr=0;hr<HOST_REGS;hr++)
8122       {
8123         int r=temp.regmap[hr];
8124         if(r>=0) {
8125           if(r!=regmap_pre[i][hr]) {
8126             regs[i].regmap_entry[hr]=-1;
8127           }
8128           else
8129           {
8130             if(r<64){
8131               if((current.u>>r)&1) {
8132                 regs[i].regmap_entry[hr]=-1;
8133                 regs[i].regmap[hr]=-1;
8134                 //Don't clear regs in the delay slot as the branch might need them
8135                 //current.regmap[hr]=-1;
8136               }else
8137                 regs[i].regmap_entry[hr]=r;
8138             }
8139             else {
8140               if((current.uu>>(r&63))&1) {
8141                 regs[i].regmap_entry[hr]=-1;
8142                 regs[i].regmap[hr]=-1;
8143                 //Don't clear regs in the delay slot as the branch might need them
8144                 //current.regmap[hr]=-1;
8145               }else
8146                 regs[i].regmap_entry[hr]=r;
8147             }
8148           }
8149         } else {
8150           // First instruction expects CCREG to be allocated
8151           if(i==0&&hr==HOST_CCREG)
8152             regs[i].regmap_entry[hr]=CCREG;
8153           else
8154             regs[i].regmap_entry[hr]=-1;
8155         }
8156       }
8157     }
8158     else { // Not delay slot
8159       switch(itype[i]) {
8160         case UJUMP:
8161           //current.isconst=0; // DEBUG
8162           //current.wasconst=0; // DEBUG
8163           //regs[i].wasconst=0; // DEBUG
8164           clear_const(&current,rt1[i]);
8165           alloc_cc(&current,i);
8166           dirty_reg(&current,CCREG);
8167           if (rt1[i]==31) {
8168             alloc_reg(&current,i,31);
8169             dirty_reg(&current,31);
8170             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8171             //assert(rt1[i+1]!=rt1[i]);
8172             #ifdef REG_PREFETCH
8173             alloc_reg(&current,i,PTEMP);
8174             #endif
8175             //current.is32|=1LL<<rt1[i];
8176           }
8177           ooo[i]=1; // out of order: the delay slot is assembled before the branch test
8178           delayslot_alloc(&current,i+1);
8179           //current.isconst=0; // DEBUG
8180           ds=1;
8181           //printf("i=%d, isconst=%x\n",i,current.isconst);
8182           break;
8183         case RJUMP:
8184           //current.isconst=0;
8185           //current.wasconst=0;
8186           //regs[i].wasconst=0;
8187           clear_const(&current,rs1[i]);
8188           clear_const(&current,rt1[i]);
8189           alloc_cc(&current,i);
8190           dirty_reg(&current,CCREG);
8191           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8192             alloc_reg(&current,i,rs1[i]);
8193             if (rt1[i]!=0) {
8194               alloc_reg(&current,i,rt1[i]);
8195               dirty_reg(&current,rt1[i]);
8196               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8197               assert(rt1[i+1]!=rt1[i]);
8198               #ifdef REG_PREFETCH
8199               alloc_reg(&current,i,PTEMP);
8200               #endif
8201             }
8202             #ifdef USE_MINI_HT
8203             if(rs1[i]==31) { // JALR
8204               alloc_reg(&current,i,RHASH);
8205               #ifndef HOST_IMM_ADDR32
8206               alloc_reg(&current,i,RHTBL);
8207               #endif
8208             }
8209             #endif
8210             delayslot_alloc(&current,i+1);
8211           } else {
8212             // The delay slot overwrites our source register,
8213             // allocate a temporary register to hold the old value.
8214             current.isconst=0;
8215             current.wasconst=0;
8216             regs[i].wasconst=0;
8217             delayslot_alloc(&current,i+1);
8218             current.isconst=0;
8219             alloc_reg(&current,i,RTEMP);
8220           }
8221           //current.isconst=0; // DEBUG
8222           ooo[i]=1;
8223           ds=1;
8224           break;
8225         case CJUMP:
8226           //current.isconst=0;
8227           //current.wasconst=0;
8228           //regs[i].wasconst=0;
8229           clear_const(&current,rs1[i]);
8230           clear_const(&current,rs2[i]);
8231           if((opcode[i]&0x3E)==4) // BEQ/BNE
8232           {
8233             alloc_cc(&current,i);
8234             dirty_reg(&current,CCREG);
8235             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8236             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8237             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8238             {
8239               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8240               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8241             }
8242             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8243                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8244               // The delay slot overwrites one of our conditions.
8245               // Allocate the branch condition registers instead.
8246               current.isconst=0;
8247               current.wasconst=0;
8248               regs[i].wasconst=0;
8249               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8250               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8251               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8252               {
8253                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8254                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8255               }
8256             }
8257             else
8258             {
8259               ooo[i]=1;
8260               delayslot_alloc(&current,i+1);
8261             }
8262           }
8263           else
8264           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8265           {
8266             alloc_cc(&current,i);
8267             dirty_reg(&current,CCREG);
8268             alloc_reg(&current,i,rs1[i]);
8269             if(!(current.is32>>rs1[i]&1))
8270             {
8271               alloc_reg64(&current,i,rs1[i]);
8272             }
8273             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8274               // The delay slot overwrites one of our conditions.
8275               // Allocate the branch condition registers instead.
8276               current.isconst=0;
8277               current.wasconst=0;
8278               regs[i].wasconst=0;
8279               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8280               if(!((current.is32>>rs1[i])&1))
8281               {
8282                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8283               }
8284             }
8285             else
8286             {
8287               ooo[i]=1;
8288               delayslot_alloc(&current,i+1);
8289             }
8290           }
8291           else
8292           // Don't alloc the delay slot yet because we might not execute it
8293           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8294           {
8295             current.isconst=0;
8296             current.wasconst=0;
8297             regs[i].wasconst=0;
8298             alloc_cc(&current,i);
8299             dirty_reg(&current,CCREG);
8300             alloc_reg(&current,i,rs1[i]);
8301             alloc_reg(&current,i,rs2[i]);
8302             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8303             {
8304               alloc_reg64(&current,i,rs1[i]);
8305               alloc_reg64(&current,i,rs2[i]);
8306             }
8307           }
8308           else
8309           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8310           {
8311             current.isconst=0;
8312             current.wasconst=0;
8313             regs[i].wasconst=0;
8314             alloc_cc(&current,i);
8315             dirty_reg(&current,CCREG);
8316             alloc_reg(&current,i,rs1[i]);
8317             if(!(current.is32>>rs1[i]&1))
8318             {
8319               alloc_reg64(&current,i,rs1[i]);
8320             }
8321           }
8322           ds=1;
8323           //current.isconst=0;
8324           break;
8325         case SJUMP:
8326           //current.isconst=0;
8327           //current.wasconst=0;
8328           //regs[i].wasconst=0;
8329           clear_const(&current,rs1[i]);
8330           clear_const(&current,rt1[i]);
8331           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8332           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8333           {
8334             alloc_cc(&current,i);
8335             dirty_reg(&current,CCREG);
8336             alloc_reg(&current,i,rs1[i]);
8337             if(!(current.is32>>rs1[i]&1))
8338             {
8339               alloc_reg64(&current,i,rs1[i]);
8340             }
8341             if (rt1[i]==31) { // BLTZAL/BGEZAL
8342               alloc_reg(&current,i,31);
8343               dirty_reg(&current,31);
8344               //#ifdef REG_PREFETCH
8345               //alloc_reg(&current,i,PTEMP);
8346               //#endif
8347               //current.is32|=1LL<<rt1[i];
8348             }
8349             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8350                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8351               // Allocate the branch condition registers instead.
8352               current.isconst=0;
8353               current.wasconst=0;
8354               regs[i].wasconst=0;
8355               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8356               if(!((current.is32>>rs1[i])&1))
8357               {
8358                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8359               }
8360             }
8361             else
8362             {
8363               ooo[i]=1;
8364               delayslot_alloc(&current,i+1);
8365             }
8366           }
8367           else
8368           // Don't alloc the delay slot yet because we might not execute it
8369           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8370           {
8371             current.isconst=0;
8372             current.wasconst=0;
8373             regs[i].wasconst=0;
8374             alloc_cc(&current,i);
8375             dirty_reg(&current,CCREG);
8376             alloc_reg(&current,i,rs1[i]);
8377             if(!(current.is32>>rs1[i]&1))
8378             {
8379               alloc_reg64(&current,i,rs1[i]);
8380             }
8381           }
8382           ds=1;
8383           //current.isconst=0;
8384           break;
8385         case FJUMP:
8386           current.isconst=0;
8387           current.wasconst=0;
8388           regs[i].wasconst=0;
8389           if(likely[i]==0) // BC1F/BC1T
8390           {
8391             // TODO: Theoretically we can run out of registers here on x86.
8392             // The delay slot can allocate up to six, and we need to check
8393             // CSREG before executing the delay slot.  Possibly we can drop
8394             // the cycle count and then reload it after checking that the
8395             // FPU is in a usable state, or don't do out-of-order execution.
8396             alloc_cc(&current,i);
8397             dirty_reg(&current,CCREG);
8398             alloc_reg(&current,i,FSREG);
8399             alloc_reg(&current,i,CSREG);
8400             if(itype[i+1]==FCOMP) {
8401               // The delay slot overwrites the branch condition.
8402               // Allocate the branch condition registers instead.
8403               alloc_cc(&current,i);
8404               dirty_reg(&current,CCREG);
8405               alloc_reg(&current,i,CSREG);
8406               alloc_reg(&current,i,FSREG);
8407             }
8408             else {
8409               ooo[i]=1;
8410               delayslot_alloc(&current,i+1);
8411               alloc_reg(&current,i+1,CSREG);
8412             }
8413           }
8414           else
8415           // Don't alloc the delay slot yet because we might not execute it
8416           if(likely[i]) // BC1FL/BC1TL
8417           {
8418             alloc_cc(&current,i);
8419             dirty_reg(&current,CCREG);
8420             alloc_reg(&current,i,CSREG);
8421             alloc_reg(&current,i,FSREG);
8422           }
8423           ds=1;
8424           current.isconst=0;
8425           break;
8426         case IMM16:
8427           imm16_alloc(&current,i);
8428           break;
8429         case LOAD:
8430         case LOADLR:
8431           load_alloc(&current,i);
8432           break;
8433         case STORE:
8434         case STORELR:
8435           store_alloc(&current,i);
8436           break;
8437         case ALU:
8438           alu_alloc(&current,i);
8439           break;
8440         case SHIFT:
8441           shift_alloc(&current,i);
8442           break;
8443         case MULTDIV:
8444           multdiv_alloc(&current,i);
8445           break;
8446         case SHIFTIMM:
8447           shiftimm_alloc(&current,i);
8448           break;
8449         case MOV:
8450           mov_alloc(&current,i);
8451           break;
8452         case COP0:
8453           cop0_alloc(&current,i);
8454           break;
8455         case COP1:
8456         case COP2:
8457           cop1_alloc(&current,i);
8458           break;
8459         case C1LS:
8460           c1ls_alloc(&current,i);
8461           break;
8462         case C2LS:
8463           c2ls_alloc(&current,i);
8464           break;
8465         case C2OP:
8466           c2op_alloc(&current,i);
8467           break;
8468         case FCONV:
8469           fconv_alloc(&current,i);
8470           break;
8471         case FLOAT:
8472           float_alloc(&current,i);
8473           break;
8474         case FCOMP:
8475           fcomp_alloc(&current,i);
8476           break;
8477         case SYSCALL:
8478         case HLECALL:
8479         case INTCALL:
8480           syscall_alloc(&current,i);
8481           break;
8482         case SPAN:
8483           pagespan_alloc(&current,i);
8484           break;
8485       }
8486
8487       // Drop the upper half of registers that have become 32-bit
8488       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8489       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8490         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8491         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8492         current.uu|=1;
8493       } else {
8494         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8495         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8496         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8497         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8498         current.uu|=1;
8499       }
8500
8501       // Create entry (branch target) regmap
8502       for(hr=0;hr<HOST_REGS;hr++)
8503       {
8504         int r,or;
8505         r=current.regmap[hr];
8506         if(r>=0) {
8507           if(r!=regmap_pre[i][hr]) {
8508             // TODO: delay slot (?)
8509             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8510             if(or<0||(r&63)>=TEMPREG){
8511               regs[i].regmap_entry[hr]=-1;
8512             }
8513             else
8514             {
8515               // Just move it to a different register
8516               regs[i].regmap_entry[hr]=r;
8517               // If it was dirty before, it's still dirty
8518               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8519             }
8520           }
8521           else
8522           {
8523             // Unneeded
8524             if(r==0){
8525               regs[i].regmap_entry[hr]=0;
8526             }
8527             else
8528             if(r<64){
8529               if((current.u>>r)&1) {
8530                 regs[i].regmap_entry[hr]=-1;
8531                 //regs[i].regmap[hr]=-1;
8532                 current.regmap[hr]=-1;
8533               }else
8534                 regs[i].regmap_entry[hr]=r;
8535             }
8536             else {
8537               if((current.uu>>(r&63))&1) {
8538                 regs[i].regmap_entry[hr]=-1;
8539                 //regs[i].regmap[hr]=-1;
8540                 current.regmap[hr]=-1;
8541               }else
8542                 regs[i].regmap_entry[hr]=r;
8543             }
8544           }
8545         } else {
8546           // Branches expect CCREG to be allocated at the target
8547           if(regmap_pre[i][hr]==CCREG)
8548             regs[i].regmap_entry[hr]=CCREG;
8549           else
8550             regs[i].regmap_entry[hr]=-1;
8551         }
8552       }
8553       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8554     }
8555
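    // Track MIPS registers recently used as a store base with a small offset
    // (waswritten); the bit is cleared when the register is overwritten or
    // used as a base with a large offset.  Later stores through the same base
    // can then skip some of the invalidation checks.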
8556     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8557       current.waswritten|=1<<rs1[i-1];
8558     current.waswritten&=~(1<<rt1[i]);
8559     current.waswritten&=~(1<<rt2[i]);
8560     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8561       current.waswritten&=~(1<<rs1[i]);
8562
8563     /* Branch post-alloc */
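    // For the branch at i-1, build branch_regs[i-1]: the register state on the
    // taken path.  If the delay slot clobbers the branch condition it is
    // allocated after the test (in order); otherwise it was already allocated
    // before the test (out of order) and only the condition registers are
    // added here.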
8564     if(i>0)
8565     {
8566       current.was32=current.is32;
8567       current.wasdirty=current.dirty;
8568       switch(itype[i-1]) {
8569         case UJUMP:
8570           memcpy(&branch_regs[i-1],&current,sizeof(current));
8571           branch_regs[i-1].isconst=0;
8572           branch_regs[i-1].wasconst=0;
8573           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8574           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8575           alloc_cc(&branch_regs[i-1],i-1);
8576           dirty_reg(&branch_regs[i-1],CCREG);
8577           if(rt1[i-1]==31) { // JAL
8578             alloc_reg(&branch_regs[i-1],i-1,31);
8579             dirty_reg(&branch_regs[i-1],31);
8580             branch_regs[i-1].is32|=1LL<<31;
8581           }
8582           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8583           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8584           break;
8585         case RJUMP:
8586           memcpy(&branch_regs[i-1],&current,sizeof(current));
8587           branch_regs[i-1].isconst=0;
8588           branch_regs[i-1].wasconst=0;
8589           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8590           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8591           alloc_cc(&branch_regs[i-1],i-1);
8592           dirty_reg(&branch_regs[i-1],CCREG);
8593           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8594           if(rt1[i-1]!=0) { // JALR
8595             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8596             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8597             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8598           }
8599           #ifdef USE_MINI_HT
8600           if(rs1[i-1]==31) { // JALR
8601             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8602             #ifndef HOST_IMM_ADDR32
8603             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8604             #endif
8605           }
8606           #endif
8607           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8608           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8609           break;
8610         case CJUMP:
8611           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8612           {
8613             alloc_cc(&current,i-1);
8614             dirty_reg(&current,CCREG);
8615             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8616                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8617               // The delay slot overwrote one of our conditions
8618               // Delay slot goes after the test (in order)
8619               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8620               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8621               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8622               current.u|=1;
8623               current.uu|=1;
8624               delayslot_alloc(&current,i);
8625               current.isconst=0;
8626             }
8627             else
8628             {
8629               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8630               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8631               // Alloc the branch condition registers
8632               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8633               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8634               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8635               {
8636                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8637                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8638               }
8639             }
8640             memcpy(&branch_regs[i-1],&current,sizeof(current));
8641             branch_regs[i-1].isconst=0;
8642             branch_regs[i-1].wasconst=0;
8643             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8644             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8645           }
8646           else
8647           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8648           {
8649             alloc_cc(&current,i-1);
8650             dirty_reg(&current,CCREG);
8651             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8652               // The delay slot overwrote the branch condition
8653               // Delay slot goes after the test (in order)
8654               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8655               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8656               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8657               current.u|=1;
8658               current.uu|=1;
8659               delayslot_alloc(&current,i);
8660               current.isconst=0;
8661             }
8662             else
8663             {
8664               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8665               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8666               // Alloc the branch condition register
8667               alloc_reg(&current,i-1,rs1[i-1]);
8668               if(!(current.is32>>rs1[i-1]&1))
8669               {
8670                 alloc_reg64(&current,i-1,rs1[i-1]);
8671               }
8672             }
8673             memcpy(&branch_regs[i-1],&current,sizeof(current));
8674             branch_regs[i-1].isconst=0;
8675             branch_regs[i-1].wasconst=0;
8676             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8677             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8678           }
8679           else
8680           // Alloc the delay slot in case the branch is taken
8681           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8682           {
8683             memcpy(&branch_regs[i-1],&current,sizeof(current));
8684             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8685             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8686             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8687             alloc_cc(&branch_regs[i-1],i);
8688             dirty_reg(&branch_regs[i-1],CCREG);
8689             delayslot_alloc(&branch_regs[i-1],i);
8690             branch_regs[i-1].isconst=0;
8691             alloc_reg(&current,i,CCREG); // Not taken path
8692             dirty_reg(&current,CCREG);
8693             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8694           }
8695           else
8696           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8697           {
8698             memcpy(&branch_regs[i-1],&current,sizeof(current));
8699             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8700             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8701             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8702             alloc_cc(&branch_regs[i-1],i);
8703             dirty_reg(&branch_regs[i-1],CCREG);
8704             delayslot_alloc(&branch_regs[i-1],i);
8705             branch_regs[i-1].isconst=0;
8706             alloc_reg(&current,i,CCREG); // Not taken path
8707             dirty_reg(&current,CCREG);
8708             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8709           }
8710           break;
8711         case SJUMP:
8712           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8713           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8714           {
8715             alloc_cc(&current,i-1);
8716             dirty_reg(&current,CCREG);
8717             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8718               // The delay slot overwrote the branch condition
8719               // Delay slot goes after the test (in order)
8720               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8721               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8722               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8723               current.u|=1;
8724               current.uu|=1;
8725               delayslot_alloc(&current,i);
8726               current.isconst=0;
8727             }
8728             else
8729             {
8730               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8731               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8732               // Alloc the branch condition register
8733               alloc_reg(&current,i-1,rs1[i-1]);
8734               if(!(current.is32>>rs1[i-1]&1))
8735               {
8736                 alloc_reg64(&current,i-1,rs1[i-1]);
8737               }
8738             }
8739             memcpy(&branch_regs[i-1],&current,sizeof(current));
8740             branch_regs[i-1].isconst=0;
8741             branch_regs[i-1].wasconst=0;
8742             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8743             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8744           }
8745           else
8746           // Alloc the delay slot in case the branch is taken
8747           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8748           {
8749             memcpy(&branch_regs[i-1],&current,sizeof(current));
8750             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8751             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8752             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8753             alloc_cc(&branch_regs[i-1],i);
8754             dirty_reg(&branch_regs[i-1],CCREG);
8755             delayslot_alloc(&branch_regs[i-1],i);
8756             branch_regs[i-1].isconst=0;
8757             alloc_reg(&current,i,CCREG); // Not taken path
8758             dirty_reg(&current,CCREG);
8759             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8760           }
8761           // FIXME: BLTZAL/BGEZAL
8762           if(opcode2[i-1]&0x10) { // BxxZAL
8763             alloc_reg(&branch_regs[i-1],i-1,31);
8764             dirty_reg(&branch_regs[i-1],31);
8765             branch_regs[i-1].is32|=1LL<<31;
8766           }
8767           break;
8768         case FJUMP:
8769           if(likely[i-1]==0) // BC1F/BC1T
8770           {
8771             alloc_cc(&current,i-1);
8772             dirty_reg(&current,CCREG);
8773             if(itype[i]==FCOMP) {
8774               // The delay slot overwrote the branch condition
8775               // Delay slot goes after the test (in order)
8776               delayslot_alloc(&current,i);
8777               current.isconst=0;
8778             }
8779             else
8780             {
8781               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8782               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8783               // Alloc the branch condition register
8784               alloc_reg(&current,i-1,FSREG);
8785             }
8786             memcpy(&branch_regs[i-1],&current,sizeof(current));
8787             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8788           }
8789           else // BC1FL/BC1TL
8790           {
8791             // Alloc the delay slot in case the branch is taken
8792             memcpy(&branch_regs[i-1],&current,sizeof(current));
8793             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8794             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8795             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8796             alloc_cc(&branch_regs[i-1],i);
8797             dirty_reg(&branch_regs[i-1],CCREG);
8798             delayslot_alloc(&branch_regs[i-1],i);
8799             branch_regs[i-1].isconst=0;
8800             alloc_reg(&current,i,CCREG); // Not taken path
8801             dirty_reg(&current,CCREG);
8802             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8803           }
8804           break;
8805       }
8806
8807       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8808       {
8809         if(rt1[i-1]==31) // JAL/JALR
8810         {
8811           // Subroutine call will return here, don't alloc any registers
8812           current.is32=1;
8813           current.dirty=0;
8814           clear_all_regs(current.regmap);
8815           alloc_reg(&current,i,CCREG);
8816           dirty_reg(&current,CCREG);
8817         }
8818         else if(i+1<slen)
8819         {
8820           // Internal branch will jump here, match registers to caller
8821           current.is32=0x3FFFFFFFFLL;
8822           current.dirty=0;
8823           clear_all_regs(current.regmap);
8824           alloc_reg(&current,i,CCREG);
8825           dirty_reg(&current,CCREG);
8826           for(j=i-1;j>=0;j--)
8827           {
8828             if(ba[j]==start+i*4+4) {
8829               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8830               current.is32=branch_regs[j].is32;
8831               current.dirty=branch_regs[j].dirty;
8832               break;
8833             }
8834           }
8835           while(j>=0) {
8836             if(ba[j]==start+i*4+4) {
8837               for(hr=0;hr<HOST_REGS;hr++) {
8838                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8839                   current.regmap[hr]=-1;
8840                 }
8841                 current.is32&=branch_regs[j].is32;
8842                 current.dirty&=branch_regs[j].dirty;
8843               }
8844             }
8845             j--;
8846           }
8847         }
8848       }
8849     }
8850
8851     // Count cycles in between branches
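    // ccadj[i] = cycles accumulated since the last branch/syscall boundary;
    // cc is reset right after branches and system calls so each segment of the
    // block is charged separately.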
8852     ccadj[i]=cc;
8853     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8854     {
8855       cc=0;
8856     }
8857 #if !defined(DRC_DBG)
8858     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8859     {
8860       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8861       cc+=gte_cycletab[source[i]&0x3f]/2;
8862     }
8863     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8864     {
8865       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8866     }
8867     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8868     {
8869       cc+=4;
8870     }
8871     else if(itype[i]==C2LS)
8872     {
8873       cc+=4;
8874     }
8875 #endif
8876     else
8877     {
8878       cc++;
8879     }
8880
8881     flush_dirty_uppers(&current);
8882     if(!is_ds[i]) {
8883       regs[i].is32=current.is32;
8884       regs[i].dirty=current.dirty;
8885       regs[i].isconst=current.isconst;
8886       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8887     }
8888     for(hr=0;hr<HOST_REGS;hr++) {
8889       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8890         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8891           regs[i].wasconst&=~(1<<hr);
8892         }
8893       }
8894     }
8895     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8896     regs[i].waswritten=current.waswritten;
8897   }
8898
8899   /* Pass 4 - Cull unused host registers */
8900
8901   uint64_t nr=0;
8902
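  // nr is a bitmask of host registers that are still live (needed) at this
  // point; the backward scan propagates liveness from each use toward earlier
  // instructions and stores the result in needed_reg[i].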
8903   for (i=slen-1;i>=0;i--)
8904   {
8905     int hr;
8906     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8907     {
8908       if(ba[i]<start || ba[i]>=(start+slen*4))
8909       {
8910         // Branch out of this block, don't need anything
8911         nr=0;
8912       }
8913       else
8914       {
8915         // Internal branch
8916         // Need whatever matches the target
8917         nr=0;
8918         int t=(ba[i]-start)>>2;
8919         for(hr=0;hr<HOST_REGS;hr++)
8920         {
8921           if(regs[i].regmap_entry[hr]>=0) {
8922             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8923           }
8924         }
8925       }
8926       // Conditional branch may need registers for following instructions
8927       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8928       {
8929         if(i<slen-2) {
8930           nr|=needed_reg[i+2];
8931           for(hr=0;hr<HOST_REGS;hr++)
8932           {
8933             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8934             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8935           }
8936         }
8937       }
8938       // Don't need stuff which is overwritten
8939       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8940       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8941       // Merge in delay slot
8942       for(hr=0;hr<HOST_REGS;hr++)
8943       {
8944         if(!likely[i]) {
8945           // These are overwritten unless the branch is "likely"
8946           // and the delay slot is nullified if not taken
8947           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8948           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8949         }
8950         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8951         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8952         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8953         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8954         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8955         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8956         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8957         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8958         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8959           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8960           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8961         }
8962         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8963           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8964           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8965         }
8966         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8967           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8968           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8969         }
8970       }
8971     }
8972     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8973     {
8974       // SYSCALL instruction (software interrupt)
8975       nr=0;
8976     }
8977     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8978     {
8979       // ERET instruction (return from interrupt)
8980       nr=0;
8981     }
8982     else // Non-branch
8983     {
8984       if(i<slen-1) {
8985         for(hr=0;hr<HOST_REGS;hr++) {
8986           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8987           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8988           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8989           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8990         }
8991       }
8992     }
8993     for(hr=0;hr<HOST_REGS;hr++)
8994     {
8995       // Overwritten registers are not needed
8996       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8997       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8998       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8999       // Source registers are needed
9000       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9001       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9002       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9003       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9004       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9005       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9006       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9007       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9008       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9009         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9010         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9011       }
9012       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9013         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9014         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9015       }
9016       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9017         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9018         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9019       }
9020       // Don't store a register immediately after writing it,
9021       // may prevent dual-issue.
9022       // But do so if this is a branch target, otherwise we
9023       // might have to load the register before the branch.
9024       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9025         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9026            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9027           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9028           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9029         }
9030         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9031            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9032           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9033           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9034         }
9035       }
9036     }
9037     // Cycle count is needed at branches.  Assume it is needed at the target too.
9038     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9039       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9040       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9041     }
9042     // Save it
9043     needed_reg[i]=nr;
9044
9045     // Deallocate unneeded registers
9046     for(hr=0;hr<HOST_REGS;hr++)
9047     {
9048       if(!((nr>>hr)&1)) {
9049         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9050         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9051            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9052            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9053         {
9054           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9055           {
9056             if(likely[i]) {
9057               regs[i].regmap[hr]=-1;
9058               regs[i].isconst&=~(1<<hr);
9059               if(i<slen-2) {
9060                 regmap_pre[i+2][hr]=-1;
9061                 regs[i+2].wasconst&=~(1<<hr);
9062               }
9063             }
9064           }
9065         }
9066         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9067         {
9068           int d1=0,d2=0,map=0,temp=0;
9069           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9070           {
9071             d1=dep1[i+1];
9072             d2=dep2[i+1];
9073           }
9074           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9075              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9076             map=INVCP;
9077           }
9078           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9079              itype[i+1]==C1LS || itype[i+1]==C2LS)
9080             temp=FTEMP;
9081           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9082              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9083              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9084              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9085              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9086              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9087              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9088              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9089              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9090              regs[i].regmap[hr]!=map )
9091           {
9092             regs[i].regmap[hr]=-1;
9093             regs[i].isconst&=~(1<<hr);
9094             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9095                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9096                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9097                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9098                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9099                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9100                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9101                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9102                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9103                branch_regs[i].regmap[hr]!=map)
9104             {
9105               branch_regs[i].regmap[hr]=-1;
9106               branch_regs[i].regmap_entry[hr]=-1;
9107               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9108               {
9109                 if(!likely[i]&&i<slen-2) {
9110                   regmap_pre[i+2][hr]=-1;
9111                   regs[i+2].wasconst&=~(1<<hr);
9112                 }
9113               }
9114             }
9115           }
9116         }
9117         else
9118         {
9119           // Non-branch
9120           if(i>0)
9121           {
9122             int d1=0,d2=0,map=-1,temp=-1;
9123             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9124             {
9125               d1=dep1[i];
9126               d2=dep2[i];
9127             }
9128             if(itype[i]==STORE || itype[i]==STORELR ||
9129                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9130               map=INVCP;
9131             }
9132             if(itype[i]==LOADLR || itype[i]==STORELR ||
9133                itype[i]==C1LS || itype[i]==C2LS)
9134               temp=FTEMP;
9135             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9136                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9137                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9138                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9139                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9140                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9141             {
9142               if(i<slen-1&&!is_ds[i]) {
9143                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9144                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9145                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9146                 {
9147                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9148                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9149                 }
9150                 regmap_pre[i+1][hr]=-1;
9151                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9152                 regs[i+1].wasconst&=~(1<<hr);
9153               }
9154               regs[i].regmap[hr]=-1;
9155               regs[i].isconst&=~(1<<hr);
9156             }
9157           }
9158         }
9159       }
9160     }
9161   }
9162
9163   /* Pass 5 - Pre-allocate registers */
9164
9165   // If a register is allocated during a loop, try to allocate it for the
9166   // entire loop, if possible.  This avoids loading/storing registers
9167   // inside of the loop.
9168
9169   signed char f_regmap[HOST_REGS];
9170   clear_all_regs(f_regmap);
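  // f_regmap holds the tentative whole-loop register assignment; entries are
  // dropped and re-seeded as the scan finds conflicting allocations.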
9171   for(i=0;i<slen-1;i++)
9172   {
9173     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9174     {
9175       if(ba[i]>=start && ba[i]<(start+i*4))
9176       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9177       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9178       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9179       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9180       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9181       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9182       {
9183         int t=(ba[i]-start)>>2;
9184         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9185         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9186         for(hr=0;hr<HOST_REGS;hr++)
9187         {
9188           if(regs[i].regmap[hr]>64) {
9189             if(!((regs[i].dirty>>hr)&1))
9190               f_regmap[hr]=regs[i].regmap[hr];
9191             else f_regmap[hr]=-1;
9192           }
9193           else if(regs[i].regmap[hr]>=0) {
9194             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9195               // dealloc old register
9196               int n;
9197               for(n=0;n<HOST_REGS;n++)
9198               {
9199                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9200               }
9201               // and alloc new one
9202               f_regmap[hr]=regs[i].regmap[hr];
9203             }
9204           }
9205           if(branch_regs[i].regmap[hr]>64) {
9206             if(!((branch_regs[i].dirty>>hr)&1))
9207               f_regmap[hr]=branch_regs[i].regmap[hr];
9208             else f_regmap[hr]=-1;
9209           }
9210           else if(branch_regs[i].regmap[hr]>=0) {
9211             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9212               // dealloc old register
9213               int n;
9214               for(n=0;n<HOST_REGS;n++)
9215               {
9216                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9217               }
9218               // and alloc new one
9219               f_regmap[hr]=branch_regs[i].regmap[hr];
9220             }
9221           }
9222           if(ooo[i]) {
9223             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9224               f_regmap[hr]=branch_regs[i].regmap[hr];
9225           }else{
9226             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9227               f_regmap[hr]=branch_regs[i].regmap[hr];
9228           }
9229           // Avoid dirty->clean transition
9230           #ifdef DESTRUCTIVE_WRITEBACK
9231           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9232           #endif
9233           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9234           // case above; however, it's always a good idea.  We can't hoist the
9235           // load if the register was already allocated, so there's no point
9236           // wasting time analyzing most of these cases.  It only "succeeds"
9237           // when the mapping was different and the load can be replaced with
9238           // a mov, which is of negligible benefit.  So such cases are
9239           // skipped below.
9240           if(f_regmap[hr]>0) {
9241             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9242               int r=f_regmap[hr];
9243               for(j=t;j<=i;j++)
9244               {
9245                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9246                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9247                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9248                 if(r>63) {
9249                   // NB This can exclude the case where the upper-half
9250                   // register is lower numbered than the lower-half
9251                   // register.  Not sure if it's worth fixing...
9252                   if(get_reg(regs[j].regmap,r&63)<0) break;
9253                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9254                   if(regs[j].is32&(1LL<<(r&63))) break;
9255                 }
9256                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9257                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9258                   int k;
9259                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9260                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9261                     if(r>63) {
9262                       if(get_reg(regs[i].regmap,r&63)<0) break;
9263                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9264                     }
9265                     k=i;
9266                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9267                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9268                         //printf("no free regs for store %x\n",start+(k-1)*4);
9269                         break;
9270                       }
9271                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9272                         //printf("no-match due to different register\n");
9273                         break;
9274                       }
9275                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9276                         //printf("no-match due to branch\n");
9277                         break;
9278                       }
9279                       // call/ret fast path assumes no registers allocated
9280                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9281                         break;
9282                       }
9283                       if(r>63) {
9284                         // NB This can exclude the case where the upper-half
9285                         // register is lower numbered than the lower-half
9286                         // register.  Not sure if it's worth fixing...
9287                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9288                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9289                       }
9290                       k--;
9291                     }
9292                     if(i<slen-1) {
9293                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9294                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9295                         //printf("bad match after branch\n");
9296                         break;
9297                       }
9298                     }
9299                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9300                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9301                       while(k<i) {
9302                         regs[k].regmap_entry[hr]=f_regmap[hr];
9303                         regs[k].regmap[hr]=f_regmap[hr];
9304                         regmap_pre[k+1][hr]=f_regmap[hr];
9305                         regs[k].wasdirty&=~(1<<hr);
9306                         regs[k].dirty&=~(1<<hr);
9307                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9308                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9309                         regs[k].wasconst&=~(1<<hr);
9310                         regs[k].isconst&=~(1<<hr);
9311                         k++;
9312                       }
9313                     }
9314                     else {
9315                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9316                       break;
9317                     }
9318                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9319                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9320                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9321                       regs[i].regmap_entry[hr]=f_regmap[hr];
9322                       regs[i].regmap[hr]=f_regmap[hr];
9323                       regs[i].wasdirty&=~(1<<hr);
9324                       regs[i].dirty&=~(1<<hr);
9325                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9326                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9327                       regs[i].wasconst&=~(1<<hr);
9328                       regs[i].isconst&=~(1<<hr);
9329                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9330                       branch_regs[i].wasdirty&=~(1<<hr);
9331                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9332                       branch_regs[i].regmap[hr]=f_regmap[hr];
9333                       branch_regs[i].dirty&=~(1<<hr);
9334                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9335                       branch_regs[i].wasconst&=~(1<<hr);
9336                       branch_regs[i].isconst&=~(1<<hr);
9337                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9338                         regmap_pre[i+2][hr]=f_regmap[hr];
9339                         regs[i+2].wasdirty&=~(1<<hr);
9340                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9341                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9342                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9343                       }
9344                     }
9345                   }
9346                   for(k=t;k<j;k++) {
9347                     // Alloc register clean at beginning of loop,
9348                     // but may dirty it in pass 6
9349                     regs[k].regmap_entry[hr]=f_regmap[hr];
9350                     regs[k].regmap[hr]=f_regmap[hr];
9351                     regs[k].dirty&=~(1<<hr);
9352                     regs[k].wasconst&=~(1<<hr);
9353                     regs[k].isconst&=~(1<<hr);
9354                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9355                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9356                       branch_regs[k].regmap[hr]=f_regmap[hr];
9357                       branch_regs[k].dirty&=~(1<<hr);
9358                       branch_regs[k].wasconst&=~(1<<hr);
9359                       branch_regs[k].isconst&=~(1<<hr);
9360                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9361                         regmap_pre[k+2][hr]=f_regmap[hr];
9362                         regs[k+2].wasdirty&=~(1<<hr);
9363                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9364                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9365                       }
9366                     }
9367                     else
9368                     {
9369                       regmap_pre[k+1][hr]=f_regmap[hr];
9370                       regs[k+1].wasdirty&=~(1<<hr);
9371                     }
9372                   }
9373                   if(regs[j].regmap[hr]==f_regmap[hr])
9374                     regs[j].regmap_entry[hr]=f_regmap[hr];
9375                   break;
9376                 }
9377                 if(j==i) break;
9378                 if(regs[j].regmap[hr]>=0)
9379                   break;
9380                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9381                   //printf("no-match due to different register\n");
9382                   break;
9383                 }
9384                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9385                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9386                   break;
9387                 }
9388                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9389                 {
9390                   // Stop on unconditional branch
9391                   break;
9392                 }
9393                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9394                 {
9395                   if(ooo[j]) {
9396                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9397                       break;
9398                   }else{
9399                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9400                       break;
9401                   }
9402                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9403                     //printf("no-match due to different register (branch)\n");
9404                     break;
9405                   }
9406                 }
9407                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9408                   //printf("No free regs for store %x\n",start+j*4);
9409                   break;
9410                 }
9411                 if(f_regmap[hr]>=64) {
9412                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9413                     break;
9414                   }
9415                   else
9416                   {
9417                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9418                       break;
9419                     }
9420                   }
9421                 }
9422               }
9423             }
9424           }
9425         }
9426       }
9427     }else{
9428       // Non-branch or undetermined branch target
9429       for(hr=0;hr<HOST_REGS;hr++)
9430       {
9431         if(hr!=EXCLUDE_REG) {
9432           if(regs[i].regmap[hr]>64) {
9433             if(!((regs[i].dirty>>hr)&1))
9434               f_regmap[hr]=regs[i].regmap[hr];
9435           }
9436           else if(regs[i].regmap[hr]>=0) {
9437             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9438               // dealloc old register
9439               int n;
9440               for(n=0;n<HOST_REGS;n++)
9441               {
9442                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9443               }
9444               // and alloc new one
9445               f_regmap[hr]=regs[i].regmap[hr];
9446             }
9447           }
9448         }
9449       }
9450       // Try to restore cycle count at branch targets
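      // (CCREG is the guest cycle counter, normally kept in HOST_CCREG.  If a
      // later instruction still has it allocated, extend that allocation back
      // to this branch target, and further backwards below, so the counter
      // does not have to be reloaded from memory here.)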
9451       if(bt[i]) {
9452         for(j=i;j<slen-1;j++) {
9453           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9454           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9455             //printf("no free regs for store %x\n",start+j*4);
9456             break;
9457           }
9458         }
9459         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9460           int k=i;
9461           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9462           while(k<j) {
9463             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9464             regs[k].regmap[HOST_CCREG]=CCREG;
9465             regmap_pre[k+1][HOST_CCREG]=CCREG;
9466             regs[k+1].wasdirty|=1<<HOST_CCREG;
9467             regs[k].dirty|=1<<HOST_CCREG;
9468             regs[k].wasconst&=~(1<<HOST_CCREG);
9469             regs[k].isconst&=~(1<<HOST_CCREG);
9470             k++;
9471           }
9472           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9473         }
9474         // Work backwards from the branch target
9475         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9476         {
9477           //printf("Extend backwards\n");
9478           int k;
9479           k=i;
9480           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9481             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9482               //printf("no free regs for store %x\n",start+(k-1)*4);
9483               break;
9484             }
9485             k--;
9486           }
9487           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9488             //printf("Extend CC, %x ->\n",start+k*4);
9489             while(k<=i) {
9490               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9491               regs[k].regmap[HOST_CCREG]=CCREG;
9492               regmap_pre[k+1][HOST_CCREG]=CCREG;
9493               regs[k+1].wasdirty|=1<<HOST_CCREG;
9494               regs[k].dirty|=1<<HOST_CCREG;
9495               regs[k].wasconst&=~(1<<HOST_CCREG);
9496               regs[k].isconst&=~(1<<HOST_CCREG);
9497               k++;
9498             }
9499           }
9500           else {
9501             //printf("Fail Extend CC, %x ->\n",start+k*4);
9502           }
9503         }
9504       }
9505       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9506          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9507          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9508          itype[i]!=FCONV&&itype[i]!=FCOMP)
9509       {
9510         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9511       }
9512     }
9513   }
9514
9515   // Cache memory offset or tlb map pointer if a register is available
9516   #ifndef HOST_IMM_ADDR32
9517   #ifndef RAM_OFFSET
9518   if(0)
9519   #endif
9520   {
9521     int earliest_available[HOST_REGS];
9522     int loop_start[HOST_REGS];
9523     int score[HOST_REGS];
9524     int end[HOST_REGS];
9525     int reg=ROREG;
9526
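    // Scoring sketch: for each host register hr, score[hr] counts how many
    // upcoming memory-access instructions could use hr to hold the RAM offset
    // (ROREG) without evicting anything, earliest_available[hr] records where
    // hr last became free, and loop_start[hr] tracks backward-branch targets so
    // the preload can be hoisted above a loop.  After scanning a run of
    // loads/stores, the highest-scoring free register is allocated to ROREG
    // over [loop_start..end].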
9527     // Init
9528     for(hr=0;hr<HOST_REGS;hr++) {
9529       score[hr]=0;earliest_available[hr]=0;
9530       loop_start[hr]=MAXBLOCK;
9531     }
9532     for(i=0;i<slen-1;i++)
9533     {
9534       // Can't do anything if no registers are available
9535       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9536         for(hr=0;hr<HOST_REGS;hr++) {
9537           score[hr]=0;earliest_available[hr]=i+1;
9538           loop_start[hr]=MAXBLOCK;
9539         }
9540       }
9541       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9542         if(!ooo[i]) {
9543           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9544             for(hr=0;hr<HOST_REGS;hr++) {
9545               score[hr]=0;earliest_available[hr]=i+1;
9546               loop_start[hr]=MAXBLOCK;
9547             }
9548           }
9549         }else{
9550           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9551             for(hr=0;hr<HOST_REGS;hr++) {
9552               score[hr]=0;earliest_available[hr]=i+1;
9553               loop_start[hr]=MAXBLOCK;
9554             }
9555           }
9556         }
9557       }
9558       // Mark unavailable registers
9559       for(hr=0;hr<HOST_REGS;hr++) {
9560         if(regs[i].regmap[hr]>=0) {
9561           score[hr]=0;earliest_available[hr]=i+1;
9562           loop_start[hr]=MAXBLOCK;
9563         }
9564         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9565           if(branch_regs[i].regmap[hr]>=0) {
9566             score[hr]=0;earliest_available[hr]=i+2;
9567             loop_start[hr]=MAXBLOCK;
9568           }
9569         }
9570       }
9571       // No register allocations after unconditional jumps
9572       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9573       {
9574         for(hr=0;hr<HOST_REGS;hr++) {
9575           score[hr]=0;earliest_available[hr]=i+2;
9576           loop_start[hr]=MAXBLOCK;
9577         }
9578         i++; // Skip delay slot too
9579         //printf("skip delay slot: %x\n",start+i*4);
9580       }
9581       else
9582       // Possible match
9583       if(itype[i]==LOAD||itype[i]==LOADLR||
9584          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9585         for(hr=0;hr<HOST_REGS;hr++) {
9586           if(hr!=EXCLUDE_REG) {
9587             end[hr]=i-1;
9588             for(j=i;j<slen-1;j++) {
9589               if(regs[j].regmap[hr]>=0) break;
9590               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9591                 if(branch_regs[j].regmap[hr]>=0) break;
9592                 if(ooo[j]) {
9593                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9594                 }else{
9595                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9596                 }
9597               }
9598               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9599               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9600                 int t=(ba[j]-start)>>2;
9601                 if(t<j&&t>=earliest_available[hr]) {
9602                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9603                     // Score a point for hoisting loop invariant
9604                     if(t<loop_start[hr]) loop_start[hr]=t;
9605                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9606                     score[hr]++;
9607                     end[hr]=j;
9608                   }
9609                 }
9610                 else if(t<j) {
9611                   if(regs[t].regmap[hr]==reg) {
9612                     // Score a point if the branch target matches this register
9613                     score[hr]++;
9614                     end[hr]=j;
9615                   }
9616                 }
9617                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9618                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9619                   score[hr]++;
9620                   end[hr]=j;
9621                 }
9622               }
9623               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9624               {
9625                 // Stop on unconditional branch
9626                 break;
9627               }
9628               else
9629               if(itype[j]==LOAD||itype[j]==LOADLR||
9630                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9631                 score[hr]++;
9632                 end[hr]=j;
9633               }
9634             }
9635           }
9636         }
9637         // Find highest score and allocate that register
9638         int maxscore=0;
9639         for(hr=0;hr<HOST_REGS;hr++) {
9640           if(hr!=EXCLUDE_REG) {
9641             if(score[hr]>score[maxscore]) {
9642               maxscore=hr;
9643               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9644             }
9645           }
9646         }
9647         if(score[maxscore]>1)
9648         {
9649           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9650           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9651             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9652             assert(regs[j].regmap[maxscore]<0);
9653             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9654             regs[j].regmap[maxscore]=reg;
9655             regs[j].dirty&=~(1<<maxscore);
9656             regs[j].wasconst&=~(1<<maxscore);
9657             regs[j].isconst&=~(1<<maxscore);
9658             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9659               branch_regs[j].regmap[maxscore]=reg;
9660               branch_regs[j].wasdirty&=~(1<<maxscore);
9661               branch_regs[j].dirty&=~(1<<maxscore);
9662               branch_regs[j].wasconst&=~(1<<maxscore);
9663               branch_regs[j].isconst&=~(1<<maxscore);
9664               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9665                 regmap_pre[j+2][maxscore]=reg;
9666                 regs[j+2].wasdirty&=~(1<<maxscore);
9667               }
9668               // loop optimization (loop_preload)
9669               int t=(ba[j]-start)>>2;
9670               if(t==loop_start[maxscore]) {
9671                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9672                   regs[t].regmap_entry[maxscore]=reg;
9673               }
9674             }
9675             else
9676             {
9677               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9678                 regmap_pre[j+1][maxscore]=reg;
9679                 regs[j+1].wasdirty&=~(1<<maxscore);
9680               }
9681             }
9682           }
9683           i=j-1;
9684           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9685           for(hr=0;hr<HOST_REGS;hr++) {
9686             score[hr]=0;earliest_available[hr]=i+1;
9687             loop_start[hr]=MAXBLOCK;
9688           }
9689         }
9690       }
9691     }
9692   }
9693   #endif
9694
9695   // This allocates registers (if possible) one instruction prior
9696   // to use, which can avoid a load-use penalty on certain CPUs.
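  // (On in-order cores a load followed immediately by a use of the loaded value
  // stalls; copying the next instruction's mapping into the current slot, when
  // the host register is free in both, lets the value be loaded or the constant
  // generated one instruction early.)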
9697   for(i=0;i<slen-1;i++)
9698   {
9699     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9700     {
9701       if(!bt[i+1])
9702       {
9703         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9704            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9705         {
9706           if(rs1[i+1]) {
9707             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9708             {
9709               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9710               {
9711                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9712                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9713                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9714                 regs[i].isconst&=~(1<<hr);
9715                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9716                 constmap[i][hr]=constmap[i+1][hr];
9717                 regs[i+1].wasdirty&=~(1<<hr);
9718                 regs[i].dirty&=~(1<<hr);
9719               }
9720             }
9721           }
9722           if(rs2[i+1]) {
9723             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9724             {
9725               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9726               {
9727                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9728                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9729                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9730                 regs[i].isconst&=~(1<<hr);
9731                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9732                 constmap[i][hr]=constmap[i+1][hr];
9733                 regs[i+1].wasdirty&=~(1<<hr);
9734                 regs[i].dirty&=~(1<<hr);
9735               }
9736             }
9737           }
9738           // Preload target address for load instruction (non-constant)
9739           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9740             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9741             {
9742               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9743               {
9744                 regs[i].regmap[hr]=rs1[i+1];
9745                 regmap_pre[i+1][hr]=rs1[i+1];
9746                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9747                 regs[i].isconst&=~(1<<hr);
9748                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9749                 constmap[i][hr]=constmap[i+1][hr];
9750                 regs[i+1].wasdirty&=~(1<<hr);
9751                 regs[i].dirty&=~(1<<hr);
9752               }
9753             }
9754           }
9755           // Load source into target register
9756           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9757             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9758             {
9759               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9760               {
9761                 regs[i].regmap[hr]=rs1[i+1];
9762                 regmap_pre[i+1][hr]=rs1[i+1];
9763                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9764                 regs[i].isconst&=~(1<<hr);
9765                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9766                 constmap[i][hr]=constmap[i+1][hr];
9767                 regs[i+1].wasdirty&=~(1<<hr);
9768                 regs[i].dirty&=~(1<<hr);
9769               }
9770             }
9771           }
9772           // Address for store instruction (non-constant)
9773           if(itype[i+1]==STORE||itype[i+1]==STORELR
9774              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9775             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9776               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9777               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9778               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9779               assert(hr>=0);
9780               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9781               {
9782                 regs[i].regmap[hr]=rs1[i+1];
9783                 regmap_pre[i+1][hr]=rs1[i+1];
9784                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9785                 regs[i].isconst&=~(1<<hr);
9786                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9787                 constmap[i][hr]=constmap[i+1][hr];
9788                 regs[i+1].wasdirty&=~(1<<hr);
9789                 regs[i].dirty&=~(1<<hr);
9790               }
9791             }
9792           }
9793           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9794             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9795               int nr;
9796               hr=get_reg(regs[i+1].regmap,FTEMP);
9797               assert(hr>=0);
9798               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9799               {
9800                 regs[i].regmap[hr]=rs1[i+1];
9801                 regmap_pre[i+1][hr]=rs1[i+1];
9802                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9803                 regs[i].isconst&=~(1<<hr);
9804                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9805                 constmap[i][hr]=constmap[i+1][hr];
9806                 regs[i+1].wasdirty&=~(1<<hr);
9807                 regs[i].dirty&=~(1<<hr);
9808               }
9809               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9810               {
9811                 // move it to another register
9812                 regs[i+1].regmap[hr]=-1;
9813                 regmap_pre[i+2][hr]=-1;
9814                 regs[i+1].regmap[nr]=FTEMP;
9815                 regmap_pre[i+2][nr]=FTEMP;
9816                 regs[i].regmap[nr]=rs1[i+1];
9817                 regmap_pre[i+1][nr]=rs1[i+1];
9818                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9819                 regs[i].isconst&=~(1<<nr);
9820                 regs[i+1].isconst&=~(1<<nr);
9821                 regs[i].dirty&=~(1<<nr);
9822                 regs[i+1].wasdirty&=~(1<<nr);
9823                 regs[i+1].dirty&=~(1<<nr);
9824                 regs[i+2].wasdirty&=~(1<<nr);
9825               }
9826             }
9827           }
9828           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9829             if(itype[i+1]==LOAD)
9830               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9831             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9832               hr=get_reg(regs[i+1].regmap,FTEMP);
9833             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9834               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9835               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9836             }
9837             if(hr>=0&&regs[i].regmap[hr]<0) {
9838               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9839               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9840                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9841                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9842                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9843                 regs[i].isconst&=~(1<<hr);
9844                 regs[i+1].wasdirty&=~(1<<hr);
9845                 regs[i].dirty&=~(1<<hr);
9846               }
9847             }
9848           }
9849         }
9850       }
9851     }
9852   }
9853
9854   /* Pass 6 - Optimize clean/dirty state */
9855   clean_registers(0,slen-1,1);
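  // (Roughly: clean_registers() decides where modified guest registers actually
  // have to be written back - at branches and block exits - and clears the
  // dirty bits elsewhere, so stores of values that are never observed in memory
  // are skipped.)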
9856
9857   /* Pass 7 - Identify 32-bit registers */
9858   for (i=slen-1;i>=0;i--)
9859   {
9860     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9861     {
9862       // Conditional branch
9863       if((source[i]>>16)!=0x1000&&i<slen-2) {
9864         // Mark this address as a branch target since it may be called
9865         // upon return from interrupt
9866         bt[i+2]=1;
9867       }
9868     }
9869   }
9870
9871   if(itype[slen-1]==SPAN) {
9872     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9873   }
9874
9875 #ifdef DISASM
9876   /* Debug/disassembly */
9877   for(i=0;i<slen;i++)
9878   {
9879     printf("U:");
9880     int r;
9881     for(r=1;r<=CCREG;r++) {
9882       if((unneeded_reg[i]>>r)&1) {
9883         if(r==HIREG) printf(" HI");
9884         else if(r==LOREG) printf(" LO");
9885         else printf(" r%d",r);
9886       }
9887     }
9888     printf("\n");
9889     #if defined(__i386__) || defined(__x86_64__)
9890     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9891     #endif
9892     #ifdef __arm__
9893     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9894     #endif
9895     printf("needs: ");
9896     if(needed_reg[i]&1) printf("eax ");
9897     if((needed_reg[i]>>1)&1) printf("ecx ");
9898     if((needed_reg[i]>>2)&1) printf("edx ");
9899     if((needed_reg[i]>>3)&1) printf("ebx ");
9900     if((needed_reg[i]>>5)&1) printf("ebp ");
9901     if((needed_reg[i]>>6)&1) printf("esi ");
9902     if((needed_reg[i]>>7)&1) printf("edi ");
9903     printf("\n");
9904     #if defined(__i386__) || defined(__x86_64__)
9905     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9906     printf("dirty: ");
9907     if(regs[i].wasdirty&1) printf("eax ");
9908     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9909     if((regs[i].wasdirty>>2)&1) printf("edx ");
9910     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9911     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9912     if((regs[i].wasdirty>>6)&1) printf("esi ");
9913     if((regs[i].wasdirty>>7)&1) printf("edi ");
9914     #endif
9915     #ifdef __arm__
9916     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9917     printf("dirty: ");
9918     if(regs[i].wasdirty&1) printf("r0 ");
9919     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9920     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9921     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9922     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9923     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9924     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9925     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9926     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9927     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9928     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9929     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9930     #endif
9931     printf("\n");
9932     disassemble_inst(i);
9933     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9934     #if defined(__i386__) || defined(__x86_64__)
9935     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9936     if(regs[i].dirty&1) printf("eax ");
9937     if((regs[i].dirty>>1)&1) printf("ecx ");
9938     if((regs[i].dirty>>2)&1) printf("edx ");
9939     if((regs[i].dirty>>3)&1) printf("ebx ");
9940     if((regs[i].dirty>>5)&1) printf("ebp ");
9941     if((regs[i].dirty>>6)&1) printf("esi ");
9942     if((regs[i].dirty>>7)&1) printf("edi ");
9943     #endif
9944     #ifdef __arm__
9945     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9946     if(regs[i].dirty&1) printf("r0 ");
9947     if((regs[i].dirty>>1)&1) printf("r1 ");
9948     if((regs[i].dirty>>2)&1) printf("r2 ");
9949     if((regs[i].dirty>>3)&1) printf("r3 ");
9950     if((regs[i].dirty>>4)&1) printf("r4 ");
9951     if((regs[i].dirty>>5)&1) printf("r5 ");
9952     if((regs[i].dirty>>6)&1) printf("r6 ");
9953     if((regs[i].dirty>>7)&1) printf("r7 ");
9954     if((regs[i].dirty>>8)&1) printf("r8 ");
9955     if((regs[i].dirty>>9)&1) printf("r9 ");
9956     if((regs[i].dirty>>10)&1) printf("r10 ");
9957     if((regs[i].dirty>>12)&1) printf("r12 ");
9958     #endif
9959     printf("\n");
9960     if(regs[i].isconst) {
9961       printf("constants: ");
9962       #if defined(__i386__) || defined(__x86_64__)
9963       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9964       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9965       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9966       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9967       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9968       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9969       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9970       #endif
9971       #ifdef __arm__
9972       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9973       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9974       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9975       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9976       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9977       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9978       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9979       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9980       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9981       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9982       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9983       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9984       #endif
9985       printf("\n");
9986     }
9987     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9988       #if defined(__i386__) || defined(__x86_64__)
9989       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9990       if(branch_regs[i].dirty&1) printf("eax ");
9991       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9992       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9993       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9994       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9995       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9996       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9997       #endif
9998       #ifdef __arm__
9999       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10000       if(branch_regs[i].dirty&1) printf("r0 ");
10001       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10002       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10003       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10004       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10005       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10006       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10007       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10008       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10009       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10010       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10011       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10012       #endif
10013     }
10014   }
10015 #endif // DISASM
10016
10017   /* Pass 8 - Assembly */
10018   linkcount=0;stubcount=0;
10019   ds=0;is_delayslot=0;
10020   cop1_usable=0;
10021   uint64_t is32_pre=0;
10022   u_int dirty_pre=0;
10023   void *beginning=start_block();
10024   if((u_int)addr&1) {
10025     ds=1;
10026     pagespan_ds();
10027   }
10028   void *instr_addr0_override = NULL;
10029
10030   if (start == 0x80030000) {
10031     // nasty hack for fastbios thing
10032     // override block entry to this code
10033     instr_addr0_override = out;
10034     emit_movimm(start,0);
10035     // abuse io address var as a flag that we
10036     // have already returned here once
10037     emit_readword((int)&address,1);
10038     emit_writeword(0,(int)&pcaddr);
10039     emit_writeword(0,(int)&address);
10040     emit_cmp(0,1);
10041     emit_jne((int)new_dyna_leave);
10042   }
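  // Main assembly loop.  For each instruction: write back or invalidate
  // registers whose mapping changes here, record the branch-target entry point
  // (instr_addr[i]), load the registers/constants the instruction and its delay
  // slot need, then dispatch on itype[] to the per-type assembler.  Delay slots
  // themselves are emitted by the preceding branch assembler, so ds skips them.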
10043   for(i=0;i<slen;i++)
10044   {
10045     //if(ds) printf("ds: ");
10046     disassemble_inst(i);
10047     if(ds) {
10048       ds=0; // Skip delay slot
10049       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10050       instr_addr[i] = NULL;
10051     } else {
10052       speculate_register_values(i);
10053       #ifndef DESTRUCTIVE_WRITEBACK
10054       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10055       {
10056         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10057               unneeded_reg[i],unneeded_reg_upper[i]);
10058       }
10059       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10060         is32_pre=branch_regs[i].is32;
10061         dirty_pre=branch_regs[i].dirty;
10062       }else{
10063         is32_pre=regs[i].is32;
10064         dirty_pre=regs[i].dirty;
10065       }
10066       #endif
10067       // write back
10068       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10069       {
10070         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10071                       unneeded_reg[i],unneeded_reg_upper[i]);
10072         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10073       }
10074       // branch target entry point
10075       instr_addr[i] = out;
10076       assem_debug("<->\n");
10077       drc_dbg_emit_do_cmp(i);
10078
10079       // load regs
10080       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10081         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10082       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10083       address_generation(i,&regs[i],regs[i].regmap_entry);
10084       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10085       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10086       {
10087         // Load the delay slot registers if necessary
10088         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10089           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10090         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10091           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10092         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10093           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10094       }
10095       else if(i+1<slen)
10096       {
10097         // Preload registers for following instruction
10098         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10099           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10100             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10101         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10102           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10103             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10104       }
10105       // TODO: if(is_ooo(i)) address_generation(i+1);
10106       if(itype[i]==CJUMP||itype[i]==FJUMP)
10107         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10108       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10109         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10110       if(bt[i]) cop1_usable=0;
10111       // assemble
10112       switch(itype[i]) {
10113         case ALU:
10114           alu_assemble(i,&regs[i]);break;
10115         case IMM16:
10116           imm16_assemble(i,&regs[i]);break;
10117         case SHIFT:
10118           shift_assemble(i,&regs[i]);break;
10119         case SHIFTIMM:
10120           shiftimm_assemble(i,&regs[i]);break;
10121         case LOAD:
10122           load_assemble(i,&regs[i]);break;
10123         case LOADLR:
10124           loadlr_assemble(i,&regs[i]);break;
10125         case STORE:
10126           store_assemble(i,&regs[i]);break;
10127         case STORELR:
10128           storelr_assemble(i,&regs[i]);break;
10129         case COP0:
10130           cop0_assemble(i,&regs[i]);break;
10131         case COP1:
10132           cop1_assemble(i,&regs[i]);break;
10133         case C1LS:
10134           c1ls_assemble(i,&regs[i]);break;
10135         case COP2:
10136           cop2_assemble(i,&regs[i]);break;
10137         case C2LS:
10138           c2ls_assemble(i,&regs[i]);break;
10139         case C2OP:
10140           c2op_assemble(i,&regs[i]);break;
10141         case FCONV:
10142           fconv_assemble(i,&regs[i]);break;
10143         case FLOAT:
10144           float_assemble(i,&regs[i]);break;
10145         case FCOMP:
10146           fcomp_assemble(i,&regs[i]);break;
10147         case MULTDIV:
10148           multdiv_assemble(i,&regs[i]);break;
10149         case MOV:
10150           mov_assemble(i,&regs[i]);break;
10151         case SYSCALL:
10152           syscall_assemble(i,&regs[i]);break;
10153         case HLECALL:
10154           hlecall_assemble(i,&regs[i]);break;
10155         case INTCALL:
10156           intcall_assemble(i,&regs[i]);break;
10157         case UJUMP:
10158           ujump_assemble(i,&regs[i]);ds=1;break;
10159         case RJUMP:
10160           rjump_assemble(i,&regs[i]);ds=1;break;
10161         case CJUMP:
10162           cjump_assemble(i,&regs[i]);ds=1;break;
10163         case SJUMP:
10164           sjump_assemble(i,&regs[i]);ds=1;break;
10165         case FJUMP:
10166           fjump_assemble(i,&regs[i]);ds=1;break;
10167         case SPAN:
10168           pagespan_assemble(i,&regs[i]);break;
10169       }
10170       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10171         literal_pool(1024);
10172       else
10173         literal_pool_jumpover(256);
10174     }
10175   }
10176   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10177   // If the block did not end with an unconditional branch,
10178   // add a jump to the next instruction.
10179   if(i>1) {
10180     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10181       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10182       assert(i==slen);
10183       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10184         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10185         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10186           emit_loadreg(CCREG,HOST_CCREG);
10187         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10188       }
10189       else if(!likely[i-2])
10190       {
10191         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10192         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10193       }
10194       else
10195       {
10196         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10197         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10198       }
10199       add_to_linker((int)out,start+i*4,0);
10200       emit_jmp(0);
10201     }
10202   }
10203   else
10204   {
10205     assert(i>0);
10206     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10207     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10208     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10209       emit_loadreg(CCREG,HOST_CCREG);
10210     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10211     add_to_linker((int)out,start+i*4,0);
10212     emit_jmp(0);
10213   }
10214
10215   // TODO: delay slot stubs?
10216   // Stubs
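  // (Stubs are the out-of-line slow paths queued during assembly: memory
  // accesses that miss the fast path, cycle-count checks, invalid-code checks,
  // coprocessor-unusable exceptions and unaligned stores.  The inline code
  // branches here only when the fast path fails.)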
10217   for(i=0;i<stubcount;i++)
10218   {
10219     switch(stubs[i][0])
10220     {
10221       case LOADB_STUB:
10222       case LOADH_STUB:
10223       case LOADW_STUB:
10224       case LOADD_STUB:
10225       case LOADBU_STUB:
10226       case LOADHU_STUB:
10227         do_readstub(i);break;
10228       case STOREB_STUB:
10229       case STOREH_STUB:
10230       case STOREW_STUB:
10231       case STORED_STUB:
10232         do_writestub(i);break;
10233       case CC_STUB:
10234         do_ccstub(i);break;
10235       case INVCODE_STUB:
10236         do_invstub(i);break;
10237       case FP_STUB:
10238         do_cop1stub(i);break;
10239       case STORELR_STUB:
10240         do_unalignedwritestub(i);break;
10241     }
10242   }
10243
10244   if (instr_addr0_override)
10245     instr_addr[0] = instr_addr0_override;
10246
10247   /* Pass 9 - Linker */
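  // Resolve the branches queued by add_to_linker().  link_addr[i][2]==0 means
  // the target lies outside this block: an extjump trampoline is emitted and,
  // if the target is already compiled (check_addr()), the branch is patched to
  // jump there directly; otherwise it goes through the trampoline so the target
  // can be resolved (and the site patched) at run time.  Internal branches are
  // simply pointed at instr_addr[target].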
10248   for(i=0;i<linkcount;i++)
10249   {
10250     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10251     literal_pool(64);
10252     if(!link_addr[i][2])
10253     {
10254       void *stub=out;
10255       void *addr=check_addr(link_addr[i][1]);
10256       emit_extjump(link_addr[i][0],link_addr[i][1]);
10257       if(addr) {
10258         set_jump_target(link_addr[i][0], addr);
10259         add_link(link_addr[i][1],stub);
10260       }
10261       else set_jump_target(link_addr[i][0], stub);
10262     }
10263     else
10264     {
10265       // Internal branch
10266       int target=(link_addr[i][1]-start)>>2;
10267       assert(target>=0&&target<slen);
10268       assert(instr_addr[target]);
10269       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10270       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10271       //#else
10272       set_jump_target(link_addr[i][0],instr_addr[target]);
10273       //#endif
10274     }
10275   }
10276   // External Branch Targets (jump_in)
10277   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
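  // (Each branch target gets a "dirty" entry in jump_dirty plus a direct entry
  // in jump_in; a copy of the source MIPS code is kept in the shadow buffer so
  // the dirty stub can check that the original code is unchanged and revalidate
  // the block instead of recompiling it.)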
10278   for(i=0;i<slen;i++)
10279   {
10280     if(bt[i]||i==0)
10281     {
10282       if(instr_addr[i]) // TODO - delay slots (=null)
10283       {
10284         u_int vaddr=start+i*4;
10285         u_int page=get_page(vaddr);
10286         u_int vpage=get_vpage(vaddr);
10287         literal_pool(256);
10288         {
10289           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10290           assem_debug("jump_in: %x\n",start+i*4);
10291           ll_add(jump_dirty+vpage,vaddr,out);
10292           void *entry_point = do_dirty_stub(i);
10293           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
10294           // If there was an existing entry in the hash table,
10295           // replace it with the new address.
10296           // Don't add new entries.  We'll insert the
10297           // ones that actually get used in check_addr().
10298           struct ht_entry *ht_bin = hash_table_get(vaddr);
10299           if (ht_bin->vaddr[0] == vaddr)
10300             ht_bin->tcaddr[0] = entry_point;
10301           if (ht_bin->vaddr[1] == vaddr)
10302             ht_bin->tcaddr[1] = entry_point;
10303         }
10304       }
10305     }
10306   }
10307   // Write out the literal pool if necessary
10308   literal_pool(0);
10309   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10310   // Align code
10311   if(((u_int)out)&7) emit_addnop(13);
10312   #endif
10313   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10314   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10315   memcpy(copy,source,slen*4);
10316   copy+=slen*4;
10317
10318   end_block(beginning);
10319
10320   // If we're within 256K of the end of the buffer,
10321   // start over from the beginning. (Is 256K enough?)
10322   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10323
10324   // Trap writes to any of the pages we compiled
10325   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10326     invalid_code[i]=0;
10327   }
10328   inv_code_start=inv_code_end=~0;
10329
10330   // for PCSX we need to mark all mirrors too
10331   if(get_page(start)<(RAM_SIZE>>12))
10332     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10333       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10334       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10335       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
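  // (PSX RAM is mirrored through KUSEG 0x00000000, KSEG0 0x80000000 and KSEG1
  // 0xa0000000; the &0x1ff mask covers the 512 4K pages of the 2MB RAM, so the
  // same page is cleared in all three mirrors.)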
10336
10337   /* Pass 10 - Free memory by expiring oldest blocks */
10338
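  // The translation cache is used as a ring buffer: expirep sweeps a 16-bit
  // cursor ahead of the output pointer.  (expirep>>13) selects one eighth of
  // the cache as the base address, (expirep>>11)&3 selects the phase (which
  // lookup structure gets cleared), and expirep&2047 selects the page slice
  // within it.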
10339   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10340   while(expirep!=end)
10341   {
10342     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10343     uintptr_t base=(uintptr_t)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10344     inv_debug("EXP: Phase %d\n",expirep);
10345     switch((expirep>>11)&3)
10346     {
10347       case 0:
10348         // Clear jump_in and jump_dirty
10349         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10350         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10351         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10352         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10353         break;
10354       case 1:
10355         // Clear pointers
10356         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10357         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10358         break;
10359       case 2:
10360         // Clear hash table
10361         for(i=0;i<32;i++) {
10362           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
10363           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
10364              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10365             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
10366             ht_bin->vaddr[1] = -1;
10367             ht_bin->tcaddr[1] = NULL;
10368           }
10369           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
10370              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10371             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
10372             ht_bin->vaddr[0] = ht_bin->vaddr[1];
10373             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
10374             ht_bin->vaddr[1] = -1;
10375             ht_bin->tcaddr[1] = NULL;
10376           }
10377         }
10378         break;
10379       case 3:
10380         // Clear jump_out
10381         #ifdef __arm__
10382         if((expirep&2047)==0)
10383           do_clear_cache();
10384         #endif
10385         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10386         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10387         break;
10388     }
10389     expirep=(expirep+1)&65535;
10390   }
10391   return 0;
10392 }
10393
10394 // vim:shiftwidth=2:expandtab