drc: remove unnecessary cache flushing
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
45 #ifdef __BLACKBERRY_QNX__
46 #undef __clear_cache
47 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
48 #elif defined(__MACH__)
49 #include <libkern/OSCacheControl.h>
50 #define __clear_cache mach_clear_cache
51 static void __clear_cache(void *start, void *end) {
52   size_t len = (char *)end - (char *)start;
53   sys_dcache_flush(start, len);
54   sys_icache_invalidate(start, len);
55 }
56 #endif
57
58 #define MAXBLOCK 4096
59 #define MAX_OUTPUT_BLOCK_SIZE 262144
60
// Per-instruction register-allocation state tracked by the recompiler.
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // mapping expected on entry to this instruction
  signed char regmap[HOST_REGS];  // MIPS reg held by each host reg (-1 = free)
  uint64_t was32;                 // regs that were 32-bit (sign-extended) before the insn
  uint64_t is32;                  // regs that are 32-bit after the insn
  uint64_t wasdirty;              // host regs not yet written back, before the insn
  uint64_t dirty;                 // host regs not yet written back, after the insn
  uint64_t u;                     // unneeded MIPS regs (lower words)
  uint64_t uu;                    // unneeded MIPS regs (upper words)
  u_int wasconst;                 // host regs that held known constants before
  u_int isconst;                  // host regs that hold known constants now
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
76
// Linked-list node mapping a MIPS virtual address to compiled code.
// note: asm depends on this layout
struct ll_entry
{
  u_int vaddr;            // MIPS virtual address of the block entry
  u_int reg_sv_flags;     // register validity flags (set via ll_add_flags)
  void *addr;             // pointer into the translation cache
  struct ll_entry *next;  // next entry for the same page
};
85
86   // used by asm:
87   u_char *out;
88   u_int hash_table[65536][4]  __attribute__((aligned(16)));
89   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
90   struct ll_entry *jump_dirty[4096];
91
92   static struct ll_entry *jump_out[4096];
93   static u_int start;
94   static u_int *source;
95   static char insn[MAXBLOCK][10];
96   static u_char itype[MAXBLOCK];
97   static u_char opcode[MAXBLOCK];
98   static u_char opcode2[MAXBLOCK];
99   static u_char bt[MAXBLOCK];
100   static u_char rs1[MAXBLOCK];
101   static u_char rs2[MAXBLOCK];
102   static u_char rt1[MAXBLOCK];
103   static u_char rt2[MAXBLOCK];
104   static u_char us1[MAXBLOCK];
105   static u_char us2[MAXBLOCK];
106   static u_char dep1[MAXBLOCK];
107   static u_char dep2[MAXBLOCK];
108   static u_char lt1[MAXBLOCK];
109   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
110   static uint64_t gte_rt[MAXBLOCK];
111   static uint64_t gte_unneeded[MAXBLOCK];
112   static u_int smrv[32]; // speculated MIPS register values
113   static u_int smrv_strong; // mask or regs that are likely to have correct values
114   static u_int smrv_weak; // same, but somewhat less likely
115   static u_int smrv_strong_next; // same, but after current insn executes
116   static u_int smrv_weak_next;
117   static int imm[MAXBLOCK];
118   static u_int ba[MAXBLOCK];
119   static char likely[MAXBLOCK];
120   static char is_ds[MAXBLOCK];
121   static char ooo[MAXBLOCK];
122   static uint64_t unneeded_reg[MAXBLOCK];
123   static uint64_t unneeded_reg_upper[MAXBLOCK];
124   static uint64_t branch_unneeded_reg[MAXBLOCK];
125   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
126   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
127   static uint64_t current_constmap[HOST_REGS];
128   static uint64_t constmap[MAXBLOCK][HOST_REGS];
129   static struct regstat regs[MAXBLOCK];
130   static struct regstat branch_regs[MAXBLOCK];
131   static signed char minimum_free_regs[MAXBLOCK];
132   static u_int needed_reg[MAXBLOCK];
133   static u_int wont_dirty[MAXBLOCK];
134   static u_int will_dirty[MAXBLOCK];
135   static int ccadj[MAXBLOCK];
136   static int slen;
137   static u_int instr_addr[MAXBLOCK];
138   static u_int link_addr[MAXBLOCK][3];
139   static int linkcount;
140   static u_int stubs[MAXBLOCK*3][8];
141   static int stubcount;
142   static u_int literals[1024][2];
143   static int literalcount;
144   static int is_delayslot;
145   static int cop1_usable;
146   static char shadow[1048576]  __attribute__((aligned(16)));
147   static void *copy;
148   static int expirep;
149   static u_int stop_after_jal;
150 #ifndef RAM_FIXED
151   static u_int ram_offset;
152 #else
153   static const u_int ram_offset=0;
154 #endif
155
156   int new_dynarec_hacks;
157   int new_dynarec_did_compile;
158   extern u_char restore_candidate[512];
159   extern int cycle_count;
160
161   /* registers that may be allocated */
162   /* 1-31 gpr */
163 #define HIREG 32 // hi
164 #define LOREG 33 // lo
165 #define FSREG 34 // FPU status (FCSR)
166 #define CSREG 35 // Coprocessor status
167 #define CCREG 36 // Cycle count
168 #define INVCP 37 // Pointer to invalid_code
169 //#define MMREG 38 // Pointer to memory_map
170 #define ROREG 39 // ram offset (if rdram!=0x80000000)
171 #define TEMPREG 40
172 #define FTEMP 40 // FPU temporary register
173 #define PTEMP 41 // Prefetch temporary register
174 //#define TLREG 42 // TLB mapping offset
175 #define RHASH 43 // Return address hash
176 #define RHTBL 44 // Return address hash table address
177 #define RTEMP 45 // JR/JALR address register
178 #define MAXREG 45
179 #define AGEN1 46 // Address generation temporary register
180 //#define AGEN2 47 // Address generation temporary register
181 //#define MGEN1 48 // Maptable address generation temporary register
182 //#define MGEN2 49 // Maptable address generation temporary register
183 #define BTREG 50 // Branch target temporary register
184
185   /* instruction types */
186 #define NOP 0     // No operation
187 #define LOAD 1    // Load
188 #define STORE 2   // Store
189 #define LOADLR 3  // Unaligned load
190 #define STORELR 4 // Unaligned store
191 #define MOV 5     // Move
192 #define ALU 6     // Arithmetic/logic
193 #define MULTDIV 7 // Multiply/divide
194 #define SHIFT 8   // Shift by register
195 #define SHIFTIMM 9// Shift by immediate
196 #define IMM16 10  // 16-bit immediate
197 #define RJUMP 11  // Unconditional jump to register
198 #define UJUMP 12  // Unconditional jump
199 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
200 #define SJUMP 14  // Conditional branch (regimm format)
201 #define COP0 15   // Coprocessor 0
202 #define COP1 16   // Coprocessor 1
203 #define C1LS 17   // Coprocessor 1 load/store
204 #define FJUMP 18  // Conditional branch (floating point)
205 #define FLOAT 19  // Floating point unit
206 #define FCONV 20  // Convert integer to float
207 #define FCOMP 21  // Floating point compare (sets FSREG)
208 #define SYSCALL 22// SYSCALL
209 #define OTHER 23  // Other
210 #define SPAN 24   // Branch/delay slot spans 2 pages
211 #define NI 25     // Not implemented
212 #define HLECALL 26// PCSX fake opcodes for HLE
213 #define COP2 27   // Coprocessor 2 move
214 #define C2LS 28   // Coprocessor 2 load/store
215 #define C2OP 29   // Coprocessor 2 operation
216 #define INTCALL 30// Call interpreter to handle rare corner cases
217
218   /* stubs */
219 #define CC_STUB 1
220 #define FP_STUB 2
221 #define LOADB_STUB 3
222 #define LOADH_STUB 4
223 #define LOADW_STUB 5
224 #define LOADD_STUB 6
225 #define LOADBU_STUB 7
226 #define LOADHU_STUB 8
227 #define STOREB_STUB 9
228 #define STOREH_STUB 10
229 #define STOREW_STUB 11
230 #define STORED_STUB 12
231 #define STORELR_STUB 13
232 #define INVCODE_STUB 14
233
234   /* branch codes */
235 #define TAKEN 1
236 #define NOTTAKEN 2
237 #define NULLDS 3
238
239 // asm linkage
240 int new_recompile_block(int addr);
241 void *get_addr_ht(u_int vaddr);
242 void invalidate_block(u_int block);
243 void invalidate_addr(u_int addr);
244 void remove_hash(int vaddr);
245 void dyna_linker();
246 void dyna_linker_ds();
247 void verify_code();
248 void verify_code_vm();
249 void verify_code_ds();
250 void cc_interrupt();
251 void fp_exception();
252 void fp_exception_ds();
253 void jump_syscall_hle();
254 void jump_hlecall();
255 void jump_intcall();
256 void new_dyna_leave();
257
258 // Needed by assembler
259 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
260 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
261 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
262 static void load_all_regs(signed char i_regmap[]);
263 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
264 static void load_regs_entry(int t);
265 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
266
267 static int verify_dirty(u_int *ptr);
268 static int get_final_value(int hr, int i, int *value);
269 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
270 static void add_to_linker(int addr,int target,int ext);
271
272 static int tracedebug=0;
273
274 //#define DEBUG_CYCLE_COUNT 1
275
276 #define NO_CYCLE_PENALTY_THR 12
277
int cycle_multiplier; // 100 for 1.0

// Scale a cycle count by cycle_multiplier/100, rounding the
// result away from zero (symmetric for negative inputs).
static int CLOCK_ADJUST(int x)
{
  int rounding = (x < 0) ? -50 : 50;
  return (x * cycle_multiplier + rounding) / 100;
}
285
286 static u_int get_page(u_int vaddr)
287 {
288   u_int page=vaddr&~0xe0000000;
289   if (page < 0x1000000)
290     page &= ~0x0e00000; // RAM mirrors
291   page>>=12;
292   if(page>2048) page=2048+(page&2047);
293   return page;
294 }
295
// no virtual mem in PCSX
static u_int get_vpage(u_int vaddr)
{
  // Without an MMU, the "virtual" page is just the physical page.
  return get_page(vaddr);
}
301
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Looks up compiled code for vaddr, restoring a dirty block if it still
// verifies; otherwise compiles a new block. On success the 2-way hash
// table bin is updated LRU-style (slot 0/1 = most recent, 2/3 = older).
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // First: clean, directly-enterable blocks for this page.
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Promote into the hash table, demoting the previous entry.
      u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      ht_bin[3]=ht_bin[1];
      ht_bin[2]=ht_bin[0];
      ht_bin[1]=(u_int)head->addr;
      ht_bin[0]=vaddr;
      return head->addr;
    }
    head=head->next;
  }
  // Second: dirty blocks that may be restorable if the source is unchanged.
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        // Mark the page so clean_blocks() can move it back to jump_in.
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(u_int)head->addr; // Replace existing entry
        }
        else
        {
          ht_bin[3]=ht_bin[1];
          ht_bin[2]=ht_bin[0];
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  // Not found anywhere: compile it now.
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
365 // Look up address in hash table first
366 void *get_addr_ht(u_int vaddr)
367 {
368   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
369   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
370   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
371   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
372   return get_addr(vaddr);
373 }
374
375 void clear_all_regs(signed char regmap[])
376 {
377   int hr;
378   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
379 }
380
381 signed char get_reg(signed char regmap[],int r)
382 {
383   int hr;
384   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
385   return -1;
386 }
387
388 // Find a register that is available for two consecutive cycles
389 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
390 {
391   int hr;
392   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
393   return -1;
394 }
395
396 int count_free_regs(signed char regmap[])
397 {
398   int count=0;
399   int hr;
400   for(hr=0;hr<HOST_REGS;hr++)
401   {
402     if(hr!=EXCLUDE_REG) {
403       if(regmap[hr]<0) count++;
404     }
405   }
406   return count;
407 }
408
409 void dirty_reg(struct regstat *cur,signed char reg)
410 {
411   int hr;
412   if(!reg) return;
413   for (hr=0;hr<HOST_REGS;hr++) {
414     if((cur->regmap[hr]&63)==reg) {
415       cur->dirty|=1<<hr;
416     }
417   }
418 }
419
420 // If we dirty the lower half of a 64 bit register which is now being
421 // sign-extended, we need to dump the upper half.
422 // Note: Do this only after completion of the instruction, because
423 // some instructions may need to read the full 64-bit value even if
424 // overwriting it (eg SLTI, DSRA32).
425 static void flush_dirty_uppers(struct regstat *cur)
426 {
427   int hr,reg;
428   for (hr=0;hr<HOST_REGS;hr++) {
429     if((cur->dirty>>hr)&1) {
430       reg=cur->regmap[hr];
431       if(reg>=64)
432         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
433     }
434   }
435 }
436
437 void set_const(struct regstat *cur,signed char reg,uint64_t value)
438 {
439   int hr;
440   if(!reg) return;
441   for (hr=0;hr<HOST_REGS;hr++) {
442     if(cur->regmap[hr]==reg) {
443       cur->isconst|=1<<hr;
444       current_constmap[hr]=value;
445     }
446     else if((cur->regmap[hr]^64)==reg) {
447       cur->isconst|=1<<hr;
448       current_constmap[hr]=value>>32;
449     }
450   }
451 }
452
453 void clear_const(struct regstat *cur,signed char reg)
454 {
455   int hr;
456   if(!reg) return;
457   for (hr=0;hr<HOST_REGS;hr++) {
458     if((cur->regmap[hr]&63)==reg) {
459       cur->isconst&=~(1<<hr);
460     }
461   }
462 }
463
464 int is_const(struct regstat *cur,signed char reg)
465 {
466   int hr;
467   if(reg<0) return 0;
468   if(!reg) return 1;
469   for (hr=0;hr<HOST_REGS;hr++) {
470     if((cur->regmap[hr]&63)==reg) {
471       return (cur->isconst>>hr)&1;
472     }
473   }
474   return 0;
475 }
476 uint64_t get_const(struct regstat *cur,signed char reg)
477 {
478   int hr;
479   if(!reg) return 0;
480   for (hr=0;hr<HOST_REGS;hr++) {
481     if(cur->regmap[hr]==reg) {
482       return current_constmap[hr];
483     }
484   }
485   SysPrintf("Unknown constant in r%d\n",reg);
486   exit(1);
487 }
488
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// On return, hsn[reg] holds the distance (in instructions) to the nearest
// upcoming use of reg; smaller = needed sooner.
// NOTE(review): preferred_reg is currently unused here (see TODO below).
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Determine the scan window: up to 9 insns, clipped at the block end.
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // Walk backwards so each hsn[] entry ends up holding the NEAREST use.
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      // Branches need the cycle count; remember the nearest branch in b.
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // j+b+2 accounts for the distance to the branch plus its delay slot.
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
577
// We only want to allocate registers if we're going to use them again soon
// Returns 1 if MIPS reg r is read within the next few instructions after i.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to next use; 10 means "not needed in the window"

  // If the previous insn unconditionally leaves the block, nothing is needed.
  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Determine the scan window (stop at block end, unconditional jumps,
  // or syscall-like instructions that end execution of the block).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards so rn ends up as the distance to the NEAREST use.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10; // a later insn makes r unneeded
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b; // only used by the disabled branch-following code above
  return 0;
}
640
// Try to match register allocations at the end of a loop with those
// at the beginning
// Returns the host register used for r at the target of a backward branch
// within the scan window, or hr (the caller's choice) if none applies.
int loop_reg(int i, int r, int hr)
{
  int j,k;
  // Determine the forward scan window (same scheme as lsn()).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  k=0;
  // Include the branch's delay slot by starting one insn earlier.
  if(i>0){
    if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
      k--;
  }
  for(;k<j;k++)
  {
    // If r becomes unneeded inside the window, keep the caller's choice.
    if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
    // NOTE(review): for r>64 this shifts a 64-bit value by more than 63,
    // which is undefined behavior in C — presumably (r-64) or (r&63) was
    // intended; confirm against upstream before changing.
    if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
    if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
    {
      // Backward branch into this block: reuse the allocation at its target.
      if(ba[i+k]>=start && ba[i+k]<(start+i*4))
      {
        int t=(ba[i+k]-start)>>2;
        int reg=get_reg(regs[t].regmap_entry,r);
        if(reg>=0) return reg;
        //reg=get_reg(regs[t+1].regmap_entry,r);
        //if(reg>=0) return reg;
      }
    }
  }
  return hr;
}
682
683
684 // Allocate every register, preserving source/target regs
685 void alloc_all(struct regstat *cur,int i)
686 {
687   int hr;
688
689   for(hr=0;hr<HOST_REGS;hr++) {
690     if(hr!=EXCLUDE_REG) {
691       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
692          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
693       {
694         cur->regmap[hr]=-1;
695         cur->dirty&=~(1<<hr);
696       }
697       // Don't need zeros
698       if((cur->regmap[hr]&63)==0)
699       {
700         cur->regmap[hr]=-1;
701         cur->dirty&=~(1<<hr);
702       }
703     }
704   }
705 }
706
707 #ifdef __i386__
708 #include "assem_x86.c"
709 #endif
710 #ifdef __x86_64__
711 #include "assem_x64.c"
712 #endif
713 #ifdef __arm__
714 #include "assem_arm.c"
715 #endif
716
717 // Add virtual address mapping to linked list
718 void ll_add(struct ll_entry **head,int vaddr,void *addr)
719 {
720   struct ll_entry *new_entry;
721   new_entry=malloc(sizeof(struct ll_entry));
722   assert(new_entry!=NULL);
723   new_entry->vaddr=vaddr;
724   new_entry->reg_sv_flags=0;
725   new_entry->addr=addr;
726   new_entry->next=*head;
727   *head=new_entry;
728 }
729
730 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
731 {
732   ll_add(head,vaddr,addr);
733   (*head)->reg_sv_flags=reg_sv_flags;
734 }
735
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Returns the compiled code pointer for vaddr, or 0 if it must be
// (re)compiled. Also opportunistically refreshes hash table entries.
void *check_addr(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) {
    // Far enough from the translation-cache expiry point?
    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
  }
  if(ht_bin[2]==vaddr) {
    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
  }
  // Fall back to the per-page linked list of clean entry points.
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        // Update existing entry with current address
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr;
          return head->addr;
        }
        if(ht_bin[2]==vaddr) {
          ht_bin[3]=(int)head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
781
782 void remove_hash(int vaddr)
783 {
784   //printf("remove hash: %x\n",vaddr);
785   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
786   if(ht_bin[2]==vaddr) {
787     ht_bin[2]=ht_bin[3]=-1;
788   }
789   if(ht_bin[0]==vaddr) {
790     ht_bin[0]=ht_bin[2];
791     ht_bin[1]=ht_bin[3];
792     ht_bin[2]=ht_bin[3]=-1;
793   }
794 }
795
796 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
797 {
798   struct ll_entry *next;
799   while(*head) {
800     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
801        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
802     {
803       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
804       remove_hash((*head)->vaddr);
805       next=(*head)->next;
806       free(*head);
807       *head=next;
808     }
809     else
810     {
811       head=&((*head)->next);
812     }
813   }
814 }
815
816 // Remove all entries from linked list
817 void ll_clear(struct ll_entry **head)
818 {
819   struct ll_entry *cur;
820   struct ll_entry *next;
821   if((cur=*head)) {
822     *head=0;
823     while(cur) {
824       next=cur->next;
825       free(cur);
826       cur=next;
827     }
828   }
829 }
830
// Dereference the pointers and remove if it matches
// For each list entry, if the jump it patched points into the region
// addr>>shift, redirect that jump back to its stub (kill_pointer) so
// the target gets re-resolved; on ARM, mark the patched code range as
// needing an icache flush.
void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
{
  while(head) {
    int ptr=get_pointer(head->addr); // where the patched jump points now
    inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
      u_int host_addr=(u_int)kill_pointer(head->addr);
      #ifdef __arm__
        // One bit per 4K of translation cache, grouped 32 bits per word.
        needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
      #endif
    }
    head=head->next;
  }
}
849
850 // This is called when we write to a compiled block (see do_invstub)
851 void invalidate_page(u_int page)
852 {
853   struct ll_entry *head;
854   struct ll_entry *next;
855   head=jump_in[page];
856   jump_in[page]=0;
857   while(head!=NULL) {
858     inv_debug("INVALIDATE: %x\n",head->vaddr);
859     remove_hash(head->vaddr);
860     next=head->next;
861     free(head);
862     head=next;
863   }
864   head=jump_out[page];
865   jump_out[page]=0;
866   while(head!=NULL) {
867     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
868     u_int host_addr=(u_int)kill_pointer(head->addr);
869     #ifdef __arm__
870       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
871     #endif
872     next=head->next;
873     free(head);
874     head=next;
875   }
876 }
877
878 static void invalidate_block_range(u_int block, u_int first, u_int last)
879 {
880   u_int page=get_page(block<<12);
881   //printf("first=%d last=%d\n",first,last);
882   invalidate_page(page);
883   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
884   assert(last<page+5);
885   // Invalidate the adjacent pages if a block crosses a 4K boundary
886   while(first<page) {
887     invalidate_page(first);
888     first++;
889   }
890   for(first=page+1;first<last;first++) {
891     invalidate_page(first);
892   }
893   #ifdef __arm__
894     do_clear_cache();
895   #endif
896
897   // Don't trap writes
898   invalid_code[block]=1;
899
900   #ifdef USE_MINI_HT
901   memset(mini_ht,-1,sizeof(mini_ht));
902   #endif
903 }
904
// Invalidate the compiled code for one 4K MIPS page, widening the
// range to every RAM page touched by dirty blocks that overlap it.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  // Scan dirty blocks for this page and expand [first,last] to cover
  // every RAM page their source code spans.
  while(head!=NULL) {
    u_int start,end;
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      get_bounds((int)head->addr,&start,&end); // source range of the block
      //printf("start: %x end: %x\n",start,end);
      if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
        if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
          if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
          if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
932
// Invalidate compiled code covering a single written address.
// For RAM, also maintains [inv_code_start, inv_code_end]: a range known
// to contain no compiled code, letting callers skip future lookups.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask); // canonical KSEG0 form
    int pg1;
    // Start with the whole 4K page as the candidate no-code range.
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_int start,end;
        get_bounds((int)head->addr,&start,&end); // source range of block
        if(ram_offset) {
          start-=ram_offset;
          end-=ram_offset;
        }
        if(start<=addr_main&&addr_main<end) {
          // Block covers the written address: remember the hit range.
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block is above: shrink the no-code range from the top.
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block is below: shrink the no-code range from the bottom.
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      // At least one compiled block covers addr: invalidate it.
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No code here: publish the no-code range in the caller's segment.
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  // Non-RAM address: fall back to whole-page invalidation.
  invalidate_block(addr>>12);
}
990
991 // This is called when loading a save state.
992 // Anything could have changed, so invalidate everything.
993 void invalidate_all_pages()
994 {
995   u_int page;
996   for(page=0;page<4096;page++)
997     invalidate_page(page);
998   for(page=0;page<1048576;page++)
999     if(!invalid_code[page]) {
1000       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1001       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1002     }
1003   #ifdef USE_MINI_HT
1004   memset(mini_ht,-1,sizeof(mini_ht));
1005   #endif
1006 }
1007
// Add an entry to jump_out after making a link
// src is the address of the branch inside the translation cache, vaddr
// the guest target it was linked to.  Recorded so the link can be
// severed if the target block is later invalidated.
void add_link(u_int vaddr,void *src)
{
  u_int page=get_page(vaddr);
  inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
  int *ptr=(int *)(src+4);
  // NOTE(review): sanity check on the word following the branch -
  // looks like an ARM pc-relative ldr encoding (0x059fxxxx); confirm
  assert((*ptr&0x0fff0000)==0x059f0000);
  (void)ptr;
  ll_add(jump_out+page,vaddr,src);
  //int ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
}
1020
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        u_int start,end;
        if(verify_dirty(head->addr)) {
          //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
          u_int i;
          u_int inv=0;
          get_bounds((int)head->addr,&start,&end);
          if(start-(u_int)rdram<RAM_SIZE) {
            // source lies in RAM: every page it reads from must still be valid
            for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // outside the RAM range - never restore
            inv=1;
          }
          if(!inv) {
            void * clean_addr=(void *)get_clean_addr((int)head->addr);
            // the clean copy must also not be about to expire from the cache
            if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              // also repoint the hash table if it referenced the dirty copy
              u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
              if(ht_bin[0]==head->vaddr) {
                ht_bin[1]=(u_int)clean_addr; // Replace existing entry
              }
              if(ht_bin[2]==head->vaddr) {
                ht_bin[3]=(u_int)clean_addr; // Replace existing entry
              }
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1071
1072
1073 void mov_alloc(struct regstat *current,int i)
1074 {
1075   // Note: Don't need to actually alloc the source registers
1076   if((~current->is32>>rs1[i])&1) {
1077     //alloc_reg64(current,i,rs1[i]);
1078     alloc_reg64(current,i,rt1[i]);
1079     current->is32&=~(1LL<<rt1[i]);
1080   } else {
1081     //alloc_reg(current,i,rs1[i]);
1082     alloc_reg(current,i,rt1[i]);
1083     current->is32|=(1LL<<rt1[i]);
1084   }
1085   clear_const(current,rs1[i]);
1086   clear_const(current,rt1[i]);
1087   dirty_reg(current,rt1[i]);
1088 }
1089
// Register allocation for shift-by-immediate instructions
// (SLL/SRL/SRA and the 64-bit DSLL/DSRL/DSRA family).
void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
      if(is_const(current,rs1[i])) {
        // constant propagation through the shift
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      // NOTE(review): rs1 is not guarded against r0 here, unlike the
      // cases above - presumably rs1!=0 is guaranteed for DSRL32; confirm
      alloc_reg64(current,i,rs1[i]);
      if(imm[i]==32) {
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      } else {
        alloc_reg(current,i,rt1[i]);
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
}
1157
// Register allocation for shift-by-register instructions
// (SLLV/SRLV/SRAV and 64-bit DSLLV/DSRLV/DSRAV).
void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) {
        // shift amount aliases the target - need a scratch register
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
      current->is32|=1LL<<rt1[i];
    } else { // DSLLV/DSRLV/DSRAV
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
      {
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1188
// Register allocation for three-operand ALU instructions
// (ADD/SUB, SLT/SLTU, AND/OR/XOR/NOR, and the 64-bit DADD/DSUB family).
// Tracks the 32/64-bit width of the result in current->is32.
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // one source is r0: only allocate a source if it is used again later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    // 32-bit op: result is always 32-bit
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        // at least one source is 64-bit: compare full width
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    // result is 0 or 1, always 32-bit
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        // a 64-bit source makes the result 64-bit
        if(!((current->uu>>rt1[i])&1)) {
          alloc_reg64(current,i,rt1[i]);
        }
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          if(rs1[i]&&rs2[i]) {
            alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rs2[i]);
          }
          else
          {
            // Is it really worth it to keep 64-bit values in registers?
            #ifdef NATIVE_64BIT
            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
            #endif
          }
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        // full 64-bit op only if the upper half of the result is needed
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          alloc_reg64(current,i,rs1[i]);
          alloc_reg64(current,i,rs2[i]);
          alloc_reg64(current,i,rt1[i]);
        } else {
          alloc_reg(current,i,rs1[i]);
          alloc_reg(current,i,rs2[i]);
          alloc_reg(current,i,rt1[i]);
        }
      }
      else {
        alloc_reg(current,i,rt1[i]);
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          // DADD used as move, or zeroing
          // If we have a 64-bit source, then make the target 64 bits too
          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rt1[i]);
          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
          if(opcode2[i]>=0x2e&&rs2[i]) {
            // DSUB used as negation - 64-bit result
            // If we have a 32-bit register, extend it to 64 bits
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
        }
      }
      // width of the result: 64-bit unless the (single) source was 32-bit
      if(rs1[i]&&rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
      } else if(rs1[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs1[i])&1)
          current->is32|=1LL<<rt1[i];
      } else if(rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs2[i])&1)
          current->is32|=1LL<<rt1[i];
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1310
// Register allocation for 16-bit-immediate instructions
// (DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI, ADDI/ADDIU, LUI).
// Performs constant propagation where the source value is known.
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    current->is32&=~(1LL<<rt1[i]);
    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
      // TODO: Could preserve the 32-bit flag if the immediate is zero
      alloc_reg64(current,i,rt1[i]);
      alloc_reg64(current,i,rs1[i]);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // need the full source width for the comparison; result is 0/1
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      // ORI/XORI with a 64-bit source keeps the upper bits
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    if(is_const(current,rs1[i])) {
      // constant propagation through the bitwise op
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}
1363
// Register allocation for load instructions (LB/LH/LW/LWU/LD and the
// unaligned LWL/LWR/LDL/LDR variants).
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    // target register is live - do the real load
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
}
1414
// Register allocation for store instructions (SB/SH/SW/SD and the
// unaligned SWL/SWR/SDL/SDR variants).
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1436
// Register allocation for COP1 loads/stores (LWC1/SWC1/LDC1/SDC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike c2ls_alloc, minimum_free_regs is not set here -
  // confirm whether that is intentional
}
1455
1456 void c2ls_alloc(struct regstat *current,int i)
1457 {
1458   clear_const(current,rt1[i]);
1459   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1460   alloc_reg(current,i,FTEMP);
1461   #if defined(HOST_IMM8)
1462   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1463   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1464     alloc_reg(current,i,INVCP);
1465   #endif
1466   // We need a temporary register for address generation
1467   alloc_reg_temp(current,i,-1);
1468   minimum_free_regs[i]=1;
1469 }
1470
#ifndef multdiv_alloc
// Register allocation for multiply/divide instructions.  Results are
// written to HI and LO.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // mark HI/LO as needed so they get allocated
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      current->uu&=~(1LL<<HIREG);
      current->uu&=~(1LL<<LOREG);
      alloc_reg64(current,i,HIREG);
      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
      alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rs2[i]);
      // 64-bit multiply/divide claims every host register
      alloc_all(current,i);
      current->is32&=~(1LL<<HIREG);
      current->is32&=~(1LL<<LOREG);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1531
// Register allocation for COP0 instructions (MFC0/MTC0 and the
// TLB/ERET group).  alloc_all is used in every path, so no host
// registers may remain live across these instructions.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1565
// Register allocation for coprocessor register moves
// (MFC1/DMFC1/CFC1 and MTC1/DMTC1/CTC1; also used for COP2).
void cop1_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      if(opcode2[i]==1) {
        alloc_reg64(current,i,rt1[i]); // DMFC1
        current->is32&=~(1LL<<rt1[i]);
      }else{
        alloc_reg(current,i,rt1[i]); // MFC1/CFC1
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      if(opcode2[i]==5)
        alloc_reg64(current,i,rs1[i]); // DMTC1
      else
        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
      alloc_reg_temp(current,i,-1);
    }
    else {
      // source is r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
      alloc_reg_temp(current,i,-1);
    }
  }
  minimum_free_regs[i]=1;
}
1602 void fconv_alloc(struct regstat *current,int i)
1603 {
1604   alloc_reg(current,i,CSREG); // Load status
1605   alloc_reg_temp(current,i,-1);
1606   minimum_free_regs[i]=1;
1607 }
1608 void float_alloc(struct regstat *current,int i)
1609 {
1610   alloc_reg(current,i,CSREG); // Load status
1611   alloc_reg_temp(current,i,-1);
1612   minimum_free_regs[i]=1;
1613 }
// Register allocation for GTE operations: a single scratch register is
// sufficient (operands live in the coprocessor register file).
void c2op_alloc(struct regstat *current,int i)
{
  alloc_reg_temp(current,i,-1);
}
1618 void fcomp_alloc(struct regstat *current,int i)
1619 {
1620   alloc_reg(current,i,CSREG); // Load status
1621   alloc_reg(current,i,FSREG); // Load flags
1622   dirty_reg(current,FSREG); // Flag will be modified
1623   alloc_reg_temp(current,i,-1);
1624   minimum_free_regs[i]=1;
1625 }
1626
1627 void syscall_alloc(struct regstat *current,int i)
1628 {
1629   alloc_cc(current,i);
1630   dirty_reg(current,CCREG);
1631   alloc_all(current,i);
1632   minimum_free_regs[i]=HOST_REGS;
1633   current->isconst=0;
1634 }
1635
1636 void delayslot_alloc(struct regstat *current,int i)
1637 {
1638   switch(itype[i]) {
1639     case UJUMP:
1640     case CJUMP:
1641     case SJUMP:
1642     case RJUMP:
1643     case FJUMP:
1644     case SYSCALL:
1645     case HLECALL:
1646     case SPAN:
1647       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1648       SysPrintf("Disabled speculative precompilation\n");
1649       stop_after_jal=1;
1650       break;
1651     case IMM16:
1652       imm16_alloc(current,i);
1653       break;
1654     case LOAD:
1655     case LOADLR:
1656       load_alloc(current,i);
1657       break;
1658     case STORE:
1659     case STORELR:
1660       store_alloc(current,i);
1661       break;
1662     case ALU:
1663       alu_alloc(current,i);
1664       break;
1665     case SHIFT:
1666       shift_alloc(current,i);
1667       break;
1668     case MULTDIV:
1669       multdiv_alloc(current,i);
1670       break;
1671     case SHIFTIMM:
1672       shiftimm_alloc(current,i);
1673       break;
1674     case MOV:
1675       mov_alloc(current,i);
1676       break;
1677     case COP0:
1678       cop0_alloc(current,i);
1679       break;
1680     case COP1:
1681     case COP2:
1682       cop1_alloc(current,i);
1683       break;
1684     case C1LS:
1685       c1ls_alloc(current,i);
1686       break;
1687     case C2LS:
1688       c2ls_alloc(current,i);
1689       break;
1690     case FCONV:
1691       fconv_alloc(current,i);
1692       break;
1693     case FLOAT:
1694       float_alloc(current,i);
1695       break;
1696     case FCOMP:
1697       fcomp_alloc(current,i);
1698       break;
1699     case C2OP:
1700       c2op_alloc(current,i);
1701       break;
1702   }
1703 }
1704
// Special case where a branch and delay slot span two pages in virtual memory
// Everything is flushed and all host registers claimed; additionally
// the registers the branch itself reads/writes are allocated.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // link register
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      // 64-bit comparison needs the upper halves too
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
    }
  }
  else
  if(opcode[i]==0x11) // BC1
  {
    alloc_reg(current,i,FSREG);
    alloc_reg(current,i,CSREG);
  }
  //else ...
}
1755
1756 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1757 {
1758   stubs[stubcount][0]=type;
1759   stubs[stubcount][1]=addr;
1760   stubs[stubcount][2]=retaddr;
1761   stubs[stubcount][3]=a;
1762   stubs[stubcount][4]=b;
1763   stubs[stubcount][5]=c;
1764   stubs[stubcount][6]=d;
1765   stubs[stubcount][7]=e;
1766   stubcount++;
1767 }
1768
1769 // Write out a single register
1770 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1771 {
1772   int hr;
1773   for(hr=0;hr<HOST_REGS;hr++) {
1774     if(hr!=EXCLUDE_REG) {
1775       if((regmap[hr]&63)==r) {
1776         if((dirty>>hr)&1) {
1777           if(regmap[hr]<64) {
1778             emit_storereg(r,hr);
1779           }else{
1780             emit_storereg(r|64,hr);
1781           }
1782         }
1783       }
1784     }
1785   }
1786 }
1787
1788 int mchecksum()
1789 {
1790   //if(!tracedebug) return 0;
1791   int i;
1792   int sum=0;
1793   for(i=0;i<2097152;i++) {
1794     unsigned int temp=sum;
1795     sum<<=1;
1796     sum|=(~temp)>>31;
1797     sum^=((u_int *)rdram)[i];
1798   }
1799   return sum;
1800 }
1801 int rchecksum()
1802 {
1803   int i;
1804   int sum=0;
1805   for(i=0;i<64;i++)
1806     sum^=((u_int *)reg)[i];
1807   return sum;
1808 }
1809 void rlist()
1810 {
1811   int i;
1812   printf("TRACE: ");
1813   for(i=0;i<32;i++)
1814     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1815   printf("\n");
1816 }
1817
1818 void enabletrace()
1819 {
1820   tracedebug=1;
1821 }
1822
// Debug hook called from generated code: dumps a RAM checksum, the
// register list and some raw stack words when Count is in a hard-coded
// window.  Normally compiled in but effectively disabled.
// NOTE(review): the (&i)[-1]/(&j)[N] stack peeks are non-portable
// debug-only hacks (undefined behavior by the C standard).
void memdebug(int i)
{
  //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
  //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
  //rlist();
  //if(tracedebug) {
  //if(Count>=-2084597794) {
  if((signed int)Count>=-2084597794&&(signed int)Count<0) {
  //if(0) {
    printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
    //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
    //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
    rlist();
    #ifdef __i386__
    printf("TRACE: %x\n",(&i)[-1]);
    #endif
    #ifdef __arm__
    int j;
    printf("TRACE: %x \n",(&j)[10]);
    printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
    #endif
    //fflush(stdout);
  }
  //printf("TRACE: %x\n",(&i)[-1]);
}
1848
1849 void alu_assemble(int i,struct regstat *i_regs)
1850 {
1851   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1852     if(rt1[i]) {
1853       signed char s1,s2,t;
1854       t=get_reg(i_regs->regmap,rt1[i]);
1855       if(t>=0) {
1856         s1=get_reg(i_regs->regmap,rs1[i]);
1857         s2=get_reg(i_regs->regmap,rs2[i]);
1858         if(rs1[i]&&rs2[i]) {
1859           assert(s1>=0);
1860           assert(s2>=0);
1861           if(opcode2[i]&2) emit_sub(s1,s2,t);
1862           else emit_add(s1,s2,t);
1863         }
1864         else if(rs1[i]) {
1865           if(s1>=0) emit_mov(s1,t);
1866           else emit_loadreg(rs1[i],t);
1867         }
1868         else if(rs2[i]) {
1869           if(s2>=0) {
1870             if(opcode2[i]&2) emit_neg(s2,t);
1871             else emit_mov(s2,t);
1872           }
1873           else {
1874             emit_loadreg(rs2[i],t);
1875             if(opcode2[i]&2) emit_neg(t,t);
1876           }
1877         }
1878         else emit_zeroreg(t);
1879       }
1880     }
1881   }
1882   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1883     if(rt1[i]) {
1884       signed char s1l,s2l,s1h,s2h,tl,th;
1885       tl=get_reg(i_regs->regmap,rt1[i]);
1886       th=get_reg(i_regs->regmap,rt1[i]|64);
1887       if(tl>=0) {
1888         s1l=get_reg(i_regs->regmap,rs1[i]);
1889         s2l=get_reg(i_regs->regmap,rs2[i]);
1890         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1891         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1892         if(rs1[i]&&rs2[i]) {
1893           assert(s1l>=0);
1894           assert(s2l>=0);
1895           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1896           else emit_adds(s1l,s2l,tl);
1897           if(th>=0) {
1898             #ifdef INVERTED_CARRY
1899             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1900             #else
1901             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1902             #endif
1903             else emit_add(s1h,s2h,th);
1904           }
1905         }
1906         else if(rs1[i]) {
1907           if(s1l>=0) emit_mov(s1l,tl);
1908           else emit_loadreg(rs1[i],tl);
1909           if(th>=0) {
1910             if(s1h>=0) emit_mov(s1h,th);
1911             else emit_loadreg(rs1[i]|64,th);
1912           }
1913         }
1914         else if(rs2[i]) {
1915           if(s2l>=0) {
1916             if(opcode2[i]&2) emit_negs(s2l,tl);
1917             else emit_mov(s2l,tl);
1918           }
1919           else {
1920             emit_loadreg(rs2[i],tl);
1921             if(opcode2[i]&2) emit_negs(tl,tl);
1922           }
1923           if(th>=0) {
1924             #ifdef INVERTED_CARRY
1925             if(s2h>=0) emit_mov(s2h,th);
1926             else emit_loadreg(rs2[i]|64,th);
1927             if(opcode2[i]&2) {
1928               emit_adcimm(-1,th); // x86 has inverted carry flag
1929               emit_not(th,th);
1930             }
1931             #else
1932             if(opcode2[i]&2) {
1933               if(s2h>=0) emit_rscimm(s2h,0,th);
1934               else {
1935                 emit_loadreg(rs2[i]|64,th);
1936                 emit_rscimm(th,0,th);
1937               }
1938             }else{
1939               if(s2h>=0) emit_mov(s2h,th);
1940               else emit_loadreg(rs2[i]|64,th);
1941             }
1942             #endif
1943           }
1944         }
1945         else {
1946           emit_zeroreg(tl);
1947           if(th>=0) emit_zeroreg(th);
1948         }
1949       }
1950     }
1951   }
1952   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1953     if(rt1[i]) {
1954       signed char s1l,s1h,s2l,s2h,t;
1955       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1956       {
1957         t=get_reg(i_regs->regmap,rt1[i]);
1958         //assert(t>=0);
1959         if(t>=0) {
1960           s1l=get_reg(i_regs->regmap,rs1[i]);
1961           s1h=get_reg(i_regs->regmap,rs1[i]|64);
1962           s2l=get_reg(i_regs->regmap,rs2[i]);
1963           s2h=get_reg(i_regs->regmap,rs2[i]|64);
1964           if(rs2[i]==0) // rx<r0
1965           {
1966             assert(s1h>=0);
1967             if(opcode2[i]==0x2a) // SLT
1968               emit_shrimm(s1h,31,t);
1969             else // SLTU (unsigned can not be less than zero)
1970               emit_zeroreg(t);
1971           }
1972           else if(rs1[i]==0) // r0<rx
1973           {
1974             assert(s2h>=0);
1975             if(opcode2[i]==0x2a) // SLT
1976               emit_set_gz64_32(s2h,s2l,t);
1977             else // SLTU (set if not zero)
1978               emit_set_nz64_32(s2h,s2l,t);
1979           }
1980           else {
1981             assert(s1l>=0);assert(s1h>=0);
1982             assert(s2l>=0);assert(s2h>=0);
1983             if(opcode2[i]==0x2a) // SLT
1984               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
1985             else // SLTU
1986               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
1987           }
1988         }
1989       } else {
1990         t=get_reg(i_regs->regmap,rt1[i]);
1991         //assert(t>=0);
1992         if(t>=0) {
1993           s1l=get_reg(i_regs->regmap,rs1[i]);
1994           s2l=get_reg(i_regs->regmap,rs2[i]);
1995           if(rs2[i]==0) // rx<r0
1996           {
1997             assert(s1l>=0);
1998             if(opcode2[i]==0x2a) // SLT
1999               emit_shrimm(s1l,31,t);
2000             else // SLTU (unsigned can not be less than zero)
2001               emit_zeroreg(t);
2002           }
2003           else if(rs1[i]==0) // r0<rx
2004           {
2005             assert(s2l>=0);
2006             if(opcode2[i]==0x2a) // SLT
2007               emit_set_gz32(s2l,t);
2008             else // SLTU (set if not zero)
2009               emit_set_nz32(s2l,t);
2010           }
2011           else{
2012             assert(s1l>=0);assert(s2l>=0);
2013             if(opcode2[i]==0x2a) // SLT
2014               emit_set_if_less32(s1l,s2l,t);
2015             else // SLTU
2016               emit_set_if_carry32(s1l,s2l,t);
2017           }
2018         }
2019       }
2020     }
2021   }
2022   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2023     if(rt1[i]) {
2024       signed char s1l,s1h,s2l,s2h,th,tl;
2025       tl=get_reg(i_regs->regmap,rt1[i]);
2026       th=get_reg(i_regs->regmap,rt1[i]|64);
2027       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2028       {
2029         assert(tl>=0);
2030         if(tl>=0) {
2031           s1l=get_reg(i_regs->regmap,rs1[i]);
2032           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2033           s2l=get_reg(i_regs->regmap,rs2[i]);
2034           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2035           if(rs1[i]&&rs2[i]) {
2036             assert(s1l>=0);assert(s1h>=0);
2037             assert(s2l>=0);assert(s2h>=0);
2038             if(opcode2[i]==0x24) { // AND
2039               emit_and(s1l,s2l,tl);
2040               emit_and(s1h,s2h,th);
2041             } else
2042             if(opcode2[i]==0x25) { // OR
2043               emit_or(s1l,s2l,tl);
2044               emit_or(s1h,s2h,th);
2045             } else
2046             if(opcode2[i]==0x26) { // XOR
2047               emit_xor(s1l,s2l,tl);
2048               emit_xor(s1h,s2h,th);
2049             } else
2050             if(opcode2[i]==0x27) { // NOR
2051               emit_or(s1l,s2l,tl);
2052               emit_or(s1h,s2h,th);
2053               emit_not(tl,tl);
2054               emit_not(th,th);
2055             }
2056           }
2057           else
2058           {
2059             if(opcode2[i]==0x24) { // AND
2060               emit_zeroreg(tl);
2061               emit_zeroreg(th);
2062             } else
2063             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2064               if(rs1[i]){
2065                 if(s1l>=0) emit_mov(s1l,tl);
2066                 else emit_loadreg(rs1[i],tl);
2067                 if(s1h>=0) emit_mov(s1h,th);
2068                 else emit_loadreg(rs1[i]|64,th);
2069               }
2070               else
2071               if(rs2[i]){
2072                 if(s2l>=0) emit_mov(s2l,tl);
2073                 else emit_loadreg(rs2[i],tl);
2074                 if(s2h>=0) emit_mov(s2h,th);
2075                 else emit_loadreg(rs2[i]|64,th);
2076               }
2077               else{
2078                 emit_zeroreg(tl);
2079                 emit_zeroreg(th);
2080               }
2081             } else
2082             if(opcode2[i]==0x27) { // NOR
2083               if(rs1[i]){
2084                 if(s1l>=0) emit_not(s1l,tl);
2085                 else{
2086                   emit_loadreg(rs1[i],tl);
2087                   emit_not(tl,tl);
2088                 }
2089                 if(s1h>=0) emit_not(s1h,th);
2090                 else{
2091                   emit_loadreg(rs1[i]|64,th);
2092                   emit_not(th,th);
2093                 }
2094               }
2095               else
2096               if(rs2[i]){
2097                 if(s2l>=0) emit_not(s2l,tl);
2098                 else{
2099                   emit_loadreg(rs2[i],tl);
2100                   emit_not(tl,tl);
2101                 }
2102                 if(s2h>=0) emit_not(s2h,th);
2103                 else{
2104                   emit_loadreg(rs2[i]|64,th);
2105                   emit_not(th,th);
2106                 }
2107               }
2108               else {
2109                 emit_movimm(-1,tl);
2110                 emit_movimm(-1,th);
2111               }
2112             }
2113           }
2114         }
2115       }
2116       else
2117       {
2118         // 32 bit
2119         if(tl>=0) {
2120           s1l=get_reg(i_regs->regmap,rs1[i]);
2121           s2l=get_reg(i_regs->regmap,rs2[i]);
2122           if(rs1[i]&&rs2[i]) {
2123             assert(s1l>=0);
2124             assert(s2l>=0);
2125             if(opcode2[i]==0x24) { // AND
2126               emit_and(s1l,s2l,tl);
2127             } else
2128             if(opcode2[i]==0x25) { // OR
2129               emit_or(s1l,s2l,tl);
2130             } else
2131             if(opcode2[i]==0x26) { // XOR
2132               emit_xor(s1l,s2l,tl);
2133             } else
2134             if(opcode2[i]==0x27) { // NOR
2135               emit_or(s1l,s2l,tl);
2136               emit_not(tl,tl);
2137             }
2138           }
2139           else
2140           {
2141             if(opcode2[i]==0x24) { // AND
2142               emit_zeroreg(tl);
2143             } else
2144             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2145               if(rs1[i]){
2146                 if(s1l>=0) emit_mov(s1l,tl);
2147                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2148               }
2149               else
2150               if(rs2[i]){
2151                 if(s2l>=0) emit_mov(s2l,tl);
2152                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2153               }
2154               else emit_zeroreg(tl);
2155             } else
2156             if(opcode2[i]==0x27) { // NOR
2157               if(rs1[i]){
2158                 if(s1l>=0) emit_not(s1l,tl);
2159                 else {
2160                   emit_loadreg(rs1[i],tl);
2161                   emit_not(tl,tl);
2162                 }
2163               }
2164               else
2165               if(rs2[i]){
2166                 if(s2l>=0) emit_not(s2l,tl);
2167                 else {
2168                   emit_loadreg(rs2[i],tl);
2169                   emit_not(tl,tl);
2170                 }
2171               }
2172               else emit_movimm(-1,tl);
2173             }
2174           }
2175         }
2176       }
2177     }
2178   }
2179 }
2180
// Assemble MIPS I-type (16-bit immediate) ALU instructions:
// LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI.
//   i      - index into the decoded-instruction arrays (opcode[], rs1[],
//            rt1[], imm[], constmap[], ...)
//   i_regs - register-allocation state for this instruction
// Writes to r0 (rt1[i]==0) are skipped throughout, and emission is also
// skipped when the target host register is already flagged as constant
// (isconst bit), since the value is materialized elsewhere.
// Lookups with |64 select the host register holding the upper 32 bits
// of the 64-bit guest register; a negative get_reg() result means the
// value is not allocated to a host register.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]); // host reg for rt, or <0 if unallocated
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t); // rt = imm << 16
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // Source is spilled: reload it into t first, unless t
              // already held rs1 on entry to this instruction.
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // Source is a known constant: fold the add at assembly time.
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1==r0: result is just the (sign-extended) immediate.
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            // Full 64-bit add with carry into the upper word.
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            // Upper half of the result isn't live: 32-bit add suffices.
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          // rs1==r0: load the sign-extended immediate
          // (upper word is all sign bits of imm).
          emit_movimm(imm[i],tl);
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          // If the upper half isn't allocated, the source must be known
          // to be 32-bit (was32); otherwise a 64-bit compare is needed.
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            // 32-bit compare
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI: (0 < imm) is decidable at assembly time
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU: (0 < imm) unsigned, i.e. true iff imm is nonzero
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                // Fold AND with a known-constant source at assembly time.
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl); // rs1==r0: AND with zero is zero
          // The immediate is zero-extended, so the upper word is always 0.
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          // ORI/XORI: the zero-extended immediate only affects the low
          // word, so the upper word is copied from the source unchanged.
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(th>=0) {
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // rs1==r0: OR/XOR with zero yields the immediate itself.
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
2365
// Assemble shift-by-immediate instructions:
//   SLL/SRL/SRA          (opcode2 0x00/0x02/0x03) - 32-bit shifts
//   DSLL/DSRL/DSRA       (opcode2 0x38/0x3a/0x3b) - 64-bit shifts
//   DSLL32/DSRL32/DSRA32 (opcode2 0x3c/0x3e/0x3f) - 64-bit shifts by >=32
//   i      - index into the decoded-instruction arrays
//   i_regs - register-allocation state for this instruction
// Writes to r0 are skipped, a shift of r0 produces zero, and a shift by
// zero degenerates to a plain register move.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0&&!((i_regs->isconst>>t)&1)){
        if(rs1[i]==0)
        {
          emit_zeroreg(t); // shifting r0 always yields zero
        }
        else
        {
          // If the source is spilled, reload it into the target first
          // (unless t already held rs1 on entry).
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            // s<0 means the source was just reloaded into t above.
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      // |64 lookups select the host regs holding the upper 32 bits.
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]==0)
        {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          assert(sl>=0);
          assert(sh>=0);
          if(imm[i]) {
            // Double-width shifts: the *dimm forms combine both source
            // halves so bits move correctly across the 32-bit boundary.
            if(opcode2[i]==0x38) // DSLL
            {
              if(th>=0) emit_shldimm(sh,sl,imm[i],th);
              emit_shlimm(sl,imm[i],tl);
            }
            if(opcode2[i]==0x3a) // DSRL
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_shrimm(sh,imm[i],th);
            }
            if(opcode2[i]==0x3b) // DSRA
            {
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_sarimm(sh,imm[i],th);
            }
          }else{
            // Shift by zero
            if(sl!=tl) emit_mov(sl,tl);
            if(th>=0&&sh!=th) emit_mov(sh,th);
          }
        }
      }
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      signed char sl,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(th>=0||tl>=0){
        assert(tl>=0);
        assert(th>=0);
        assert(sl>=0);
        // Shifting left by >=32: the low source word becomes the high
        // result word and the low result word is zero.
        emit_mov(sl,th);
        emit_zeroreg(tl);
        // imm appears to hold the full shift amount (32..63) here;
        // &31 extracts the residual shift within the word.
        if(imm[i]>32)
        {
          emit_shlimm(th,imm[i]&31,th);
        }
      }
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      signed char sh,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // Logical shift right by >=32: high source word becomes the low
        // result word, high result word is zero.
        emit_mov(sh,tl);
        if(th>=0) emit_zeroreg(th);
        if(imm[i]>32)
        {
          emit_shrimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      signed char sh,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // Arithmetic shift right by >=32: only the low result word is
        // produced here (upper word of rt is not looked up).
        emit_mov(sh,tl);
        if(imm[i]>32)
        {
          emit_sarimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
}
2503
2504 #ifndef shift_assemble
/* Fallback used when no architecture-specific shift_assemble is provided
 * (guarded by #ifndef shift_assemble). Reports the missing implementation
 * on stderr -- a fatal diagnostic must not go to stdout, where it can be
 * lost if normal output is redirected -- then aborts the process. */
void shift_assemble(int i,struct regstat *i_regs)
{
  (void)i;       /* unused: this stub never assembles anything */
  (void)i_regs;
  fprintf(stderr,"Need shift_assemble for this architecture.\n");
  exit(1);
}
2510 #endif
2511
// Assemble a load instruction (LB/LH/LW/LBU/LHU/LWU/LD).
//   i      - index into the decoded-instruction arrays
//   i_regs - register-allocation state for this instruction
// Fast path: an inline read from RAM, guarded by a compare-and-branch
// (emit_fastpath_cmp_jump) to an out-of-line stub that handles I/O and
// other slow addresses. When the address is a known constant (c), the
// range check is resolved at assembly time: in-RAM targets (memtarget)
// are inlined without a guard, anything else goes via inline_readstub.
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr,map=-1;
  int offset;
  int jaddr=0;       // patch point for the slow-path branch, 0 if none
  int memtarget=0,c=0;
  int fastload_reg_override=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64); // host reg for upper 32 bits of rt
  tl=get_reg(i_regs->regmap,rt1[i]);    // host reg for (low word of) rt
  s=get_reg(i_regs->regmap,rs1[i]);     // host reg for the base register
  offset=imm[i];
  // Build the set of allocated host registers; passed to the stubs.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    // c: base register value is a known constant, so the effective
    // address is known now; memtarget: that address falls within RAM.
    c=(i_regs->wasconst>>s)&1;
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  // A load whose result is unused must still be performed when it may
  // target the 0x1f80xxxx hardware-register area (reads have side
  // effects there, e.g. FIFOs); borrow a scratch register for it.
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1); // scratch: reg mapped to nothing
      assert(tl>=0);
  }
  // Use the base register directly as the address only when no offset
  // needs adding; otherwise the address is computed into tl.
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);  // result regs need not be saved around the stub
  if(th>=0) reglist&=~(1<<th);
  if(!c) {
    #ifdef RAM_OFFSET
    map=get_reg(i_regs->regmap,ROREG);
    if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
    #endif
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // Range check + branch to the slow path; returns the patch point.
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // Constant in-RAM address with a RAM base offset: precompute
    // the offset address into the temp register.
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastload_reg_override=HOST_TEMPREG;
  }
  // dummy: destination is r0 or tl is only a scratch register;
  // skip the actual data read (the guard/stub still runs).
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          // Byte lane swizzle: xor 3 flips the byte address for
          // big-endian guest on little-endian host.
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movsbl_indexed_tlb(x,a,map,tl);
        }
      }
      if(jaddr)
        add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      // Constant address outside RAM: always take the slow path inline.
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          // Halfword lane swizzle (xor 2) for big-endian guest.
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;
          //#ifdef
          //emit_movswl_indexed_tlb(x,tl,map,tl);
          //else
          if(map>=0) {
            emit_movswl_indexed(x,a,tl);
          }else{
            #if 1 //def RAM_OFFSET
            emit_movswl_indexed(x,a,tl);
            #else
            emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
            #endif
          }
        }
      }
      if(jaddr)
        add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,a,map,tl);
      }
      if(jaddr)
        add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movzbl_indexed_tlb(x,a,map,tl);
        }
      }
      if(jaddr)
        add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0,a=tl;
#ifdef BIG_ENDIAN_MIPS
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
#else
          if(!c) a=addr;
#endif
          if(fastload_reg_override) a=fastload_reg_override;
          //#ifdef
          //emit_movzwl_indexed_tlb(x,tl,map,tl);
          //#else
          if(map>=0) {
            emit_movzwl_indexed(x,a,tl);
          }else{
            #if 1 //def RAM_OFFSET
            emit_movzwl_indexed(x,a,tl);
            #else
            emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
            #endif
          }
        }
      }
      if(jaddr)
        add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(th>=0);
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,a,map,tl);
      }
      if(jaddr)
        add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else {
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    emit_zeroreg(th); // LWU zero-extends: upper word is always 0
  }
  if (opcode[i]==0x37) { // LD
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
        //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
        else
        #endif
        emit_readdword_indexed_tlb(0,a,map,th,tl);
      }
      if(jaddr)
        add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
 }
  //emit_storereg(rt1[i],tl); // DEBUG
  //if(opcode[i]==0x23)
  //if(opcode[i]==0x24)
  //if(opcode[i]==0x23||opcode[i]==0x24)
  /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
  {
    //emit_pusha();
    save_regs(0x100f);
        emit_readword((int)&last_count,ECX);
        #ifdef __i386__
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_add(HOST_CCREG,ECX,HOST_CCREG);
        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
        emit_writeword(HOST_CCREG,(int)&Count);
        #endif
        #ifdef __arm__
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,0);
        else
          emit_mov(HOST_CCREG,0);
        emit_add(0,ECX,0);
        emit_addimm(0,2*ccadj[i],0);
        emit_writeword(0,(int)&Count);
        #endif
    emit_call((int)memdebug);
    //emit_popa();
    restore_regs(0x100f);
  }*/
}
2796
2797 #ifndef loadlr_assemble
/* Fallback used when no architecture-specific loadlr_assemble is provided
 * (guarded by #ifndef loadlr_assemble). Reports the missing implementation
 * on stderr -- a fatal diagnostic must not go to stdout, where it can be
 * lost if normal output is redirected -- then aborts the process. */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;       /* unused: this stub never assembles anything */
  (void)i_regs;
  fprintf(stderr,"Need loadlr_assemble for this architecture.\n");
  exit(1);
}
2803 #endif
2804
2805 void store_assemble(int i,struct regstat *i_regs)
2806 {
2807   int s,th,tl,map=-1;
2808   int addr,temp;
2809   int offset;
2810   int jaddr=0,type;
2811   int memtarget=0,c=0;
2812   int agr=AGEN1+(i&1);
2813   int faststore_reg_override=0;
2814   u_int hr,reglist=0;
2815   th=get_reg(i_regs->regmap,rs2[i]|64);
2816   tl=get_reg(i_regs->regmap,rs2[i]);
2817   s=get_reg(i_regs->regmap,rs1[i]);
2818   temp=get_reg(i_regs->regmap,agr);
2819   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2820   offset=imm[i];
2821   if(s>=0) {
2822     c=(i_regs->wasconst>>s)&1;
2823     if(c) {
2824       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2825     }
2826   }
2827   assert(tl>=0);
2828   assert(temp>=0);
2829   for(hr=0;hr<HOST_REGS;hr++) {
2830     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2831   }
2832   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2833   if(offset||s<0||c) addr=temp;
2834   else addr=s;
2835   if(!c) {
2836     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2837   }
2838   else if(ram_offset&&memtarget) {
2839     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2840     faststore_reg_override=HOST_TEMPREG;
2841   }
2842
2843   if (opcode[i]==0x28) { // SB
2844     if(!c||memtarget) {
2845       int x=0,a=temp;
2846 #ifdef BIG_ENDIAN_MIPS
2847       if(!c) emit_xorimm(addr,3,temp);
2848       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2849 #else
2850       if(!c) a=addr;
2851 #endif
2852       if(faststore_reg_override) a=faststore_reg_override;
2853       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2854       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2855     }
2856     type=STOREB_STUB;
2857   }
2858   if (opcode[i]==0x29) { // SH
2859     if(!c||memtarget) {
2860       int x=0,a=temp;
2861 #ifdef BIG_ENDIAN_MIPS
2862       if(!c) emit_xorimm(addr,2,temp);
2863       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2864 #else
2865       if(!c) a=addr;
2866 #endif
2867       if(faststore_reg_override) a=faststore_reg_override;
2868       //#ifdef
2869       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2870       //#else
2871       if(map>=0) {
2872         emit_writehword_indexed(tl,x,a);
2873       }else
2874         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2875         emit_writehword_indexed(tl,x,a);
2876     }
2877     type=STOREH_STUB;
2878   }
2879   if (opcode[i]==0x2B) { // SW
2880     if(!c||memtarget) {
2881       int a=addr;
2882       if(faststore_reg_override) a=faststore_reg_override;
2883       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2884       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2885     }
2886     type=STOREW_STUB;
2887   }
2888   if (opcode[i]==0x3F) { // SD
2889     if(!c||memtarget) {
2890       int a=addr;
2891       if(faststore_reg_override) a=faststore_reg_override;
2892       if(rs2[i]) {
2893         assert(th>=0);
2894         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2895         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2896         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2897       }else{
2898         // Store zero
2899         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2900         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2901         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2902       }
2903     }
2904     type=STORED_STUB;
2905   }
2906   if(jaddr) {
2907     // PCSX store handlers don't check invcode again
2908     reglist|=1<<addr;
2909     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2910     jaddr=0;
2911   }
2912   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2913     if(!c||memtarget) {
2914       #ifdef DESTRUCTIVE_SHIFT
2915       // The x86 shift operation is 'destructive'; it overwrites the
2916       // source register, so we need to make a copy first and use that.
2917       addr=temp;
2918       #endif
2919       #if defined(HOST_IMM8)
2920       int ir=get_reg(i_regs->regmap,INVCP);
2921       assert(ir>=0);
2922       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2923       #else
2924       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2925       #endif
2926       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2927       emit_callne(invalidate_addr_reg[addr]);
2928       #else
2929       int jaddr2=(int)out;
2930       emit_jne(0);
2931       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2932       #endif
2933     }
2934   }
2935   u_int addr_val=constmap[i][s]+offset;
2936   if(jaddr) {
2937     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2938   } else if(c&&!memtarget) {
2939     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2940   }
2941   // basic current block modification detection..
2942   // not looking back as that should be in mips cache already
2943   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2944     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2945     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2946     if(i_regs->regmap==regs[i].regmap) {
2947       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2948       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2949       emit_movimm(start+i*4+4,0);
2950       emit_writeword(0,(int)&pcaddr);
2951       emit_jmp((int)do_interrupt);
2952     }
2953   }
2954   //if(opcode[i]==0x2B || opcode[i]==0x3F)
2955   //if(opcode[i]==0x2B || opcode[i]==0x28)
2956   //if(opcode[i]==0x2B || opcode[i]==0x29)
2957   //if(opcode[i]==0x2B)
2958   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
2959   {
2960     #ifdef __i386__
2961     emit_pusha();
2962     #endif
2963     #ifdef __arm__
2964     save_regs(0x100f);
2965     #endif
2966         emit_readword((int)&last_count,ECX);
2967         #ifdef __i386__
2968         if(get_reg(i_regs->regmap,CCREG)<0)
2969           emit_loadreg(CCREG,HOST_CCREG);
2970         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2971         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2972         emit_writeword(HOST_CCREG,(int)&Count);
2973         #endif
2974         #ifdef __arm__
2975         if(get_reg(i_regs->regmap,CCREG)<0)
2976           emit_loadreg(CCREG,0);
2977         else
2978           emit_mov(HOST_CCREG,0);
2979         emit_add(0,ECX,0);
2980         emit_addimm(0,2*ccadj[i],0);
2981         emit_writeword(0,(int)&Count);
2982         #endif
2983     emit_call((int)memdebug);
2984     #ifdef __i386__
2985     emit_popa();
2986     #endif
2987     #ifdef __arm__
2988     restore_regs(0x100f);
2989     #endif
2990   }*/
2991 }
2992
// Assemble an unaligned store: SWL (0x2A), SWR (0x2E), SDL (0x2C), SDR (0x2D).
// Emits a 4-way dispatch on the low two address bits; each case writes the
// byte/halfword/word slice of the register that overlaps the aligned word.
// Addresses outside RAM fall back to STORELR_STUB, and a self-modifying-code
// check may invalidate the translated block covering the written address.
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl;          // host regs: base (s), source value high/low (th/tl)
  int temp;             // scratch host reg holding the effective address
  int temp2=-1;         // scratch for the second word of SDL/SDR
  int offset;
  int jaddr=0;          // branch to patch into the slow-path stub
  int case1,case2,case3;
  int done0,done1,done2;
  int memtarget=0,c=0;  // c: base is a known constant; memtarget: it hits RAM
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // Constant propagation: if the base register's value is known at
    // compile time, decide statically whether the store targets RAM.
    c=(i_regs->isconst>>s)&1;
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  // Collect all allocated host registers so stubs can save/restore them.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // Dynamic address: range-check against RAM_SIZE, copy the address into
    // temp, and branch to the stub on overflow (address outside RAM).
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=(int)out;
    emit_jno(0);
  }
  else
  {
    // Known non-RAM target (or store through $zero base): always take the
    // slow path.
    if(!memtarget||!rs1[i]) {
      jaddr=(int)out;
      emit_jmp(0);
    }
  }
  // Translate the PSX address in temp into a host pointer.
  #ifdef RAM_OFFSET
  int map=get_reg(i_regs->regmap,ROREG);
  if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
  #else
  if((u_int)rdram!=0x80000000)
    emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
  #endif

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    temp2=get_reg(i_regs->regmap,FTEMP);
    if(!rs2[i]) temp2=th=tl;  // storing zero: reuse tl for both halves
  }

#ifndef BIG_ENDIAN_MIPS
    // Little-endian host: flip the low address bits so the byte-lane offsets
    // below match MIPS big-endian left/right semantics.
    emit_xorimm(temp,3,temp);
#endif
  // Dispatch on the low two bits of the (lane-swapped) address.
  emit_testimm(temp,2);
  case2=(int)out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=(int)out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    emit_writeword_indexed(th,0,temp);
    if(rs2[i]) emit_mov(tl,temp2);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_writebyte_indexed(tl,3,temp);
    if(rs2[i]) emit_shldimm(th,tl,24,temp2);
  }
  done0=(int)out;
  emit_jmp(0);
  // 1
  set_jump_target(case1,(int)out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(th,8,th);
    emit_writehword_indexed(th,-1,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writebyte_indexed(th,1,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,16,temp2);
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  done1=(int)out;
  emit_jmp(0);
  // 2
  set_jump_target(case2,(int)out);
  emit_testimm(temp,1);
  case3=(int)out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(th,16,th);
    emit_writehword_indexed(th,-2,temp);
    if(rs2[i]) emit_rorimm(th,16,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_shldimm(th,tl,8,temp2);
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl);
  }
  done2=(int)out;
  emit_jmp(0);
  // 3
  set_jump_target(case3,(int)out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(th,24,th);
    emit_writebyte_indexed(th,-3,temp);
    if(rs2[i]) emit_rorimm(th,8,th);
  }
  if (opcode[i]==0x2D) { // SDR
    if(rs2[i]) emit_mov(th,temp2);
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  set_jump_target(done0,(int)out);
  set_jump_target(done1,(int)out);
  set_jump_target(done2,(int)out);
  // SDL/SDR span two aligned words: test address bit 2 and, when the second
  // word is needed, align the address and store the merged value from temp2.
  if (opcode[i]==0x2C) { // SDL
    emit_testimm(temp,4);
    done0=(int)out;
    emit_jne(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,4,temp);
    set_jump_target(done0,(int)out);
  }
  if (opcode[i]==0x2D) { // SDR
    emit_testimm(temp,4);
    done0=(int)out;
    emit_jeq(0);
    emit_andimm(temp,~3,temp);
    emit_writeword_indexed(temp2,-4,temp);
    set_jump_target(done0,(int)out);
  }
  // Slow path for non-RAM (or possibly non-RAM) targets.
  if(!c||!memtarget)
    add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
  // Self-modifying-code check: convert the host pointer back to a PSX
  // address, compare against invalid_code[], and invalidate the translated
  // block covering the written address on a match.
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    #ifdef RAM_OFFSET
    int map=get_reg(i_regs->regmap,ROREG);
    if(map<0) map=HOST_TEMPREG;
    gen_orig_addr_w(temp,map);
    #else
    emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
    #endif
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    int jaddr2=(int)out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
  // Disabled debug instrumentation (calls memdebug after each store).
  /*
    emit_pusha();
    //save_regs(0x100f);
        emit_readword((int)&last_count,ECX);
        if(get_reg(i_regs->regmap,CCREG)<0)
          emit_loadreg(CCREG,HOST_CCREG);
        emit_add(HOST_CCREG,ECX,HOST_CCREG);
        emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
        emit_writeword(HOST_CCREG,(int)&Count);
    emit_call((int)memdebug);
    emit_popa();
    //restore_regs(0x100f);
  */
}
3221
// LWC1/SWC1: delegated to the coprocessor-unusable handler (this core
// provides no COP1 load/store path; cop1_unusable emits the exception stub).
void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3226
// Assemble a COP2 (GTE) load/store word: LWC2 (0x32) and SWC2 (0x3a).
// Moves a 32-bit value between a GTE data register and memory via the FTEMP
// host register; non-RAM addresses go through LOADW_STUB/STOREW_STUB, and
// SWC2 additionally runs the self-modifying-code check.
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;                 // host regs: base (s), data value (tl=FTEMP)
  int ar;                   // host reg holding the effective address
  int offset;
  int memtarget=0,c=0;      // c: constant base; memtarget: constant hits RAM
  int jaddr2=0,type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=0;
  u_int hr,reglist=0;
  // GTE data register index, taken from instruction bits 20:16 (rt field).
  u_int copr=(source[i]>>16)&0x1f;
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  // Collect allocated host registers so stubs can save/restore them.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;  // the data register doubles as the address register
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    // Fetch the GTE register value into tl before computing the store.
    cop2_get_dreg(copr,tl,HOST_TEMPREG);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Known non-RAM target: unconditional jump to the slow-path stub.
    jaddr2=(int)out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Dynamic address: range-check, branch to stub on miss.
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      #ifdef HOST_IMM_ADDR32
      // NOTE(review): with HOST_IMM_ADDR32 defined, this `else` would attach
      // to the declaration below, which is not a valid C statement —
      // presumably this path is never compiled here; verify before enabling.
      if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
      else
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
  // Self-modifying-code check for stores: compare the written address
  // against invalid_code[] and invalidate the affected block on a match.
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    int jaddr3=(int)out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    // Write the loaded value back into the GTE data register.
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
  }
}
3323
#ifndef multdiv_assemble
// Fallback stub used when the target architecture does not supply its own
// MULT/DIV assembler: report the missing port and abort.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
3331
3332 void mov_assemble(int i,struct regstat *i_regs)
3333 {
3334   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3335   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3336   if(rt1[i]) {
3337     signed char sh,sl,th,tl;
3338     th=get_reg(i_regs->regmap,rt1[i]|64);
3339     tl=get_reg(i_regs->regmap,rt1[i]);
3340     //assert(tl>=0);
3341     if(tl>=0) {
3342       sh=get_reg(i_regs->regmap,rs1[i]|64);
3343       sl=get_reg(i_regs->regmap,rs1[i]);
3344       if(sl>=0) emit_mov(sl,tl);
3345       else emit_loadreg(rs1[i],tl);
3346       if(th>=0) {
3347         if(sh>=0) emit_mov(sh,th);
3348         else emit_loadreg(rs1[i]|64,th);
3349       }
3350     }
3351   }
3352 }
3353
#ifndef fconv_assemble
// Fallback stub used when the target architecture does not supply its own
// float-conversion assembler: report the missing port and abort.
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif
3361
#if 0
// Disabled: architecture-fallback stub for FPU arithmetic; the PSX core
// never emits float ops, so this is compiled out.
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3369
3370 void syscall_assemble(int i,struct regstat *i_regs)
3371 {
3372   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3373   assert(ccreg==HOST_CCREG);
3374   assert(!is_delayslot);
3375   (void)ccreg;
3376   emit_movimm(start+i*4,EAX); // Get PC
3377   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3378   emit_jmp((int)jump_syscall_hle); // XXX
3379 }
3380
3381 void hlecall_assemble(int i,struct regstat *i_regs)
3382 {
3383   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3384   assert(ccreg==HOST_CCREG);
3385   assert(!is_delayslot);
3386   (void)ccreg;
3387   emit_movimm(start+i*4+4,0); // Get PC
3388   emit_movimm((int)psxHLEt[source[i]&7],1);
3389   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3390   emit_jmp((int)jump_hlecall);
3391 }
3392
3393 void intcall_assemble(int i,struct regstat *i_regs)
3394 {