1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Mupen64plus/PCSX - assem_arm64.c *
3 * Copyright (C) 2009-2011 Ari64 *
4 * Copyright (C) 2009-2018 Gillou68310 *
5 * Copyright (C) 2021 notaz *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
21 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
24 #include "arm_features.h"
// Shorthand for the compiler's "unused" attribute.
26 #define unused __attribute__((unused))
// Declared here; no definition appears in this part of the file.
28 void do_memhandler_pre();
29 void do_memhandler_post();
// Patches the branch-like instruction at addr so that it refers to target.
// The instruction kind is recognised from its opcode bits (b, b.cond,
// cbz/cbnz, adr); the byte distance target-addr is range-checked with
// assert and written into the instruction's immediate field.
// Any other instruction aborts.
32 static void set_jump_target(void *addr, void *target)
35 intptr_t offset = (u_char *)target - (u_char *)addr;
37 if ((*ptr&0xFC000000) == 0x14000000) { // b
// 26-bit word offset, i.e. +/-128MB
38 assert(offset>=-134217728LL&&offset<134217728LL);
39 *ptr=(*ptr&0xFC000000)|((offset>>2)&0x3ffffff);
41 else if ((*ptr&0xff000000) == 0x54000000 // b.cond
42 || (*ptr&0x7e000000) == 0x34000000) { // cbz/cbnz
43 // Conditional branches are limited to +/- 1MB.
44 // The maximum block size is 256k, so branching beyond the +/- 1MB limit
45 // should only happen when jumping to an already compiled block (see add_jump_out).
46 // A workaround would be a trampoline jump via a stub at the end of the block.
47 assert(-1048576 <= offset && offset < 1048576);
// 19-bit word offset stored at bit 5
48 *ptr=(*ptr&0xFF00000F)|(((offset>>2)&0x7ffff)<<5);
50 else if((*ptr&0x9f000000)==0x10000000) { // adr
51 // generated by do_miniht_insert
52 assert(offset>=-1048576LL&&offset<1048576LL);
// adr keeps the low 2 offset bits at bit 29 and the rest at bit 5
53 *ptr=(*ptr&0x9F00001F)|(offset&0x3)<<29|((offset>>2)&0x7ffff)<<5;
56 abort(); // should not happen
// Given a pointer to an external jump stub (as produced by emit_extjump2),
// locate the instruction that jumps to it.  The third word of the stub
// must be an adr; its decoded target is the address returned.
static void *find_extjump_insn(void *stub)
{
  int *adr_insn = (int *)stub + 2;
  int insn, immhi, immlo, byte_offset;

  assert((*adr_insn & 0x9f000000) == 0x10000000); // adr
  insn = *adr_insn;
  immhi = (signed int)(insn << 8) >> 13;  // sign-extended bits 23..5
  immlo = (insn >> 29) & 0x3;             // bits 30..29
  byte_offset = (immhi << 2) | immlo;
  return adr_insn + byte_offset / 4;
}
70 // find where an external branch is linked to, using the addr of its stub:
71 // get address that the stub loads (dyna_linker arg1),
72 // treat it as a pointer to branch insn,
73 // return addr where that branch jumps to
// Resolves a stub to the address its jumping instruction currently targets:
// the instruction is found with find_extjump_insn() and its signed word
// offset is decoded according to the branch kind.
74 static void *get_pointer(void *stub)
76 int *i_ptr = find_extjump_insn(stub);
77 if ((*i_ptr&0xfc000000) == 0x14000000) // b
// sign-extend the low 26 bits (word offset)
78 return i_ptr + ((signed int)(*i_ptr<<6)>>6);
79 if ((*i_ptr&0xff000000) == 0x54000000 // b.cond
80 || (*i_ptr&0x7e000000) == 0x34000000) // cbz/cbnz
// sign-extend the 19-bit word offset held in bits 23..5
81 return i_ptr + ((signed int)(*i_ptr<<8)>>13);
87 // Allocate a specific ARM register.
// cur: register state being edited; reg: guest register; hr: host register.
// The visible code carries the dirty bit of any host register that already
// maps reg over to hr and clears hr's isconst bit.
88 static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr)
93 // see if it's already allocated (and dealloc it)
94 for(n=0;n<HOST_REGS;n++)
96 if(n!=EXCLUDE_REG&&cur->regmap[n]==reg) {
97 dirty=(cur->dirty>>n)&1;
103 cur->dirty&=~(1<<hr);
104 cur->dirty|=dirty<<hr;
105 cur->isconst&=~(1<<hr);
108 // Alloc cycle count into dedicated register
109 static void alloc_cc(struct regstat *cur,int i)
111 alloc_arm_reg(cur,i,CCREG,HOST_CCREG);
// Printable names for the 32-bit views of the 32 register numbers,
// indexed by register number; used in assem_debug output.
119 static unused const char *regname[32] = {
120 "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
121 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
122 "ip0", "ip1", "w18", "w19", "w20", "w21", "w22", "w23",
123 "w24", "w25", "w26", "w27", "w28", "wfp", "wlr", "wsp"
// Printable names for the 64-bit views of the 32 register numbers,
// indexed by register number; used in assem_debug output.
126 static unused const char *regname64[32] = {
127 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
128 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
129 "ip0", "ip1", "x18", "x19", "x20", "x21", "x22", "x23",
130 "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"
// Condition codes, listed in the same order as the names in condname[].
// NOTE(review): presumably the enumerators of an enum whose opening line is
// not shown here - confirm.
134 COND_EQ, COND_NE, COND_CS, COND_CC, COND_MI, COND_PL, COND_VS, COND_VC,
135 COND_HI, COND_LS, COND_GE, COND_LT, COND_GT, COND_LE, COND_AW, COND_NV
// Printable condition suffixes, indexed by condition code (COND_*).
138 static unused const char *condname[16] = {
139 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
140 "hi", "ls", "ge", "lt", "gt", "le", "aw", "nv"
// Stores one 32-bit word at the current output position `out`.
143 static void output_w32(u_int word)
145 *((u_int *)out) = word;
// Combines two register fields: rn shifted left by 5, OR'ed with rd.
149 static u_int rn_rd(u_int rn, u_int rd)
153 return (rn << 5) | rd;
// Combines three register fields: rm at bit 16, rn at bit 5, rd at bit 0.
156 static u_int rm_rn_rd(u_int rm, u_int rn, u_int rd)
161 return (rm << 16) | (rn << 5) | rd;
// Same as rm_rn_rd() with a fourth register field, ra, placed at bit 10.
164 static u_int rm_ra_rn_rd(u_int rm, u_int ra, u_int rn, u_int rd)
167 return rm_rn_rd(rm, rn, rd) | (ra << 10);
// Combines the fields of a register-pair access: imm7 at bit 15,
// rt2 at bit 10, rn at bit 5, rt at bit 0 (used by emit_ldstp).
170 static u_int imm7_rt2_rn_rt(u_int imm7, u_int rt2, u_int rn, u_int rt)
176 return (imm7 << 15) | (rt2 << 10) | (rn << 5) | rt;
// Same as rm_rn_rd() with an extra immediate, imm6, placed at bit 10
// (callers pass a shift amount here).
179 static u_int rm_imm6_rn_rd(u_int rm, u_int imm6, u_int rn, u_int rd)
182 return rm_rn_rd(rm, rn, rd) | (imm6 << 10);
// Combines a 16-bit immediate (asserted < 0x10000), placed at bit 5, with rd.
185 static u_int imm16_rd(u_int imm16, u_int rd)
187 assert(imm16 < 0x10000);
189 return (imm16 << 5) | rd;
// Combines a 12-bit immediate (asserted < 0x1000), placed at bit 10,
// with rn at bit 5 and rd at bit 0.
192 static u_int imm12_rn_rd(u_int imm12, u_int rn, u_int rd)
194 assert(imm12 < 0x1000);
197 return (imm12 << 10) | (rn << 5) | rd;
// Combines a 9-bit immediate (asserted < 0x200), placed at bit 12,
// with rn at bit 5 and the transfer register at bit 0.
// Callers pass signed offsets already masked with 0x1ff.
200 static u_int imm9_rn_rt(u_int imm9, u_int rn, u_int rd)
202 assert(imm9 < 0x200);
205 return (imm9 << 12) | (rn << 5) | rd;
// Combines a 19-bit immediate (asserted < 0x80000), placed at bit 5, with rt.
208 static u_int imm19_rt(u_int imm19, u_int rt)
210 assert(imm19 < 0x80000);
212 return (imm19 << 5) | rt;
// Combines the fields shared by the bitfield and logical-immediate emitters:
// n at bit 22, immr at bit 16, imms at bit 10, rn at bit 5, rd at bit 0.
215 static u_int n_immr_imms_rn_rd(u_int n, u_int immr, u_int imms, u_int rn, u_int rd)
222 return (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd;
// Computes the 26-bit word-offset field for an unconditional branch from
// the current output position `out` to addr.
// addr values below 3 are placeholders and yield 0.
// Distances outside +/-128MB are reported via SysPrintf.
225 static u_int genjmp(const u_char *addr)
227 intptr_t offset = addr - out;
228 if ((uintptr_t)addr < 3) return 0; // a branch that will be patched later
229 if (offset < -134217728 || offset > 134217727) {
230 SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
234 return ((u_int)offset >> 2) & 0x03ffffff;
// Computes the 19-bit word-offset field for a conditional or compare branch
// from the current output position `out` to addr.
// addr values below 3 yield 0 (same placeholder convention as genjmp).
// Distances outside the +/-1MB range are reported via SysPrintf.
237 static u_int genjmpcc(const u_char *addr)
239 intptr_t offset = addr - out;
240 if ((uintptr_t)addr < 3) return 0;
241 if (offset < -1048576 || offset > 1048572) {
242 SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
246 return ((u_int)offset >> 2) & 0x7ffff;
249 static uint32_t is_mask(u_int value)
251 return value && ((value + 1) & value) == 0;
254 // This function returns true if the argument contains a
255 // non-empty sequence of ones (possibly rotated) with the remainder zero.
// All-zeros and all-ones are tested for separately first.
// Filling the zeros below the lowest set bit, (value - 1) | value,
// turns a single run of ones into an is_mask() pattern; the same test on
// ~value covers a run that wraps around bit 31.
256 static uint32_t is_rotated_mask(u_int value)
258 if (value == 0 || value == ~0)
260 if (is_mask((value - 1) | value))
262 return is_mask((~value - 1) | ~value);
// Derives the immr/imms fields of a 32-bit logical-immediate encoding from
// value; callers check is_rotated_mask(value) first.
// The run of ones is measured by counting leading and trailing zero bits.
// The same measurement appears twice.
// NOTE(review): the second copy presumably runs on the inverted value for
// runs that wrap around bit 31 - confirm against the full file.
265 static void gen_logical_imm(u_int value, u_int *immr, u_int *imms)
267 int lzeros, tzeros, ones;
269 if (is_mask((value - 1) | value)) {
270 lzeros = __builtin_clz(value);
271 tzeros = __builtin_ctz(value);
272 ones = 32 - lzeros - tzeros;
// rotate-right amount that moves the run up from bit 0 to bit tzeros
273 *immr = (32 - tzeros) & 31;
278 if (is_mask((value - 1) | value)) {
279 lzeros = __builtin_clz(value);
280 tzeros = __builtin_ctz(value);
281 ones = 32 - lzeros - tzeros;
289 static void emit_mov(u_int rs, u_int rt)
291 assem_debug("mov %s,%s\n", regname[rt], regname[rs]);
292 output_w32(0x2a000000 | rm_rn_rd(rs, WZR, rt));
295 static void emit_mov64(u_int rs, u_int rt)
297 assem_debug("mov %s,%s\n", regname64[rt], regname64[rs]);
298 output_w32(0xaa000000 | rm_rn_rd(rs, WZR, rt));
301 static void emit_add(u_int rs1, u_int rs2, u_int rt)
303 assem_debug("add %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
304 output_w32(0x0b000000 | rm_rn_rd(rs2, rs1, rt));
307 static void emit_add64(u_int rs1, u_int rs2, u_int rt)
309 assem_debug("add %s,%s,%s\n", regname64[rt], regname64[rs1], regname64[rs2]);
310 output_w32(0x8b000000 | rm_rn_rd(rs2, rs1, rt));
313 static void emit_adds64(u_int rs1, u_int rs2, u_int rt)
315 assem_debug("adds %s,%s,%s\n",regname64[rt],regname64[rs1],regname64[rs2]);
316 output_w32(0xab000000 | rm_rn_rd(rs2, rs1, rt));
318 #define emit_adds_ptr emit_adds64
320 static void emit_neg(u_int rs, u_int rt)
322 assem_debug("neg %s,%s\n",regname[rt],regname[rs]);
323 output_w32(0x4b000000 | rm_rn_rd(rs, WZR, rt));
326 static void emit_sub(u_int rs1, u_int rs2, u_int rt)
328 assem_debug("sub %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
329 output_w32(0x4b000000 | rm_imm6_rn_rd(rs2, 0, rs1, rt));
332 static void emit_sub_asrimm(u_int rs1, u_int rs2, u_int shift, u_int rt)
334 assem_debug("sub %s,%s,%s,asr #%u\n",regname[rt],regname[rs1],regname[rs2],shift);
335 output_w32(0x4b800000 | rm_imm6_rn_rd(rs2, shift, rs1, rt));
338 static void emit_movz(u_int imm, u_int rt)
340 assem_debug("movz %s,#%#x\n", regname[rt], imm);
341 output_w32(0x52800000 | imm16_rd(imm, rt));
344 static void emit_movz_lsl16(u_int imm, u_int rt)
346 assem_debug("movz %s,#%#x,lsl #16\n", regname[rt], imm);
347 output_w32(0x52a00000 | imm16_rd(imm, rt));
350 static void emit_movn(u_int imm, u_int rt)
352 assem_debug("movn %s,#%#x\n", regname[rt], imm);
353 output_w32(0x12800000 | imm16_rd(imm, rt));
356 static void emit_movn_lsl16(u_int imm,u_int rt)
358 assem_debug("movn %s,#%#x,lsl #16\n", regname[rt], imm);
359 output_w32(0x12a00000 | imm16_rd(imm, rt));
362 static void emit_movk(u_int imm,u_int rt)
364 assem_debug("movk %s,#%#x\n", regname[rt], imm);
365 output_w32(0x72800000 | imm16_rd(imm, rt));
// movk with "lsl #16": replaces the upper 16 bits of rt with imm.
368 static void emit_movk_lsl16(u_int imm,u_int rt)
371 assem_debug("movk %s,#%#x,lsl #16\n", regname[rt], imm);
372 output_w32(0x72a00000 | imm16_rd(imm, rt));
// NOTE(review): presumably emits code that clears rt (callers use it before
// a conditional set-to-1) - the body is not visible here, confirm.
375 static void emit_zeroreg(u_int rt)
// Loads a 32-bit constant into rt using the cheapest form that fits.
// It tries single movz/movn variants when only one halfword is significant,
// then an "orr rt,wzr,#imm" when imm is a rotated mask.
// Otherwise it uses a movz + movk pair.
380 static void emit_movimm(u_int imm, u_int rt)
384 else if ((~imm) < 65536)
386 else if ((imm&0xffff) == 0)
387 emit_movz_lsl16(imm >> 16, rt);
388 else if (((~imm)&0xffff) == 0)
389 emit_movn_lsl16(~imm >> 16, rt);
390 else if (is_rotated_mask(imm)) {
392 gen_logical_imm(imm, &immr, &imms);
393 assem_debug("orr %s,wzr,#%#x\n", regname[rt], imm);
394 output_w32(0x32000000 | n_immr_imms_rn_rd(0, immr, imms, WZR, rt));
// general case: low half first, then patch in the high half
397 emit_movz(imm & 0xffff, rt);
398 emit_movk_lsl16(imm >> 16, rt);
// Emits a 32-bit load of *addr into rt, addressed relative to FP.
// addr is expressed as an offset from dynarec_local and must be 4-byte
// aligned and at most 16380 bytes in for the scaled 12-bit form used here.
402 static void emit_readword(void *addr, u_int rt)
404 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
405 if (!(offset & 3) && offset <= 16380) {
406 assem_debug("ldr %s,[x%d+%#lx]\n", regname[rt], FP, offset);
407 output_w32(0xb9400000 | imm12_rn_rd(offset >> 2, FP, rt));
// Emits a 64-bit load of *addr into rt, addressed relative to FP.
// addr is expressed as an offset from dynarec_local and must be 8-byte
// aligned and at most 32760 bytes in (offset is scaled by 8).
413 static void emit_readdword(void *addr, u_int rt)
415 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
416 if (!(offset & 7) && offset <= 32760) {
417 assem_debug("ldr %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
418 output_w32(0xf9400000 | imm12_rn_rd(offset >> 3, FP, rt));
// pointer-sized reads use the 64-bit variant
423 #define emit_readptr emit_readdword
// Emits a sign-extending 16-bit load ("ldrsh") of *addr into rt, addressed
// relative to FP.  The offset from dynarec_local must be even and at most
// 8190 (it is scaled by 2).
425 static void emit_readshword(void *addr, u_int rt)
427 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
428 if (!(offset & 1) && offset <= 8190) {
429 assem_debug("ldrsh %s,[x%d+%#lx]\n", regname[rt], FP, offset);
430 output_w32(0x79c00000 | imm12_rn_rd(offset >> 1, FP, rt));
// Loads guest register r from its memory home into host register hr.
// Special register ids map to dedicated variables; INVCP and ROREG are
// pointer-sized and use the 64-bit load, everything else is 32-bit.
// Other ids index psxRegs.GPR.r[].
436 static void emit_loadreg(u_int r, u_int hr)
444 //case HIREG: addr = &hi; break;
445 //case LOREG: addr = &lo; break;
446 case CCREG: addr = &cycle_count; break;
447 case CSREG: addr = &Status; break;
448 case INVCP: addr = &invc_ptr; is64 = 1; break;
449 case ROREG: addr = &ram_offset; is64 = 1; break;
452 addr = &psxRegs.GPR.r[r];
456 emit_readdword(addr, hr);
458 emit_readword(addr, hr);
// Emits a 32-bit store of rt to *addr, addressed relative to FP.
// Same addressing constraints as emit_readword: the offset from
// dynarec_local must be 4-byte aligned and at most 16380.
462 static void emit_writeword(u_int rt, void *addr)
464 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
465 if (!(offset & 3) && offset <= 16380) {
466 assem_debug("str %s,[x%d+%#lx]\n", regname[rt], FP, offset);
467 output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, FP, rt));
// Emits a 64-bit store of rt to *addr, addressed relative to FP.
// The offset from dynarec_local must be 8-byte aligned and at most 32760.
473 static void emit_writedword(u_int rt, void *addr)
475 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
476 if (!(offset & 7) && offset <= 32760) {
477 assem_debug("str %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
478 output_w32(0xf9000000 | imm12_rn_rd(offset >> 3, FP, rt));
// Writes host register hr back to the memory home of guest register r.
// The home is psxRegs.GPR.r[r] by default (r asserted < 34), or
// cycle_count for CCREG.  The store is always 32-bit.
484 static void emit_storereg(u_int r, u_int hr)
487 void *addr = &psxRegs.GPR.r[r];
489 //case HIREG: addr = &hi; break;
490 //case LOREG: addr = &lo; break;
491 case CCREG: addr = &cycle_count; break;
492 default: assert(r < 34); break;
494 emit_writeword(hr, addr);
497 static void emit_test(u_int rs, u_int rt)
499 assem_debug("tst %s,%s\n", regname[rs], regname[rt]);
500 output_w32(0x6a000000 | rm_rn_rd(rt, rs, WZR));
// tst against an immediate.  Only immediates that are rotated masks are
// supported (asserted); the result register is WZR.
503 static void emit_testimm(u_int rs, u_int imm)
506 assem_debug("tst %s,#%#x\n", regname[rs], imm);
507 assert(is_rotated_mask(imm)); // good enough for PCSX
508 gen_logical_imm(imm, &immr, &imms);
509 output_w32(0x72000000 | n_immr_imms_rn_rd(0, immr, imms, rs, WZR));
512 static void emit_not(u_int rs,u_int rt)
514 assem_debug("mvn %s,%s\n",regname[rt],regname[rs]);
515 output_w32(0x2a200000 | rm_rn_rd(rs, WZR, rt));
518 static void emit_and(u_int rs1,u_int rs2,u_int rt)
520 assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
521 output_w32(0x0a000000 | rm_rn_rd(rs2, rs1, rt));
524 static void emit_or(u_int rs1,u_int rs2,u_int rt)
526 assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
527 output_w32(0x2a000000 | rm_rn_rd(rs2, rs1, rt));
530 static void emit_bic(u_int rs1,u_int rs2,u_int rt)
532 assem_debug("bic %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
533 output_w32(0x0a200000 | rm_rn_rd(rs2, rs1, rt));
536 static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt)
538 assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm);
539 output_w32(0x2a000000 | rm_imm6_rn_rd(rs, imm, rt, rt));
542 static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt)
544 assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm);
545 output_w32(0x2a400000 | rm_imm6_rn_rd(rs, imm, rt, rt));
548 static void emit_bicsar_imm(u_int rs,u_int imm,u_int rt)
550 assem_debug("bic %s,%s,%s,asr #%d\n",regname[rt],regname[rt],regname[rs],imm);
551 output_w32(0x0aa00000 | rm_imm6_rn_rd(rs, imm, rt, rt));
554 static void emit_xor(u_int rs1,u_int rs2,u_int rt)
556 assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
557 output_w32(0x4a000000 | rm_rn_rd(rs2, rs1, rt));
560 static void emit_xorsar_imm(u_int rs1, u_int rs2, u_int imm, u_int rt)
562 assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm);
563 output_w32(0x4a800000 | rm_imm6_rn_rd(rs2, imm, rs1, rt));
// Emits rt = rs + imm, choosing between add and sub immediate forms.
//   s    - nonzero selects the flag-setting variant ("adds"/"subs")
//   is64 - nonzero selects the 64-bit form
//   imm  - treated as signed through its two's complement (-imm is tested
//          to detect small negative values)
// Magnitudes below 4096 take one instruction.  Magnitudes below 2^24 take
// an add/sub of the upper 12 bits (shifted form) followed, when the low 12
// bits are nonzero or flags are requested, by a second add/sub of the low
// 12 bits that carries the flag-setting bit.
// NOTE(review): in the two-instruction paths the debug text names rt as the
// first source and rs as the second, while the encodings use rs first and rt
// second; the text also always uses the 32-bit register names - confirm and
// align the messages.
566 static void emit_addimm_s(u_int s, u_int is64, u_int rs, uintptr_t imm, u_int rt)
568 unused const char *st = s ? "s" : "";
569 s = s ? 0x20000000 : 0;
570 is64 = is64 ? 0x80000000 : 0;
572 assem_debug("add%s %s,%s,%#lx\n", st, regname[rt], regname[rs], imm);
573 output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm, rs, rt));
575 else if (-imm < 4096) {
576 assem_debug("sub%s %s,%s,%#lx\n", st, regname[rt], regname[rs], -imm);
577 output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm, rs, rt));
579 else if (imm < 16777216) {
580 assem_debug("add %s,%s,#%#lx\n",regname[rt],regname[rt],imm&0xfff000);
581 output_w32(0x11400000 | is64 | imm12_rn_rd(imm >> 12, rs, rt));
582 if ((imm & 0xfff) || s) {
583 assem_debug("add%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],imm&0xfff);
584 output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm & 0xfff, rt, rt));
587 else if (-imm < 16777216) {
588 assem_debug("sub %s,%s,#%#lx\n",regname[rt],regname[rt],-imm&0xfff000);
589 output_w32(0x51400000 | is64 | imm12_rn_rd(-imm >> 12, rs, rt));
// imm and -imm have nonzero low 12 bits in exactly the same cases
590 if ((imm & 0xfff) || s) {
591 assem_debug("sub%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],-imm&0xfff);
592 output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm & 0xfff, rt, rt));
599 static void emit_addimm(u_int rs, uintptr_t imm, u_int rt)
601 emit_addimm_s(0, 0, rs, imm, rt);
604 static void emit_addimm64(u_int rs, uintptr_t imm, u_int rt)
606 emit_addimm_s(0, 1, rs, imm, rt);
609 static void emit_addimm_and_set_flags(int imm, u_int rt)
611 emit_addimm_s(1, 0, rt, imm, rt);
// Emits a logical operation between rs and an immediate into rt.
// `name` is taken from names[op] for the debug text, and op is OR'ed into
// the instruction word as passed.
// Rotated-mask immediates use the immediate form directly.  Any other value
// is first loaded into HOST_TEMPREG and the register form is used.
614 static void emit_logicop_imm(u_int op, u_int rs, u_int imm, u_int rt)
616 const char *names[] = { "and", "orr", "eor", "ands" };
617 const char *name = names[op];
620 if (is_rotated_mask(imm)) {
621 gen_logical_imm(imm, &immr, &imms);
622 assem_debug("%s %s,%s,#%#x\n", name, regname[rt], regname[rs], imm);
623 output_w32(op | 0x12000000 | n_immr_imms_rn_rd(0, immr, imms, rs, rt));
// the temp register is acquired unless it is only used as the destination
626 if (rs == HOST_TEMPREG || rt != HOST_TEMPREG)
627 host_tempreg_acquire();
628 emit_movimm(imm, HOST_TEMPREG);
629 assem_debug("%s %s,%s,%s\n", name, regname[rt], regname[rs], regname[HOST_TEMPREG]);
630 output_w32(op | 0x0a000000 | rm_rn_rd(HOST_TEMPREG, rs, rt));
631 if (rs == HOST_TEMPREG || rt != HOST_TEMPREG)
632 host_tempreg_release();
// rt = rs & imm, via emit_logicop_imm with op index 0 ("and").
637 static void emit_andimm(u_int rs, u_int imm, u_int rt)
642 emit_logicop_imm(0, rs, imm, rt);
// rt = rs | imm, via emit_logicop_imm with op index 1 ("orr").
645 static void emit_orimm(u_int rs, u_int imm, u_int rt)
652 emit_logicop_imm(1, rs, imm, rt);
// rt = rs ^ imm, via emit_logicop_imm with op index 2 ("eor").
655 static void emit_xorimm(u_int rs, u_int imm, u_int rt)
662 emit_logicop_imm(2, rs, imm, rt);
665 static void emit_sbfm(u_int rs,u_int imm,u_int rt)
667 assem_debug("sbfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
668 output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
671 static void emit_ubfm(u_int rs,u_int imm,u_int rt)
673 assem_debug("ubfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
674 output_w32(0x53000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
677 static void emit_shlimm(u_int rs,u_int imm,u_int rt)
679 assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
680 output_w32(0x53000000 | n_immr_imms_rn_rd(0, (31-imm)+1, 31-imm, rs, rt));
683 static void emit_shrimm(u_int rs,u_int imm,u_int rt)
685 assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
686 output_w32(0x53000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
689 static void emit_shrimm64(u_int rs,u_int imm,u_int rt)
691 assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
692 output_w32(0xd3400000 | n_immr_imms_rn_rd(0, imm, 63, rs, rt));
695 static void emit_sarimm(u_int rs,u_int imm,u_int rt)
697 assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
698 output_w32(0x13000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
701 static void emit_rorimm(u_int rs,u_int imm,u_int rt)
703 assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
704 output_w32(0x13800000 | rm_imm6_rn_rd(rs, imm, rs, rt));
707 static void emit_signextend16(u_int rs, u_int rt)
709 assem_debug("sxth %s,%s\n", regname[rt], regname[rs]);
710 output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, 15, rs, rt));
713 static void emit_shl(u_int rs,u_int rshift,u_int rt)
715 assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
716 output_w32(0x1ac02000 | rm_rn_rd(rshift, rs, rt));
719 static void emit_shr(u_int rs,u_int rshift,u_int rt)
721 assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
722 output_w32(0x1ac02400 | rm_rn_rd(rshift, rs, rt));
725 static void emit_sar(u_int rs,u_int rshift,u_int rt)
727 assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
728 output_w32(0x1ac02800 | rm_rn_rd(rshift, rs, rt));
// Compares rs with an immediate, using the shortest form that fits.
// It tries cmp with a 12-bit immediate, cmn with the negated value when
// -imm fits in 12 bits, and cmp with the shifted 12-bit form when the low
// 12 bits are zero.
// Otherwise the constant goes through HOST_TEMPREG and a register compare.
// NOTE(review): the cmn debug text prints imm although the encoded operand
// is -imm - confirm.
731 static void emit_cmpimm(u_int rs, u_int imm)
734 assem_debug("cmp %s,%#x\n", regname[rs], imm);
735 output_w32(0x71000000 | imm12_rn_rd(imm, rs, WZR));
737 else if (-imm < 4096) {
738 assem_debug("cmn %s,%#x\n", regname[rs], imm);
739 output_w32(0x31000000 | imm12_rn_rd(-imm, rs, WZR));
741 else if (imm < 16777216 && !(imm & 0xfff)) {
742 assem_debug("cmp %s,#%#x\n", regname[rs], imm);
743 output_w32(0x71400000 | imm12_rn_rd(imm >> 12, rs, WZR));
746 host_tempreg_acquire();
747 emit_movimm(imm, HOST_TEMPREG);
748 assem_debug("cmp %s,%s\n", regname[rs], regname[HOST_TEMPREG]);
749 output_w32(0x6b000000 | rm_rn_rd(HOST_TEMPREG, rs, WZR));
750 host_tempreg_release();
// Conditionally replaces rt with the constant imm, which must be 0 or 1.
// Two encodings are visible.  The csinc form keeps rt when cond1 holds
// and otherwise yields WZR+1; the csel form yields WZR when cond0 holds
// and otherwise keeps rt.
// NOTE(review): presumably imm == 1 selects the csinc form and imm == 0 the
// csel form - the selecting lines are not visible here, confirm.
754 static void emit_cmov_imm(u_int cond0, u_int cond1, u_int imm, u_int rt)
756 assert(imm == 0 || imm == 1);
757 assert(cond0 < 0x10);
758 assert(cond1 < 0x10);
760 assem_debug("csinc %s,%s,%s,%s\n",regname[rt],regname[rt],regname[WZR],condname[cond1]);
761 output_w32(0x1a800400 | (cond1 << 12) | rm_rn_rd(WZR, rt, rt));
763 assem_debug("csel %s,%s,%s,%s\n",regname[rt],regname[WZR],regname[rt],condname[cond0]);
764 output_w32(0x1a800000 | (cond0 << 12) | rm_rn_rd(rt, WZR, rt));
768 static void emit_cmovne_imm(u_int imm,u_int rt)
770 emit_cmov_imm(COND_NE, COND_EQ, imm, rt);
773 static void emit_cmovl_imm(u_int imm,u_int rt)
775 emit_cmov_imm(COND_LT, COND_GE, imm, rt);
778 static void emit_cmovb_imm(int imm,u_int rt)
780 emit_cmov_imm(COND_CC, COND_CS, imm, rt);
783 static void emit_cmoveq_reg(u_int rs,u_int rt)
785 assem_debug("csel %s,%s,%s,eq\n",regname[rt],regname[rs],regname[rt]);
786 output_w32(0x1a800000 | (COND_EQ << 12) | rm_rn_rd(rt, rs, rt));
789 static void emit_cmovne_reg(u_int rs,u_int rt)
791 assem_debug("csel %s,%s,%s,ne\n",regname[rt],regname[rs],regname[rt]);
792 output_w32(0x1a800000 | (COND_NE << 12) | rm_rn_rd(rt, rs, rt));
795 static void emit_cmovl_reg(u_int rs,u_int rt)
797 assem_debug("csel %s,%s,%s,lt\n",regname[rt],regname[rs],regname[rt]);
798 output_w32(0x1a800000 | (COND_LT << 12) | rm_rn_rd(rt, rs, rt));
801 static void emit_cmovb_reg(u_int rs,u_int rt)
803 assem_debug("csel %s,%s,%s,cc\n",regname[rt],regname[rs],regname[rt]);
804 output_w32(0x1a800000 | (COND_CC << 12) | rm_rn_rd(rt, rs, rt));
807 static void emit_cmovs_reg(u_int rs,u_int rt)
809 assem_debug("csel %s,%s,%s,mi\n",regname[rt],regname[rs],regname[rt]);
810 output_w32(0x1a800000 | (COND_MI << 12) | rm_rn_rd(rt, rs, rt));
813 static void emit_csinvle_reg(u_int rs1,u_int rs2,u_int rt)
815 assem_debug("csinv %s,%s,%s,le\n",regname[rt],regname[rs1],regname[rs2]);
816 output_w32(0x5a800000 | (COND_LE << 12) | rm_rn_rd(rs2, rs1, rt));
// Emits the sequence for a signed "set if less than immediate".
// rt is zeroed and then set to 1 on the "lt" condition.
// When rs and rt are the same register the zeroing is deferred and done
// with emit_movimm.
// NOTE(review): presumably so the source survives until a compare that is
// not visible in these lines - confirm.
819 static void emit_slti32(u_int rs,int imm,u_int rt)
821 if(rs!=rt) emit_zeroreg(rt);
823 if(rs==rt) emit_movimm(0,rt);
824 emit_cmovl_imm(1,rt);
// Same shape as emit_slti32, but the final conditional set uses
// emit_cmovb_imm ("cc") instead of the signed "lt" condition.
827 static void emit_sltiu32(u_int rs,int imm,u_int rt)
829 if(rs!=rt) emit_zeroreg(rt);
831 if(rs==rt) emit_movimm(0,rt);
832 emit_cmovb_imm(1,rt);
835 static void emit_cmp(u_int rs,u_int rt)
837 assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
838 output_w32(0x6b000000 | rm_rn_rd(rt, rs, WZR));
841 static void emit_cmpcs(u_int rs,u_int rt)
843 assem_debug("ccmp %s,%s,#0,cs\n",regname[rs],regname[rt]);
844 output_w32(0x7a400000 | (COND_CS << 12) | rm_rn_rd(rt, rs, 0));
// NOTE(review): by its name, sets rt according to whether rs is greater than
// zero; only the final step, forcing rt to 0 on the "lt" condition, is
// visible here - confirm the preceding steps in the full file.
847 static void emit_set_gz32(u_int rs, u_int rt)
849 //assem_debug("set_gz32\n");
852 emit_cmovl_imm(0,rt);
// Copies rs into rt when they differ, then sets rt to 1 on the "ne"
// condition.
// NOTE(review): presumably the flags come from a test of rs that is not
// visible in these lines - confirm.
855 static void emit_set_nz32(u_int rs, u_int rt)
857 //assem_debug("set_nz32\n");
858 if(rs!=rt) emit_mov(rs,rt);
860 emit_cmovne_imm(1,rt);
// Emits rt = 1 on the signed "lt" condition, else 0.
// rt is zeroed up front unless it doubles as one of the sources, in which
// case the zeroing is deferred and done with emit_movimm.
863 static void emit_set_if_less32(u_int rs1, u_int rs2, u_int rt)
865 //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
866 if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
868 if(rs1==rt||rs2==rt) emit_movimm(0,rt);
869 emit_cmovl_imm(1,rt);
// Same shape as emit_set_if_less32, but the final conditional set uses
// emit_cmovb_imm ("cc") instead of the signed "lt" condition.
872 static void emit_set_if_carry32(u_int rs1, u_int rs2, u_int rt)
874 //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
875 if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
877 if(rs1==rt||rs2==rt) emit_movimm(0,rt);
878 emit_cmovb_imm(1,rt);
881 static int can_jump_or_call(const void *a)
883 intptr_t diff = (u_char *)a - out;
884 return (-134217728 <= diff && diff <= 134217727);
// Emits a call to address a.  When a is within +/-128MB of the current
// output position a direct "bl" with a 26-bit word offset is emitted.
887 static void emit_call(const void *a)
889 intptr_t diff = (u_char *)a - out;
890 assem_debug("bl %p (%p+%lx)%s\n", a, out, diff, func_name(a));
892 if (-134217728 <= diff && diff <= 134217727)
893 output_w32(0x94000000 | ((diff >> 2) & 0x03ffffff));
898 static void emit_jmp(const void *a)
900 assem_debug("b %p (%p+%lx)%s\n", a, out, (u_char *)a - out, func_name(a));
901 u_int offset = genjmp(a);
902 output_w32(0x14000000 | offset);
905 static void emit_jne(const void *a)
907 assem_debug("bne %p\n", a);
908 u_int offset = genjmpcc(a);
909 output_w32(0x54000000 | (offset << 5) | COND_NE);
912 static void emit_jeq(const void *a)
914 assem_debug("beq %p\n", a);
915 u_int offset = genjmpcc(a);
916 output_w32(0x54000000 | (offset << 5) | COND_EQ);
919 static void emit_js(const void *a)
921 assem_debug("bmi %p\n", a);
922 u_int offset = genjmpcc(a);
923 output_w32(0x54000000 | (offset << 5) | COND_MI);
926 static void emit_jns(const void *a)
928 assem_debug("bpl %p\n", a);
929 u_int offset = genjmpcc(a);
930 output_w32(0x54000000 | (offset << 5) | COND_PL);
933 static void emit_jl(const void *a)
935 assem_debug("blt %p\n", a);
936 u_int offset = genjmpcc(a);
937 output_w32(0x54000000 | (offset << 5) | COND_LT);
940 static void emit_jge(const void *a)
942 assem_debug("bge %p\n", a);
943 u_int offset = genjmpcc(a);
944 output_w32(0x54000000 | (offset << 5) | COND_GE);
947 static void emit_jno(const void *a)
949 assem_debug("bvc %p\n", a);
950 u_int offset = genjmpcc(a);
951 output_w32(0x54000000 | (offset << 5) | COND_VC);
954 static void emit_jc(const void *a)
956 assem_debug("bcs %p\n", a);
957 u_int offset = genjmpcc(a);
958 output_w32(0x54000000 | (offset << 5) | COND_CS);
961 static void emit_cb(u_int isnz, u_int is64, const void *a, u_int r)
963 assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], a);
964 u_int offset = genjmpcc(a);
965 is64 = is64 ? 0x80000000 : 0;
966 isnz = isnz ? 0x01000000 : 0;
967 output_w32(0x34000000 | is64 | isnz | imm19_rt(offset, r));
// NOTE(review): presumably a 32-bit cbz wrapper around emit_cb - the body is
// not visible here, confirm.
970 static unused void emit_cbz(const void *a, u_int r)
975 static void emit_jmpreg(u_int r)
977 assem_debug("br %s\n", regname64[r]);
978 output_w32(0xd61f0000 | rm_rn_rd(0, r, 0));
981 static void emit_retreg(u_int r)
983 assem_debug("ret %s\n", r == LR ? "" : regname64[r]);
984 output_w32(0xd65f0000 | rm_rn_rd(0, r, 0));
// NOTE(review): presumably a return through LR via emit_retreg - the body is
// not visible here, confirm.
987 static void emit_ret(void)
// adr: loads the address addr into rt as an offset from the current output
// position.  The byte offset must lie within +/-1MB (asserted); its low two
// bits go to bits 30..29 and the remaining 19 bits to bit 5.
992 static void emit_adr(void *addr, u_int rt)
994 intptr_t offset = (u_char *)addr - out;
995 assert(-1048576 <= offset && offset < 1048576);
997 assem_debug("adr x%d,#%#lx\n", rt, offset);
998 output_w32(0x10000000 | ((offset&0x3) << 29) | (((offset>>2)&0x7ffff) << 5) | rt);
// adrp: loads the 4KB page address of addr into rt, relative to the page of
// the current output position.  The page distance must lie within +/-4GB
// (asserted).
// NOTE(review): the encoding line uses the same field layout as emit_adr, so
// offset is presumably reduced to a page count on a line not visible here -
// confirm.
1001 static void emit_adrp(void *addr, u_int rt)
1003 intptr_t offset = ((intptr_t)addr & ~0xfffl) - ((intptr_t)out & ~0xfffl);
1004 assert(-4294967296l <= offset && offset < 4294967296l);
1007 assem_debug("adrp %s,#%#lx(000)\n",regname64[rt],offset);
1008 output_w32(0x90000000 | ((offset&0x3)<<29) | (((offset>>2)&0x7ffff)<<5) | rt);
1011 static void emit_readword_indexed(int offset, u_int rs, u_int rt)
1013 assem_debug("ldur %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1014 assert(-256 <= offset && offset < 256);
1015 output_w32(0xb8400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1018 static void emit_strb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1020 assem_debug("strb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1021 output_w32(0x38204800 | rm_rn_rd(rs2, rs1, rt));
1024 static void emit_strh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1026 assem_debug("strh %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1027 output_w32(0x78204800 | rm_rn_rd(rs2, rs1, rt));
1030 static void emit_str_dualindexed(u_int rs1, u_int rs2, u_int rt)
1032 assem_debug("str %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1033 output_w32(0xb8204800 | rm_rn_rd(rs2, rs1, rt));
1036 static void emit_readdword_dualindexedx8(u_int rs1, u_int rs2, u_int rt)
1038 assem_debug("ldr %s, [%s,%s, uxtw #3]\n",regname64[rt],regname64[rs1],regname[rs2]);
1039 output_w32(0xf8605800 | rm_rn_rd(rs2, rs1, rt));
1041 #define emit_readptr_dualindexedx_ptrlen emit_readdword_dualindexedx8
1043 static void emit_ldrb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1045 assem_debug("ldrb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1046 output_w32(0x38604800 | rm_rn_rd(rs2, rs1, rt));
1049 static void emit_ldrsb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1051 assem_debug("ldrsb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1052 output_w32(0x38a04800 | rm_rn_rd(rs2, rs1, rt));
1055 static void emit_ldrh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1057 assem_debug("ldrh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1058 output_w32(0x78604800 | rm_rn_rd(rs2, rs1, rt));
1061 static void emit_ldrsh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1063 assem_debug("ldrsh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1064 output_w32(0x78a04800 | rm_rn_rd(rs2, rs1, rt));
1067 static void emit_ldr_dualindexed(u_int rs1, u_int rs2, u_int rt)
1069 assem_debug("ldr %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1070 output_w32(0xb8604800 | rm_rn_rd(rs2, rs1, rt));
1073 static void emit_movsbl_indexed(int offset, u_int rs, u_int rt)
1075 assem_debug("ldursb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1076 assert(-256 <= offset && offset < 256);
1077 output_w32(0x38c00000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1080 static void emit_movswl_indexed(int offset, u_int rs, u_int rt)
1082 assem_debug("ldursh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1083 assert(-256 <= offset && offset < 256);
1084 output_w32(0x78c00000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1087 static void emit_movzbl_indexed(int offset, u_int rs, u_int rt)
1089 assem_debug("ldurb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1090 assert(-256 <= offset && offset < 256);
1091 output_w32(0x38400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1094 static void emit_movzwl_indexed(int offset, u_int rs, u_int rt)
1096 assem_debug("ldurh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1097 assert(-256 <= offset && offset < 256);
1098 output_w32(0x78400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
// 32-bit store of rt to [rs + offset].
// Non-negative 4-byte-aligned offsets up to 16380 use the scaled "str" form.
// Otherwise offsets in -256..255 use the unscaled "stur" form.
// NOTE(review): the debug text prints the base with the 32-bit register
// names, unlike the halfword/byte variants which use regname64 - confirm.
1101 static void emit_writeword_indexed(u_int rt, int offset, u_int rs)
1103 if (!(offset & 3) && (u_int)offset <= 16380) {
1104 assem_debug("str %s,[%s+%#x]\n", regname[rt], regname[rs], offset);
1105 output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, rs, rt));
1107 else if (-256 <= offset && offset < 256) {
1108 assem_debug("stur %s,[%s+%#x]\n", regname[rt], regname[rs], offset);
1109 output_w32(0xb8000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
// 16-bit store of rt to [rs + offset].
// Non-negative even offsets up to 8190 use the scaled "strh" form.
// Otherwise offsets in -256..255 use the unscaled "sturh" form.
1115 static void emit_writehword_indexed(u_int rt, int offset, u_int rs)
1117 if (!(offset & 1) && (u_int)offset <= 8190) {
1118 assem_debug("strh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1119 output_w32(0x79000000 | imm12_rn_rd(offset >> 1, rs, rt));
1121 else if (-256 <= offset && offset < 256) {
1122 assem_debug("sturh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1123 output_w32(0x78000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
// 8-bit store of rt to [rs + offset].
// Non-negative offsets below 4096 use the "strb" form.
// Otherwise offsets in -256..255 use the unscaled "sturb" form; since
// 0..255 is already taken by the first branch, only negative offsets get
// this far.
1129 static void emit_writebyte_indexed(u_int rt, int offset, u_int rs)
1131 if ((u_int)offset < 4096) {
1132 assem_debug("strb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1133 output_w32(0x39000000 | imm12_rn_rd(offset, rs, rt));
1135 else if (-256 <= offset && offset < 256) {
1136 assem_debug("sturb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1137 output_w32(0x38000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
1143 static void emit_umull(u_int rs1, u_int rs2, u_int rt)
1145 assem_debug("umull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
1146 output_w32(0x9ba00000 | rm_ra_rn_rd(rs2, WZR, rs1, rt));
1149 static void emit_smull(u_int rs1, u_int rs2, u_int rt)
1151 assem_debug("smull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
1152 output_w32(0x9b200000 | rm_ra_rn_rd(rs2, WZR, rs1, rt));
1155 static void emit_msub(u_int rs1, u_int rs2, u_int rs3, u_int rt)
1157 assem_debug("msub %s,%s,%s,%s\n",regname[rt],regname[rs1],regname[rs2],regname[rs3]);
1158 output_w32(0x1b008000 | rm_ra_rn_rd(rs2, rs3, rs1, rt));
1161 static void emit_sdiv(u_int rs1, u_int rs2, u_int rt)
1163 assem_debug("sdiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
1164 output_w32(0x1ac00c00 | rm_rn_rd(rs2, rs1, rt));
1167 static void emit_udiv(u_int rs1, u_int rs2, u_int rt)
1169 assem_debug("udiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
1170 output_w32(0x1ac00800 | rm_rn_rd(rs2, rs1, rt));
1173 static void emit_clz(u_int rs, u_int rt)
1175 assem_debug("clz %s,%s\n",regname[rt],regname[rs]);
1176 output_w32(0x5ac01000 | rn_rd(rs, rt));
1179 // special case for checking invalid_code
1180 static void emit_cmpmem_indexedsr12_reg(u_int rbase, u_int r, u_int imm)
1182 host_tempreg_acquire();
1183 emit_shrimm(r, 12, HOST_TEMPREG);
1184 assem_debug("ldrb %s,[%s,%s,uxtw]\n",regname[HOST_TEMPREG],regname64[rbase],regname[HOST_TEMPREG]);
1185 output_w32(0x38604800 | rm_rn_rd(HOST_TEMPREG, rbase, HOST_TEMPREG));
1186 emit_cmpimm(HOST_TEMPREG, imm);
1187 host_tempreg_release();
1190 // special for loadlr_assemble, rs2 is destroyed
1191 static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
1193 emit_shl(rs2, shift, rs2);
1194 emit_bic(rs1, rs2, rt);
1197 static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
1199 emit_shr(rs2, shift, rs2);
1200 emit_bic(rs1, rs2, rt);
1203 static void emit_ldst(int is_st, int is64, u_int rt, u_int rn, u_int ofs)
1205 u_int op = 0xb9000000;
1206 unused const char *ldst = is_st ? "st" : "ld";
1207 unused char rp = is64 ? 'x' : 'w';
1208 assem_debug("%sr %c%d,[x%d,#%#x]\n", ldst, rp, rt, rn, ofs);
1209 is64 = is64 ? 1 : 0;
1210 assert((ofs & ((1 << (2+is64)) - 1)) == 0);
1211 ofs = (ofs >> (2+is64));
1212 if (!is_st) op |= 0x00400000;
1213 if (is64) op |= 0x40000000;
1214 output_w32(op | imm12_rn_rd(ofs, rn, rt));
// Emit a register-pair load or store: "ldp"/"stp" rt1, rt2, [rn, #ofs].
//   is_st - non-zero for a store, zero for a load
//   is64  - non-zero for 64-bit (x) registers, zero for 32-bit (w)
//   ofs   - signed byte offset; must be a multiple of the access size and,
//           once scaled, lie in [-64, 63] (both conditions are asserted)
1217 static void emit_ldstp(int is_st, int is64, u_int rt1, u_int rt2, u_int rn, int ofs)
1219 u_int op = 0x29000000;
1220 unused const char *ldst = is_st ? "st" : "ld";
1221 unused char rp = is64 ? 'x' : 'w';
1222 assem_debug("%sp %c%d,%c%d,[x%d,#%#x]\n", ldst, rp, rt1, rp, rt2, rn, ofs);
// normalize to 0/1 so it can be added to the shift amount below
1223 is64 = is64 ? 1 : 0;
1224 assert((ofs & ((1 << (2+is64)) - 1)) == 0);
// convert the byte offset to units of the access size (4 or 8 bytes)
1225 ofs = (ofs >> (2+is64));
1226 assert(-64 <= ofs && ofs <= 63);
1228 if (!is_st) op |= 0x00400000;
1229 if (is64) op |= 0x80000000;
1230 output_w32(op | imm7_rt2_rn_rt(ofs, rt2, rn, rt1));
// Store (is_store != 0) or load (is_store == 0) the host registers named by
// the reglist bitmask, using 64-bit accesses relative to SP starting at
// SSP_CALLEE_REGS.
// NOTE(review): the logic that collects registers into pair[] and advances
// ofs is not visible in this excerpt; presumably registers are transferred
// two at a time with emit_ldstp and a leftover one with emit_ldst - confirm.
1233 static void save_load_regs_all(int is_store, u_int reglist)
// walk the mask from bit 0 upwards until no set bits remain
1237 for (r = 0; reglist; r++, reglist >>= 1) {
1241 emit_ldstp(is_store, 1, pair[0], pair[1], SP, SSP_CALLEE_REGS + ofs);
1247 emit_ldst(is_store, 1, pair[0], SP, SSP_CALLEE_REGS + ofs);
// the bytes used must not exceed SSP_CALLER_REGS
1250 assert(ofs <= SSP_CALLER_REGS);
1253 // Save registers before function call
1254 static void save_regs(u_int reglist)
1256 reglist &= CALLER_SAVE_REGS; // only save the caller-save registers
1257 save_load_regs_all(1, reglist);
1260 // Restore registers after function call
1261 static void restore_regs(u_int reglist)
1263 reglist &= CALLER_SAVE_REGS;
1264 save_load_regs_all(0, reglist);
1267 /* Stubs/epilogue */
// NOTE(review): only the signature is visible in this excerpt; presumably a
// literal-pool hook of the shared assembler interface - confirm against the body.
1269 static void literal_pool(int n)
// NOTE(review): only the signature is visible in this excerpt; presumably the
// companion of literal_pool that branches over an emitted pool - confirm.
1274 static void literal_pool_jumpover(int n)
// Emit the stub used for a jump that leaves the current block.
//   addr   - location of the branch instruction being linked; its top byte
//            must identify a "b" or "b.cond" (asserted)
//   target - 32-bit target address, loaded into register 0 in two halves
// The stub ends with a far jump to dyna_linker.
// NOTE(review): one emitted instruction between the movk and the far jump is
// not visible in this excerpt.
1278 // parsed by get_pointer, find_extjump_insn
1279 static void emit_extjump(u_char *addr, u_int target)
1281 assert(((addr[3]&0xfc)==0x14) || ((addr[3]&0xff)==0x54)); // b or b.cond
// register 0 = target: low 16 bits first, then the high 16 bits
1283 emit_movz(target & 0xffff, 0);
1284 emit_movk_lsl16(target >> 16, 0);
1286 // addr is in the current recompiled block (max 256k)
1287 // offset shouldn't exceed +/-1MB
1289 emit_far_jump(dyna_linker);
// Sanity check on an external-jump stub: asserts that the first instruction
// word is a "movz" into register 0 (the immediate field is masked out).
// NOTE(review): ptr is presumably derived from src; its declaration is not
// visible in this excerpt.
1292 static void check_extjump2(void *src)
1295 assert((ptr[0] & 0xffe0001f) == 0x52800000); // movz r0, #val
// Strategies are tried in this order:
//   1. add/sub of the difference, when it fits a 12-bit immediate or a
//      12-bit immediate shifted left by 12
//   2. bitwise complement of rs (the emitted instruction is not visible in
//      this excerpt)
//   3. xor with an immediate, when rs_val ^ rt_val passes is_rotated_mask
//   4. otherwise a plain emit_movimm of rt_val
1299 // put rt_val into rt, potentially making use of rs with value rs_val
1300 static void emit_movimm_from(u_int rs_val, u_int rs, u_int rt_val, u_int rt)
// unsigned subtraction wraps; the result is then interpreted as signed
1302 int diff = rt_val - rs_val;
1303 if ((-4096 < diff && diff < 4096)
1304 || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff)))
1305 emit_addimm(rs, diff, rt);
1306 else if (rt_val == ~rs_val)
1308 else if (is_rotated_mask(rs_val ^ rt_val))
1309 emit_xorimm(rs, rs_val ^ rt_val, rt);
1311 emit_movimm(rt_val, rt);
// The visible tests mirror emit_movimm_from: a small add/sub difference
// (12-bit, optionally shifted left by 12) or an xor pattern accepted by
// is_rotated_mask.
// NOTE(review): the declaration of diff and one term of the expression are
// not visible in this excerpt.
1314 // return 1 if the above function can do its job cheaply
1315 static int is_similar_value(u_int v1, u_int v2)
1318 return (-4096 < diff && diff < 4096)
1319 || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff))
1321 || is_rotated_mask(v1 ^ v2);
// Load the pointer-sized constant rt_val into the 64-bit register rt.
// Values below 2^32 are delegated to emit_movimm_from, which may derive the
// value from rs (currently holding rs_val). Larger values are built with a
// movz followed by movk instructions, 16 bits at a time.
// NOTE(review): the control flow around the first branch and around the final
// movk (presumably emitted only when bits 48..63 are non-zero) is not visible
// in this excerpt.
1324 static void emit_movimm_from64(u_int rs_val, u_int rs, uintptr_t rt_val, u_int rt)
1326 if (rt_val < 0x100000000ull) {
1327 emit_movimm_from(rs_val, rs, rt_val, rt);
1330 // just move the whole thing. At least on Linux all addresses
1331 // seem to be 48bit, so 3 insns - not great not terrible
// bits 0..15
1332 assem_debug("movz %s,#%#lx\n", regname64[rt], rt_val & 0xffff);
1333 output_w32(0xd2800000 | imm16_rd(rt_val & 0xffff, rt));
// bits 16..31
1334 assem_debug("movk %s,#%#lx,lsl #16\n", regname64[rt], (rt_val >> 16) & 0xffff);
1335 output_w32(0xf2a00000 | imm16_rd((rt_val >> 16) & 0xffff, rt));
// bits 32..47
1336 assem_debug("movk %s,#%#lx,lsl #32\n", regname64[rt], (rt_val >> 32) & 0xffff);
1337 output_w32(0xf2c00000 | imm16_rd((rt_val >> 32) & 0xffff, rt));
// bits 48..63
1339 assem_debug("movk %s,#%#lx,lsl #48\n", regname64[rt], (rt_val >> 48) & 0xffff);
1340 output_w32(0xf2e00000 | imm16_rd((rt_val >> 48) & 0xffff, rt));
// Move the values held in host registers a0 and a1 into registers 0 and 1
// (the first two call arguments) using 64-bit moves, choosing an order that
// does not overwrite a source before it has been read.
// NOTE(review): the first condition and parts of the second branch are not
// visible in this excerpt.
// NOTE(review): a0 and a1 are u_int, so the ">= 0" tests below are always true.
1345 static void pass_args64(u_int a0, u_int a1)
// swap case: a0 goes through register 2 as scratch so that writing register 1
// does not destroy it
1349 emit_mov64(a0,2); emit_mov64(a1,1); emit_mov64(2,0);
1351 else if(a0!=0&&a1==0) {
1353 if (a0>=0) emit_mov64(a0,0);
// general case: skip moves whose source is already the destination
1356 if(a0>=0&&a0!=0) emit_mov64(a0,0);
1357 if(a1>=0&&a1!=1) emit_mov64(a1,1);
// Copy rs to rt, adjusting the value to the width implied by the stub type:
// signed loads use emit_sbfm, stores use emit_ubfm, with bit 7 (byte) or
// bit 15 (halfword) as the upper bound; STOREW is a plain move, skipped when
// rs == rt.
// NOTE(review): the switch header and the remaining cases are not visible in
// this excerpt.
1361 static void loadstore_extend(enum stub_type type, u_int rs, u_int rt)
1364 case LOADB_STUB: emit_sbfm(rs, 7, rt); break;
1366 case STOREB_STUB: emit_ubfm(rs, 7, rt); break;
1367 case LOADH_STUB: emit_sbfm(rs, 15, rt); break;
1369 case STOREH_STUB: emit_ubfm(rs, 15, rt); break;
1371 case STOREW_STUB: if (rs != rt) emit_mov(rs, rt); break;
1376 #include "pcsxmem.h"
1377 //#include "pcsxmem_inline.c"
// Emit the out-of-line code for memory-read stub n.
// The visible code does two things:
//   1. looks up the page entry for the address in mem_rtab and, on one path,
//      performs the load directly from temp2 + rs;
//   2. otherwise calls the jump_handler_read8/16/32 routine matching the stub
//      type, then moves the result from register 0 into rt.
// Both paths end by jumping to stubs[n].retaddr.
// NOTE(review): many lines (declarations, branches, register saves) are not
// visible in this excerpt; the exact control flow should be confirmed.
1379 static void do_readstub(int n)
1381 assem_debug("do_readstub %x\n",start+stubs[n].a*4);
// retarget the branch recorded at stubs[n].addr to `out`
1382 set_jump_target(stubs[n].addr, out);
1383 enum stub_type type = stubs[n].type;
// rs: host register holding the address (it is shifted by 12 for the lookup)
1385 int rs = stubs[n].b;
1386 const struct regstat *i_regs = (void *)stubs[n].c;
1387 u_int reglist = stubs[n].e;
1388 const signed char *i_regmap = i_regs->regmap;
// C1LS/C2LS/LOADLR use the register mapped to FTEMP as destination,
// everything else uses the one mapped to the instruction's rt1
1390 if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) {
1391 rt=get_reg(i_regmap,FTEMP);
1393 rt=get_reg(i_regmap,dops[i].rt1);
1396 int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0;
1397 void *restore_jump = NULL, *handler_jump = NULL;
// scan host registers below HOST_CCREG for one that is not in reglist
1399 for (r = 0; r < HOST_CCREG; r++) {
1400 if (r != EXCLUDE_REG && ((1 << r) & reglist) == 0) {
1405 if(rt>=0&&dops[i].rt1!=0)
1412 if((regs_saved||(reglist&2)==0)&&temp!=1&&rs!=1)
// temp = value stored at &mem_rtab; temp2 = rs >> 12 (page index)
1414 emit_readdword(&mem_rtab,temp);
1415 emit_shrimm(rs,12,temp2);
// temp2 = 64-bit table entry for that page
1416 emit_readdword_dualindexedx8(temp,temp2,temp2);
// temp2 += temp2 (shift left by one) with flags updated
1417 emit_adds64(temp2,temp2,temp2);
1420 if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
// direct load of the requested width from temp2 + rs into rt
1422 case LOADB_STUB: emit_ldrsb_dualindexed(temp2,rs,rt); break;
1423 case LOADBU_STUB: emit_ldrb_dualindexed(temp2,rs,rt); break;
1424 case LOADH_STUB: emit_ldrsh_dualindexed(temp2,rs,rt); break;
1425 case LOADHU_STUB: emit_ldrh_dualindexed(temp2,rs,rt); break;
1426 case LOADW_STUB: emit_ldr_dualindexed(temp2,rs,rt); break;
1432 emit_jmp(0); // jump to reg restore
1435 emit_jmp(stubs[n].retaddr); // return address
// the handler path starts here
1436 set_jump_target(handler_jump, out);
// pick the C handler by access width
1441 if(type==LOADB_STUB||type==LOADBU_STUB)
1442 handler=jump_handler_read8;
1443 if(type==LOADH_STUB||type==LOADHU_STUB)
1444 handler=jump_handler_read16;
1445 if(type==LOADW_STUB)
1446 handler=jump_handler_read32;
// arguments: register 0 = rs (address), register 1 = temp2 (table entry)
1448 pass_args64(rs,temp2);
1449 int cc=get_reg(i_regmap,CCREG);
// register 2 = cycle count (from cc, or from register 2 after the CCREG
// load) plus stubs[n].d
1451 emit_loadreg(CCREG,2);
1452 emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
1453 emit_far_call(handler);
1454 // (no cycle reload after read)
// move the handler's result (register 0) into rt, extended per stub type
1455 if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
1456 loadstore_extend(type,0,rt);
// the earlier "jump to reg restore" lands here
1459 set_jump_target(restore_jump, out);
1460 restore_regs(reglist);
1461 emit_jmp(stubs[n].retaddr);
// Emit an inline read for an address (addr) that is known at compile time.
// get_direct_memhandler() is asked how addr is backed:
//   - a NULL handler means the data can be read directly from host_addr
//     (rs is adjusted from addr to host_addr when they differ);
//   - otherwise a memory handler is called, with register 2 carrying the
//     cycle count plus adj, and the result is moved from register 0 into rt.
// NOTE(review): many lines (declarations, returns, register saves, the
// is_dynamic branches) are not visible in this excerpt.
1464 static void inline_readstub(enum stub_type type, int i, u_int addr,
1465 const signed char regmap[], int target, int adj, u_int reglist)
// rs and rt are both looked up from `target`; rs falls back to a temp register
1467 int rs=get_reg(regmap,target);
1468 int rt=get_reg(regmap,target);
1469 if(rs<0) rs=get_reg_temp(regmap);
1472 uintptr_t host_addr = 0;
1474 int cc=get_reg(regmap,CCREG);
1475 //if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt))
1477 handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr);
1478 if (handler == NULL) {
1479 if(rt<0||dops[i].rt1==0)
1481 if (addr != host_addr)
1482 emit_movimm_from64(addr, rs, host_addr, rs);
// load of the requested width from [rs + 0] into rt
1484 case LOADB_STUB: emit_movsbl_indexed(0,rs,rt); break;
1485 case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break;
1486 case LOADH_STUB: emit_movswl_indexed(0,rs,rt); break;
1487 case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break;
1488 case LOADW_STUB: emit_readword_indexed(0,rs,rt); break;
1493 is_dynamic = pcsxmem_is_handler_dynamic(addr);
// generic handlers, selected by access width
1495 if(type==LOADB_STUB||type==LOADBU_STUB)
1496 handler=jump_handler_read8;
1497 if(type==LOADH_STUB||type==LOADHU_STUB)
1498 handler=jump_handler_read16;
1499 if(type==LOADW_STUB)
1500 handler=jump_handler_read32;
1503 // call a memhandler
1504 if(rt>=0&&dops[i].rt1!=0)
// register 0 = address
1508 emit_movimm(addr,0);
// register 2 = cycle count + adj
1512 emit_loadreg(CCREG,2);
1513 emit_addimm(cc<0?2:cc,adj,2);
// register 1 = mem_rtab entry for this page shifted left by one, built from
// an adrp plus the low 12 bits
1515 uintptr_t l1 = ((uintptr_t *)mem_rtab)[addr>>12] << 1;
1516 emit_adrp((void *)l1, 1);
1517 emit_addimm64(1, l1 & 0xfff, 1);
1520 emit_far_call(do_memhandler_pre);
1522 emit_far_call(handler);
1524 // (no cycle reload after read)
1525 if(rt>=0&&dops[i].rt1!=0)
1526 loadstore_extend(type, 0, rt);
1527 restore_regs(reglist);
// Emit the out-of-line code for memory-write stub n.
// The visible code does two things:
//   1. looks up the page entry for the address in mem_wtab and, on one path,
//      performs the store directly to temp2 + rs;
//   2. otherwise calls jump_handler_write8/16/32 with the table entry in
//      register 3 and the cycle count plus stubs[n].d in register 2, then
//      takes the new cycle count from register 0.
// Both paths end by jumping to stubs[n].retaddr.
// NOTE(review): many lines (declarations, branches, register saves) are not
// visible in this excerpt; the exact control flow should be confirmed.
1530 static void do_writestub(int n)
1532 assem_debug("do_writestub %x\n",start+stubs[n].a*4);
// retarget the branch recorded at stubs[n].addr to `out`
1533 set_jump_target(stubs[n].addr, out);
1534 enum stub_type type=stubs[n].type;
1537 struct regstat *i_regs=(struct regstat *)stubs[n].c;
1538 u_int reglist=stubs[n].e;
1539 signed char *i_regmap=i_regs->regmap;
// the value to store comes from FTEMP for C1LS/C2LS, else from rs2
1541 if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
1542 rt=get_reg(i_regmap,r=FTEMP);
1544 rt=get_reg(i_regmap,r=dops[i].rs2);
1548 int rtmp,temp=-1,temp2,regs_saved=0;
1549 void *restore_jump = NULL, *handler_jump = NULL;
// reglist plus the address and data registers
1550 int reglist2=reglist|(1<<rs)|(1<<rt);
// scan host registers below HOST_CCREG for one that is not in reglist
1551 for (rtmp = 0; rtmp < HOST_CCREG; rtmp++) {
1552 if (rtmp != EXCLUDE_REG && ((1 << rtmp) & reglist) == 0) {
// alternative scan over registers 0..3, skipping rs and rt
1560 for(rtmp=0;rtmp<=3;rtmp++)
1561 if(rtmp!=rs&&rtmp!=rt)
1564 if((regs_saved||(reglist2&8)==0)&&temp!=3&&rs!=3&&rt!=3)
1567 host_tempreg_acquire();
// temp = value stored at &mem_wtab; temp2 = rs >> 12 (page index)
1570 emit_readdword(&mem_wtab,temp);
1571 emit_shrimm(rs,12,temp2);
// temp2 = 64-bit table entry for that page
1572 emit_readdword_dualindexedx8(temp,temp2,temp2);
// temp2 += temp2 (shift left by one) with flags updated
1573 emit_adds64(temp2,temp2,temp2);
// direct store of the requested width to temp2 + rs
1577 case STOREB_STUB: emit_strb_dualindexed(temp2,rs,rt); break;
1578 case STOREH_STUB: emit_strh_dualindexed(temp2,rs,rt); break;
1579 case STOREW_STUB: emit_str_dualindexed(temp2,rs,rt); break;
1584 emit_jmp(0); // jump to reg restore
1587 emit_jmp(stubs[n].retaddr); // return address (invcode check)
// the handler path starts here
1588 set_jump_target(handler_jump, out);
// pick the C handler by access width
1594 case STOREB_STUB: handler=jump_handler_write8; break;
1595 case STOREH_STUB: handler=jump_handler_write16; break;
1596 case STOREW_STUB: handler=jump_handler_write32; break;
// register 3 = table entry
1602 emit_mov64(temp2,3);
1603 host_tempreg_release();
1605 int cc=get_reg(i_regmap,CCREG);
// register 2 = cycle count + stubs[n].d
1607 emit_loadreg(CCREG,2);
1608 emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
1609 // returns new cycle_count
1610 emit_far_call(handler);
// subtract the adjustment again from the returned count (register 0)
1611 emit_addimm(0,-(int)stubs[n].d,cc<0?2:cc);
1613 emit_storereg(CCREG,2);
// the earlier "jump to reg restore" lands here
1615 set_jump_target(restore_jump, out);
1616 restore_regs(reglist);
1617 emit_jmp(stubs[n].retaddr);
// Emit an inline write for an address (addr) that is known at compile time.
// get_direct_memhandler() is asked how addr is backed:
//   - a NULL handler means the data can be written directly to host_addr
//     (rs is adjusted from addr to host_addr when they differ);
//   - otherwise the handler is called between do_memhandler_pre and
//     do_memhandler_post, and the cycle count is taken back from register 0.
// NOTE(review): several lines (declarations, returns, register saves and the
// conditions around the CCREG load/store) are not visible in this excerpt.
1620 static void inline_writestub(enum stub_type type, int i, u_int addr,
1621 const signed char regmap[], int target, int adj, u_int reglist)
// rs: temp register used for the address; rt: register holding the data
1623 int rs = get_reg_temp(regmap);
1624 int rt = get_reg(regmap,target);
1627 uintptr_t host_addr = 0;
1628 void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr);
1629 if (handler == NULL) {
1630 if (addr != host_addr)
1631 emit_movimm_from64(addr, rs, host_addr, rs);
// store of the requested width to [rs + 0]
1633 case STOREB_STUB: emit_writebyte_indexed(rt, 0, rs); break;
1634 case STOREH_STUB: emit_writehword_indexed(rt, 0, rs); break;
1635 case STOREW_STUB: emit_writeword_indexed(rt, 0, rs); break;
1641 // call a memhandler
1643 emit_writeword(rs, &address); // some handlers still need it
// register 0 = value to write, narrowed per store type
1644 loadstore_extend(type, rt, 0);
1646 cc = cc_use = get_reg(regmap, CCREG);
1648 emit_loadreg(CCREG, (cc_use = 2));
// register 2 = cycle count + adj
1649 emit_addimm(cc_use, adj, 2);
1651 emit_far_call(do_memhandler_pre);
1652 emit_far_call(handler);
1653 emit_far_call(do_memhandler_post);
// subtract the adjustment again from the count left in register 0
1654 emit_addimm(0, -adj, cc_use);
1656 emit_storereg(CCREG, cc_use);
1657 restore_regs(reglist);
// Code emitted before a GTE (cop2) operation: stores the registers in
// reglist, runs the cop2 stall check, and leaves a pointer to the cop2 data
// registers in register 0.
// NOTE(review): the pcnt_gte_start call is presumably compiled in only under
// a profiling #ifdef that is not visible in this excerpt.
1662 static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist)
// unlike save_regs, the list is used as given (the caller already masks it)
1664 save_load_regs_all(1, reglist);
1665 cop2_do_stall_check(op, i, i_regs, 0);
1668 emit_far_call(pcnt_gte_start);
1670 // pointer to cop2 regs
// register 0 = FP + offset of psxRegs.CP2D.r[0] from dynarec_local
1671 emit_addimm64(FP, (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local, 0);
// Code emitted after a GTE (cop2) operation: reloads the registers that
// c2op_prologue stored.
// NOTE(review): the pcnt_gte_end call is presumably compiled in only under a
// profiling #ifdef that is not visible in this excerpt.
1674 static void c2op_epilogue(u_int op,u_int reglist)
1678 emit_far_call(pcnt_gte_end);
1680 save_load_regs_all(0, reglist);
// Assemble GTE (cop2) operation i.
// When a handler exists for the opcode, the emitted code stores the live
// caller-saved registers, writes the instruction word to psxRegs.code, calls
// the handler (the no-flags variant when the flags are not needed), and
// reloads the registers.
// NOTE(review): some lines (closing braces, the body of the NDHACK branch)
// are not visible in this excerpt.
1683 static void c2op_assemble(int i, const struct regstat *i_regs)
// the low 6 bits of the instruction word select the GTE operation
1685 u_int c2op=source[i]&0x3f;
1686 u_int hr,reglist_full=0,reglist;
1687 int need_flags,need_ir;
// collect every host register that currently has a mapping
1688 for(hr=0;hr<HOST_REGS;hr++) {
1689 if(i_regs->regmap[hr]>=0) reglist_full|=1<<hr;
1691 reglist=reglist_full&CALLER_SAVE_REGS;
1693 if (gte_handlers[c2op]!=NULL) {
// bit 63 of gte_unneeded set means the flags are not needed afterwards
1694 need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection works
// need_ir is set unless all bits of mask 0xe00 are marked unneeded
1695 need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00;
1696 assem_debug("gte op %08x, unneeded %016lx, need_flags %d, need_ir %d\n",
1697 source[i],gte_unneeded[i+1],need_flags,need_ir);
1698 if(HACK_ENABLED(NDHACK_GTE_NO_FLAGS))
1700 //int shift = (source[i] >> 19) & 1;
1701 //int lm = (source[i] >> 10) & 1;
1705 c2op_prologue(c2op, i, i_regs, reglist);
1706 emit_movimm(source[i],1); // opcode
1707 emit_writeword(1,&psxRegs.code);
1708 emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]);
1711 c2op_epilogue(c2op,reglist);
// Emit the value fix-up for a write to cop2 control register 31, following
// the reference logic in the two commented lines below: mask the value, then
// set bit 31 if any of the bits in 0x7f87e000 is set.
//   sl   - host register holding the source value
//   temp - host register receiving the result
// The statement order matters: the condition produced by emit_testimm is
// consumed by emit_cmovne_reg. The function currently ends in assert(0)
// because the sequence has not been tested.
1715 static void c2op_ctc2_31_assemble(signed char sl, signed char temp)
1717 //value = value & 0x7ffff000;
1718 //if (value & 0x7f87e000) value |= 0x80000000;
// test (sl & 0x7fffe000) against 0xff87ffff; the combined mask is 0x7f87e000
1719 emit_andimm(sl, 0x7fffe000, temp);
1720 emit_testimm(temp, 0xff87ffff);
// temp = masked value
1721 emit_andimm(sl, 0x7ffff000, temp);
1722 host_tempreg_acquire();
// HOST_TEMPREG = temp with bit 31 set; selected when the test was non-zero
1723 emit_orimm(temp, 0x80000000, HOST_TEMPREG);
1724 emit_cmovne_reg(HOST_TEMPREG, temp);
1725 host_tempreg_release();
1726 assert(0); // testing needed
1729 static void do_mfc2_31_one(u_int copr,signed char temp)
1731 emit_readshword(®_cop2d[copr],temp);
1732 emit_bicsar_imm(temp,31,temp);
1733 emit_cmpimm(temp,0xf80);
1734 emit_csinvle_reg(temp,WZR,temp); // if (temp > 0xf80) temp = ~0;
1735 emit_andimm(temp,0xf80,temp);
// Assemble a read of cop2 data register 29: the values of registers 9, 10
// and 11 are each reduced by do_mfc2_31_one (result within mask 0xf80) and
// combined into tl as (r9 >> 7) | (r10 >> 2) | (r11 << 3); the combined value
// is also written back to reg_cop2d[29].
//   tl   - host register receiving the result
//   temp - scratch host register; HOST_TEMPREG is acquired as a substitute
//          (NOTE(review): the condition guarding the acquire is not visible
//          in this excerpt - presumably temp < 0)
// Fix: the address operand had been mangled into "(R)_cop2d" (a mis-decoded
// "&reg" HTML entity); it is restored to &reg_cop2d[29].
1738 static void c2op_mfc2_29_assemble(signed char tl, signed char temp)
1741 host_tempreg_acquire();
1742 temp = HOST_TEMPREG;
1744 do_mfc2_31_one(9,temp);
1745 emit_shrimm(temp,7,tl);
1746 do_mfc2_31_one(10,temp);
1747 emit_orrshr_imm(temp,2,tl);
1748 do_mfc2_31_one(11,temp);
1749 emit_orrshl_imm(temp,3,tl);
1750 emit_writeword(tl,&reg_cop2d[29]);
// release the scratch register only if it was acquired above
1752 if (temp == HOST_TEMPREG)
1753 host_tempreg_release();
// Assemble MULT/MULTU/DIV/DIVU for instruction i.
// With both source registers non-zero (rs1 && rs2):
//   - multiply: a 32x32->64 multiply goes into the register mapped to HIREG,
//     which is then shifted right by 32 to leave the high half;
//   - divide: quotient -> LOREG via sdiv/udiv, remainder -> HIREG via msub,
//     then the quotient is patched for a zero divisor.
// When a source is the zero register, the results are emitted as constants or
// simple moves instead.
// NOTE(review): several lines (case labels, else branches, the move of the
// low half into LOREG) are not visible in this excerpt.
1756 static void multdiv_assemble_arm64(int i, const struct regstat *i_regs)
1762 if(dops[i].rs1&&dops[i].rs2)
1764 switch(dops[i].opcode2)
1769 signed char m1=get_reg(i_regs->regmap,dops[i].rs1);
1770 signed char m2=get_reg(i_regs->regmap,dops[i].rs2);
1771 signed char hi=get_reg(i_regs->regmap,HIREG);
1772 signed char lo=get_reg(i_regs->regmap,LOREG);
1778 if(dops[i].opcode2==0x18) // MULT
1779 emit_smull(m1,m2,hi);
1781 emit_umull(m1,m2,hi);
// keep only the upper 32 bits of the product in hi
1784 emit_shrimm64(hi,32,hi);
1790 signed char numerator=get_reg(i_regs->regmap,dops[i].rs1);
1791 signed char denominator=get_reg(i_regs->regmap,dops[i].rs2);
1792 signed char quotient=get_reg(i_regs->regmap,LOREG);
1793 signed char remainder=get_reg(i_regs->regmap,HIREG);
1794 assert(numerator>=0);
1795 assert(denominator>=0);
1796 assert(quotient>=0);
1797 assert(remainder>=0);
1799 if (dops[i].opcode2 == 0x1A) // DIV
1800 emit_sdiv(numerator,denominator,quotient);
1802 emit_udiv(numerator,denominator,quotient);
// remainder = numerator - quotient * denominator
1803 emit_msub(quotient,denominator,numerator,remainder);
1805 // div 0 quotient (remainder is already correct)
1806 host_tempreg_acquire();
// replacement quotient: for DIV a value derived from the numerator's sign
// (0 minus numerator >> 31, presumably giving +1 or -1 - confirm),
// otherwise all ones
1807 if (dops[i].opcode2 == 0x1A) // DIV
1808 emit_sub_asrimm(0,numerator,31,HOST_TEMPREG);
1810 emit_movimm(~0,HOST_TEMPREG);
// select the replacement only when denominator == 0
1811 emit_test(denominator,denominator);
1812 emit_cmoveq_reg(HOST_TEMPREG,quotient);
1813 host_tempreg_release();
// at least one source is the zero register from here on
1822 signed char hr=get_reg(i_regs->regmap,HIREG);
1823 signed char lr=get_reg(i_regs->regmap,LOREG);
1824 if ((dops[i].opcode2==0x1A || dops[i].opcode2==0x1B) && dops[i].rs2==0) // div 0
1827 signed char numerator = get_reg(i_regs->regmap, dops[i].rs1);
1828 assert(numerator >= 0);
// division by the zero register: HI = numerator
1830 emit_mov(numerator,hr);
1832 if (dops[i].opcode2 == 0x1A) // DIV
1833 emit_sub_asrimm(0,numerator,31,lr);
1839 if (hr >= 0) emit_zeroreg(hr);
1840 if (lr >= 0) emit_movimm(~0,lr);
1845 // Multiply by zero is zero.
1846 if (hr >= 0) emit_zeroreg(hr);
1847 if (lr >= 0) emit_zeroreg(lr);
1851 #define multdiv_assemble multdiv_assemble_arm64
// Emit a jump to the code for the address held in host register rs; the
// visible part calls ndrc_get_addr_ht.
// NOTE(review): the argument setup before the call and the jump after it are
// not visible in this excerpt.
1853 static void do_jump_vaddr(u_int rs)
1857 emit_far_call(ndrc_get_addr_ht);
1861 static void do_preload_rhash(u_int r) {
1862 // Don't need this for ARM. On x86, this puts the value 0xf8 into the
1863 // register. On ARM the hash can be done with a single instruction (below)
1866 static void do_preload_rhtbl(u_int ht) {
1867 emit_addimm64(FP, (u_char *)&mini_ht - (u_char *)&dynarec_local, ht);
1870 static void do_rhash(u_int rs,u_int rh) {
1871 emit_andimm(rs, 0xf8, rh);
1874 static void do_miniht_load(int ht, u_int rh) {
1875 emit_add64(ht, rh, ht);
1876 emit_ldst(0, 0, rh, ht, 0);
// Emit the jump through the mini hash table entry addressed by ht.
// The visible part loads the 64-bit value at [ht + 8] back into ht.
// NOTE(review): the comparison, the conditional branch recorded in jaddr and
// the final jump are not visible in this excerpt.
1879 static void do_miniht_jump(u_int rs, u_int rh, u_int ht) {
1885 set_jump_target(jaddr, out);
1886 assem_debug("ldr %s,[%s,#8]\n",regname64[ht], regname64[ht]);
// the immediate is encoded in 8-byte units, hence 8 >> 3
1887 output_w32(0xf9400000 | imm12_rn_rd(8 >> 3, ht, ht));
// Emit code that records return_address in the mini hash table slot selected
// by (return_address & 0xFF) >> 3: word [0] receives the 32-bit address held
// in rt, the 64-bit field [1] receives the value in temp.
// NOTE(review): the instruction that loads temp is not visible in this
// excerpt.
1891 // parsed by set_jump_target?
1892 static void do_miniht_insert(u_int return_address,u_int rt,int temp) {
// rt = return_address, built from its high and low 16-bit halves
1893 emit_movz_lsl16((return_address>>16)&0xffff,rt);
1894 emit_movk(return_address&0xffff,rt);
1895 add_to_linker(out,return_address,1);
1897 emit_writedword(temp,&mini_ht[(return_address&0xFF)>>3][1]);
1898 emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]);
// Make newly written code in [start, end) visible to instruction fetch:
// cleans the data cache and invalidates the instruction cache line by line,
// skipping each step when the corresponding CTR_EL0 bit (28 or 29) is set.
// The line sizes are kept in static variables as the smallest values seen
// across calls.
// NOTE(review): closing braces are not visible in this excerpt, so the exact
// nesting of the "dsb ish" barriers should be confirmed.
1901 static void clear_cache_arm64(char *start, char *end)
1903 // Don't rely on GCC's __clear_cache implementation, as it caches
1904 // icache/dcache cache line sizes, that can vary between cores on
1905 // big.LITTLE architectures.
1906 uint64_t addr, ctr_el0;
1907 static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
1908 size_t isize, dsize;
// read the cache type register of the core currently running this code
1910 __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
// line sizes in bytes: 4 << field, from bits [3:0] (icache) and [19:16] (dcache)
1911 isize = 4 << ((ctr_el0 >> 0) & 0xf);
1912 dsize = 4 << ((ctr_el0 >> 16) & 0xf);
1914 // use the global minimum cache line size
1915 icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
1916 dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
1918 /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
1919 not required for instruction to data coherence. */
1920 if ((ctr_el0 & (1 << 28)) == 0x0) {
// start from the cache line containing `start`
1921 addr = (uint64_t)start & ~(uint64_t)(dsize - 1);
1922 for (; addr < (uint64_t)end; addr += dsize)
1923 // use "civac" instead of "cvau", as this is the suggested workaround for
1924 // Cortex-A53 errata 819472, 826319, 827319 and 824069.
1925 __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
1927 __asm__ volatile("dsb ish" : : : "memory");
1929 /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
1930 Unification is not required for instruction to data coherence. */
1931 if ((ctr_el0 & (1 << 29)) == 0x0) {
// start from the cache line containing `start`
1932 addr = (uint64_t)start & ~(uint64_t)(isize - 1);
1933 for (; addr < (uint64_t)end; addr += isize)
1934 __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
1936 __asm__ volatile("dsb ish" : : : "memory");
1939 __asm__ volatile("isb" : : : "memory");
// Fills every slot of ndrc->tramp.ops with a two-instruction trampoline:
//   ldr x17, <literal>   - literal located `diff` bytes after the instruction
//   br  x17
// where diff is the distance from tramp.ops to tramp.f. The writes are
// bracketed by start_tcache_write/end_tcache_write.
// NOTE(review): the declaration of the loop index is not visible in this
// excerpt.
1942 // CPU-architecture-specific initialization
1943 static void arch_init(void)
1945 uintptr_t diff = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops;
1946 struct tramp_insns *ops = ndrc->tramp.ops;
// the ldr literal offset is encoded in words, so diff must be 4-byte aligned
1948 assert(!(diff & 3));
1949 start_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops));
1950 for (i = 0; i < ARRAY_SIZE(ndrc->tramp.ops); i++) {
1951 ops[i].ldr = 0x58000000 | imm19_rt(diff >> 2, 17); // ldr x17, [=val]
1952 ops[i].br = 0xd61f0000 | rm_rn_rd(0, 17, 0); // br x17
1954 end_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops));
1957 // vim:shiftwidth=2:expandtab