From: Paul Cercueil Date: Sat, 11 Feb 2023 09:38:17 +0000 (+0000) Subject: Update Lightrec 2023-02-08 (#715) X-Git-Tag: r24l~336 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9259d7486618d69721fee743c3e4d0b5c83805fe;p=pcsx_rearmed.git Update Lightrec 2023-02-08 (#715) * git subrepo pull --force deps/lightrec subrepo: subdir: "deps/lightrec" merged: "3ff589bcb7" upstream: origin: "https://github.com/pcercuei/lightrec.git" branch: "master" commit: "3ff589bcb7" git-subrepo: version: "0.4.3" origin: "https://github.com/ingydotnet/git-subrepo.git" commit: "2f68596" * lightrec: Add new C files to the Makefile Lightrec v0.7 added a "constprop.c" source file which we need to compile as well. Signed-off-by: Paul Cercueil --------- Signed-off-by: Paul Cercueil --- diff --git a/Makefile b/Makefile index e694818d..7d0757d8 100644 --- a/Makefile +++ b/Makefile @@ -113,6 +113,7 @@ OBJS += deps/lightning/lib/jit_disasm.o \ deps/lightning/lib/jit_size.o \ deps/lightning/lib/lightning.o \ deps/lightrec/blockcache.o \ + deps/lightrec/constprop.o \ deps/lightrec/disassembler.o \ deps/lightrec/emitter.o \ deps/lightrec/interpreter.o \ diff --git a/deps/lightrec/.gitrepo b/deps/lightrec/.gitrepo index 0941dea9..6e8794f0 100644 --- a/deps/lightrec/.gitrepo +++ b/deps/lightrec/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/pcercuei/lightrec.git branch = master - commit = 4e55a15656deb7b2529546af114668fb5dc6870e - parent = fc7764f123b6445060e56dd27152fffefdec9404 + commit = 3ff589bcb7d52b3a091fe0b922ba02a0b1a7f095 + parent = aced3eb3fcaa0fe13c44c4dd196cdab42555fd98 method = merge cmdver = 0.4.3 diff --git a/deps/lightrec/CMakeLists.txt b/deps/lightrec/CMakeLists.txt index aa8440b6..12da14ea 100644 --- a/deps/lightrec/CMakeLists.txt +++ b/deps/lightrec/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.0) -project(lightrec LANGUAGES C VERSION 0.5) +project(lightrec LANGUAGES C VERSION 0.7) set(BUILD_SHARED_LIBS ON CACHE BOOL "Build 
shared libraries") if (NOT BUILD_SHARED_LIBS) @@ -29,6 +29,7 @@ set(HAS_DEFAULT_ELM ${CMAKE_COMPILER_IS_GNUCC}) list(APPEND LIGHTREC_SOURCES blockcache.c + constprop.c emitter.c interpreter.c lightrec.c @@ -38,6 +39,7 @@ list(APPEND LIGHTREC_SOURCES ) list(APPEND LIGHTREC_HEADERS blockcache.h + constprop.h debug.h disassembler.h emitter.h @@ -52,7 +54,7 @@ list(APPEND LIGHTREC_HEADERS option(ENABLE_FIRST_PASS "Run the interpreter as first-pass optimization" ON) -option(ENABLE_THREADED_COMPILER "Enable threaded compiler" ON) +option(ENABLE_THREADED_COMPILER "Enable threaded compiler" OFF) if (ENABLE_THREADED_COMPILER) list(APPEND LIGHTREC_SOURCES recompiler.c reaper.c) @@ -99,7 +101,7 @@ if (ENABLE_THREADED_COMPILER) target_link_libraries(${PROJECT_NAME} PRIVATE ${PTHREAD_LIBRARIES}) endif (ENABLE_THREADED_COMPILER) -option(ENABLE_CODE_BUFFER "Enable external code buffer" OFF) +option(ENABLE_CODE_BUFFER "Enable external code buffer" ON) if (ENABLE_CODE_BUFFER) target_sources(${PROJECT_NAME} PRIVATE tlsf/tlsf.c) target_include_directories(${PROJECT_NAME} PRIVATE tlsf) diff --git a/deps/lightrec/constprop.c b/deps/lightrec/constprop.c new file mode 100644 index 00000000..353f42f1 --- /dev/null +++ b/deps/lightrec/constprop.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +/* + * Copyright (C) 2022 Paul Cercueil + */ + +#include "constprop.h" +#include "disassembler.h" +#include "lightrec-private.h" + +#include +#include + +static u32 get_min_value(const struct constprop_data *d) +{ + /* Min value: all sign bits to 1, all unknown bits but MSB to 0 */ + return (d->value & d->known) | d->sign | (~d->known & BIT(31)); +} + +static u32 get_max_value(const struct constprop_data *d) +{ + /* Max value: all sign bits to 0, all unknown bits to 1 */ + return ((d->value & d->known) | ~d->known) & ~d->sign; +} + +static u32 lightrec_same_sign(const struct constprop_data *d1, + const struct constprop_data *d2) +{ + u32 min1, min2, max1, max2, a, b, c, d; + + min1 
= get_min_value(d1); + max1 = get_max_value(d1); + min2 = get_min_value(d2); + max2 = get_max_value(d2); + + a = min1 + min2; + b = min1 + max2; + c = max1 + min2; + d = max1 + max2; + + return ((a & b & c & d) | (~a & ~b & ~c & ~d)) & BIT(31); +} + +static u32 lightrec_get_sign_mask(const struct constprop_data *d) +{ + u32 imm; + + if (d->sign) + return d->sign; + + imm = (d->value & BIT(31)) ? d->value : ~d->value; + imm = ~(imm & d->known); + if (imm) + imm = 32 - clz32(imm); + + return imm < 32 ? GENMASK(31, imm) : 0; +} + +static void lightrec_propagate_addi(u32 rs, u32 rd, + const struct constprop_data *d, + struct constprop_data *v) +{ + u32 end, bit, sum, min, mask, imm, value; + struct constprop_data result = { + .value = v[rd].value, + .known = v[rd].known, + .sign = v[rd].sign, + }; + bool carry = false; + + /* clear unknown bits to ease processing */ + v[rs].value &= v[rs].known; + value = d->value & d->known; + + mask = ~(lightrec_get_sign_mask(d) & lightrec_get_sign_mask(&v[rs])); + end = mask ? 32 - clz32(mask) : 0; + + for (bit = 0; bit < 32; bit++) { + if (v[rs].known & d->known & BIT(bit)) { + /* the bits are known - compute the resulting bit and + * the carry */ + sum = ((u32)carry << bit) + (v[rs].value & BIT(bit)) + + (value & BIT(bit)); + + if (sum & BIT(bit)) + result.value |= BIT(bit); + else + result.value &= ~BIT(bit); + + result.known |= BIT(bit); + result.sign &= ~BIT(bit); + carry = sum & BIT(bit + 1); + continue; + } + + if (bit >= end) { + /* We're past the last significant bits of the values + * (extra sign bits excepted). + * The destination register will be sign-extended + * starting from here (if no carry) or from the next + * bit (if carry). + * If the source registers are not sign-extended and we + * have no carry, the algorithm is done here. 
*/ + + if ((v[rs].sign | d->sign) & BIT(bit)) { + mask = GENMASK(31, bit); + + if (lightrec_same_sign(&v[rs], d)) { + /* Theorical minimum and maximum values + * have the same sign; therefore the + * sign bits are known. */ + min = get_min_value(&v[rs]) + + get_min_value(d); + result.value = (min & mask) + | (result.value & ~mask); + result.known |= mask << carry; + result.sign = 0; + } else { + /* min/max have different signs. */ + result.sign = mask << 1; + result.known &= ~mask; + } + break; + } else if (!carry) { + /* Past end bit, no carry; we're done here. */ + break; + } + } + + result.known &= ~BIT(bit); + result.sign &= ~BIT(bit); + + /* Found an unknown bit in one of the registers. + * If the carry and the bit in the other register are both zero, + * we can continue the algorithm. */ + if (!carry && (((d->known & ~value) + | (v[rs].known & ~v[rs].value)) & BIT(bit))) + continue; + + /* We have an unknown bit in one of the source registers, and we + * may generate a carry: there's nothing to do. Everything from + * this bit till the next known 0 bit or sign bit will be marked + * as unknown. The algorithm can then restart at the following + * bit. */ + + imm = (v[rs].known & d->known & ~v[rs].value & ~value) + | v[rs].sign | d->sign; + + imm &= GENMASK(31, bit); + imm = imm ? ctz32(imm) : 31; + mask = GENMASK(imm, bit); + result.known &= ~mask; + result.sign &= ~mask; + + bit = imm; + carry = false; + } + + v[rd] = result; +} + +static void lightrec_propagate_sub(u32 rs, u32 rt, u32 rd, + struct constprop_data *v) +{ + struct constprop_data d = { + .value = ~v[rt].value, + .known = v[rt].known, + .sign = v[rt].sign, + }; + u32 imm, mask, bit; + + /* Negate the known Rt value, then propagate as a regular ADD. */ + + for (bit = 0; bit < 32; bit++) { + if (!(d.known & BIT(bit))) { + /* Unknown bit - mark bits unknown up to the next known 0 */ + + imm = (d.known & ~d.value) | d.sign; + imm &= GENMASK(31, bit); + imm = imm ? 
ctz32(imm) : 31; + mask = GENMASK(imm, bit); + d.known &= ~mask; + d.sign &= ~mask; + break; + } + + if (!(d.value & BIT(bit))) { + /* Bit is 0: we can set our carry, and the algorithm is done. */ + d.value |= BIT(bit); + break; + } + + /* Bit is 1 - set to 0 and continue algorithm */ + d.value &= ~BIT(bit); + } + + lightrec_propagate_addi(rs, rd, &d, v); +} + +static void lightrec_propagate_slt(u32 rs, u32 rd, bool is_signed, + const struct constprop_data *d, + struct constprop_data *v) +{ + unsigned int bit; + + if (is_signed && (v[rs].known & d->known + & (v[rs].value ^ d->value) & BIT(31))) { + /* If doing a signed comparison and the two bits 31 are known + * to be opposite, we can deduce the value. */ + v[rd].value = v[rs].value >> 31; + v[rd].known = 0xffffffff; + v[rd].sign = 0; + return; + } + + for (bit = 32; bit > 0; bit--) { + if (!(v[rs].known & d->known & BIT(bit - 1))) { + /* One bit is unknown and we cannot figure out which + * value is smaller. We still know that the upper 31 + * bits are zero. */ + v[rd].value = 0; + v[rd].known = 0xfffffffe; + v[rd].sign = 0; + break; + } + + /* The two bits are equal - continue to the next bit. */ + if (~(v[rs].value ^ d->value) & BIT(bit - 1)) + continue; + + /* The two bits aren't equal; we can therefore deduce which + * value is smaller. 
*/ + v[rd].value = !(v[rs].value & BIT(bit - 1)); + v[rd].known = 0xffffffff; + v[rd].sign = 0; + break; + } + + if (bit == 0) { + /* rs == rt and all bits are known */ + v[rd].value = 0; + v[rd].known = 0xffffffff; + v[rd].sign = 0; + } +} + +void lightrec_consts_propagate(const struct opcode *list, + unsigned int idx, + struct constprop_data *v) +{ + union code c; + u32 imm; + + if (idx == 0) + return; + + /* Register $zero is always, well, zero */ + v[0].value = 0; + v[0].sign = 0; + v[0].known = 0xffffffff; + + if (op_flag_sync(list[idx].flags)) { + memset(&v[1], 0, sizeof(*v) * 31); + return; + } + + if (idx > 1 && !op_flag_sync(list[idx - 1].flags)) { + c = list[idx - 2].c; + + switch (c.i.op) { + case OP_BNE: + /* After a BNE $zero + delay slot, we know that the + * branch wasn't taken, and therefore the other register + * is zero. */ + if (c.i.rs == 0) { + v[c.i.rt].value = 0; + v[c.i.rt].sign = 0; + v[c.i.rt].known = 0xffffffff; + } else if (c.i.rt == 0) { + v[c.i.rs].value = 0; + v[c.i.rs].sign = 0; + v[c.i.rs].known = 0xffffffff; + } + break; + case OP_BLEZ: + v[c.i.rs].value &= ~BIT(31); + v[c.i.rs].known |= BIT(31); + fallthrough; + case OP_BEQ: + /* TODO: handle non-zero? */ + break; + case OP_REGIMM: + switch (c.r.rt) { + case OP_REGIMM_BLTZ: + case OP_REGIMM_BLTZAL: + v[c.i.rs].value &= ~BIT(31); + v[c.i.rs].known |= BIT(31); + break; + case OP_REGIMM_BGEZ: + case OP_REGIMM_BGEZAL: + v[c.i.rs].value |= BIT(31); + v[c.i.rs].known |= BIT(31); + /* TODO: handle non-zero? 
*/ + break; + } + break; + default: + break; + } + } + + c = list[idx - 1].c; + + switch (c.i.op) { + case OP_SPECIAL: + switch (c.r.op) { + case OP_SPECIAL_SLL: + v[c.r.rd].value = v[c.r.rt].value << c.r.imm; + v[c.r.rd].known = (v[c.r.rt].known << c.r.imm) + | (BIT(c.r.imm) - 1); + v[c.r.rd].sign = v[c.r.rt].sign << c.r.imm; + break; + + case OP_SPECIAL_SRL: + v[c.r.rd].value = v[c.r.rt].value >> c.r.imm; + v[c.r.rd].known = (v[c.r.rt].known >> c.r.imm) + | (BIT(c.r.imm) - 1 << 32 - c.r.imm); + v[c.r.rd].sign = c.r.imm ? 0 : v[c.r.rt].sign; + break; + + case OP_SPECIAL_SRA: + v[c.r.rd].value = (s32)v[c.r.rt].value >> c.r.imm; + v[c.r.rd].known = (s32)v[c.r.rt].known >> c.r.imm; + v[c.r.rd].sign = (s32)v[c.r.rt].sign >> c.r.imm; + break; + + case OP_SPECIAL_SLLV: + if ((v[c.r.rs].known & 0x1f) == 0x1f) { + imm = v[c.r.rs].value & 0x1f; + v[c.r.rd].value = v[c.r.rt].value << imm; + v[c.r.rd].known = (v[c.r.rt].known << imm) + | (BIT(imm) - 1); + v[c.r.rd].sign = v[c.r.rt].sign << imm; + } else { + v[c.r.rd].known = 0; + v[c.r.rd].sign = 0; + } + break; + + case OP_SPECIAL_SRLV: + if ((v[c.r.rs].known & 0x1f) == 0x1f) { + imm = v[c.r.rs].value & 0x1f; + v[c.r.rd].value = v[c.r.rt].value >> imm; + v[c.r.rd].known = (v[c.r.rt].known >> imm) + | (BIT(imm) - 1 << 32 - imm); + if (imm) + v[c.r.rd].sign = 0; + } else { + v[c.r.rd].known = 0; + v[c.r.rd].sign = 0; + } + break; + + case OP_SPECIAL_SRAV: + if ((v[c.r.rs].known & 0x1f) == 0x1f) { + imm = v[c.r.rs].value & 0x1f; + v[c.r.rd].value = (s32)v[c.r.rt].value >> imm; + v[c.r.rd].known = (s32)v[c.r.rt].known >> imm; + v[c.r.rd].sign = (s32)v[c.r.rt].sign >> imm; + } else { + v[c.r.rd].known = 0; + v[c.r.rd].sign = 0; + } + break; + + case OP_SPECIAL_ADD: + case OP_SPECIAL_ADDU: + if (is_known_zero(v, c.r.rs)) + v[c.r.rd] = v[c.r.rt]; + else if (is_known_zero(v, c.r.rt)) + v[c.r.rd] = v[c.r.rs]; + else + lightrec_propagate_addi(c.r.rs, c.r.rd, &v[c.r.rt], v); + break; + + case OP_SPECIAL_SUB: + case OP_SPECIAL_SUBU: + 
if (c.r.rs == c.r.rt) { + v[c.r.rd].value = 0; + v[c.r.rd].known = 0xffffffff; + v[c.r.rd].sign = 0; + } else { + lightrec_propagate_sub(c.r.rs, c.r.rt, c.r.rd, v); + } + break; + + case OP_SPECIAL_AND: + v[c.r.rd].known = (v[c.r.rt].known & v[c.r.rs].known) + | (~v[c.r.rt].value & v[c.r.rt].known) + | (~v[c.r.rs].value & v[c.r.rs].known); + v[c.r.rd].value = v[c.r.rt].value & v[c.r.rs].value & v[c.r.rd].known; + v[c.r.rd].sign = v[c.r.rt].sign & v[c.r.rs].sign; + break; + + case OP_SPECIAL_OR: + v[c.r.rd].known = (v[c.r.rt].known & v[c.r.rs].known) + | (v[c.r.rt].value & v[c.r.rt].known) + | (v[c.r.rs].value & v[c.r.rs].known); + v[c.r.rd].value = (v[c.r.rt].value | v[c.r.rs].value) & v[c.r.rd].known; + v[c.r.rd].sign = v[c.r.rt].sign & v[c.r.rs].sign; + break; + + case OP_SPECIAL_XOR: + v[c.r.rd].value = v[c.r.rt].value ^ v[c.r.rs].value; + v[c.r.rd].known = v[c.r.rt].known & v[c.r.rs].known; + v[c.r.rd].sign = v[c.r.rt].sign & v[c.r.rs].sign; + break; + + case OP_SPECIAL_NOR: + v[c.r.rd].known = (v[c.r.rt].known & v[c.r.rs].known) + | (v[c.r.rt].value & v[c.r.rt].known) + | (v[c.r.rs].value & v[c.r.rs].known); + v[c.r.rd].value = ~(v[c.r.rt].value | v[c.r.rs].value) & v[c.r.rd].known; + v[c.r.rd].sign = v[c.r.rt].sign & v[c.r.rs].sign; + break; + + case OP_SPECIAL_SLT: + case OP_SPECIAL_SLTU: + lightrec_propagate_slt(c.r.rs, c.r.rd, + c.r.op == OP_SPECIAL_SLT, + &v[c.r.rt], v); + break; + + case OP_SPECIAL_MULT: + case OP_SPECIAL_MULTU: + case OP_SPECIAL_DIV: + case OP_SPECIAL_DIVU: + if (OPT_FLAG_MULT_DIV && c.r.rd) { + v[c.r.rd].known = 0; + v[c.r.rd].sign = 0; + } + if (OPT_FLAG_MULT_DIV && c.r.imm) { + v[c.r.imm].known = 0; + v[c.r.imm].sign = 0; + } + break; + + case OP_SPECIAL_MFLO: + case OP_SPECIAL_MFHI: + v[c.r.rd].known = 0; + v[c.r.rd].sign = 0; + break; + default: + break; + } + break; + + case OP_META_MULT2: + case OP_META_MULTU2: + if (OPT_FLAG_MULT_DIV && c.r.rd) { + if (c.r.op < 32) { + v[c.r.rd].value = v[c.r.rs].value << c.r.op; + 
v[c.r.rd].known = (v[c.r.rs].known << c.r.op) + | (BIT(c.r.op) - 1); + v[c.r.rd].sign = v[c.r.rs].sign << c.r.op; + } else { + v[c.r.rd].value = 0; + v[c.r.rd].known = 0xffffffff; + v[c.r.rd].sign = 0; + } + } + + if (OPT_FLAG_MULT_DIV && c.r.imm) { + if (c.r.op >= 32) { + v[c.r.imm].value = v[c.r.rs].value << c.r.op - 32; + v[c.r.imm].known = (v[c.r.rs].known << c.r.op - 32) + | (BIT(c.r.op - 32) - 1); + v[c.r.imm].sign = v[c.r.rs].sign << c.r.op - 32; + } else if (c.i.op == OP_META_MULT2) { + v[c.r.imm].value = (s32)v[c.r.rs].value >> 32 - c.r.op; + v[c.r.imm].known = (s32)v[c.r.rs].known >> 32 - c.r.op; + v[c.r.imm].sign = (s32)v[c.r.rs].sign >> 32 - c.r.op; + } else { + v[c.r.imm].value = v[c.r.rs].value >> 32 - c.r.op; + v[c.r.imm].known = v[c.r.rs].known >> 32 - c.r.op; + v[c.r.imm].sign = v[c.r.rs].sign >> 32 - c.r.op; + } + } + break; + + case OP_REGIMM: + break; + + case OP_ADDI: + case OP_ADDIU: + if (c.i.imm) { + struct constprop_data d = { + .value = (s32)(s16)c.i.imm, + .known = 0xffffffff, + .sign = 0, + }; + + lightrec_propagate_addi(c.i.rs, c.i.rt, &d, v); + } else { + /* immediate is zero - that's just a register copy. */ + v[c.i.rt] = v[c.i.rs]; + } + break; + + case OP_SLTI: + case OP_SLTIU: + { + struct constprop_data d = { + .value = (s32)(s16)c.i.imm, + .known = 0xffffffff, + .sign = 0, + }; + + lightrec_propagate_slt(c.i.rs, c.i.rt, + c.i.op == OP_SLTI, &d, v); + } + break; + + case OP_ANDI: + v[c.i.rt].value = v[c.i.rs].value & c.i.imm; + v[c.i.rt].known = v[c.i.rs].known | ~c.i.imm; + v[c.i.rt].sign = 0; + break; + + case OP_ORI: + v[c.i.rt].value = v[c.i.rs].value | c.i.imm; + v[c.i.rt].known = v[c.i.rs].known | c.i.imm; + v[c.i.rt].sign = (v[c.i.rs].sign & 0xffff) ? 0xffff0000 : v[c.i.rs].sign; + break; + + case OP_XORI: + v[c.i.rt].value = v[c.i.rs].value ^ c.i.imm; + v[c.i.rt].known = v[c.i.rs].known; + v[c.i.rt].sign = (v[c.i.rs].sign & 0xffff) ? 
0xffff0000 : v[c.i.rs].sign; + break; + + case OP_LUI: + v[c.i.rt].value = c.i.imm << 16; + v[c.i.rt].known = 0xffffffff; + v[c.i.rt].sign = 0; + break; + + case OP_CP0: + switch (c.r.rs) { + case OP_CP0_MFC0: + case OP_CP0_CFC0: + v[c.r.rt].known = 0; + v[c.r.rt].sign = 0; + break; + default: + break; + } + break; + + case OP_CP2: + if (c.r.op == OP_CP2_BASIC) { + switch (c.r.rs) { + case OP_CP2_BASIC_MFC2: + switch (c.r.rd) { + case 1: + case 3: + case 5: + case 8: + case 9: + case 10: + case 11: + /* Signed 16-bit */ + v[c.r.rt].known = 0; + v[c.r.rt].sign = 0xffff8000; + break; + case 7: + case 16: + case 17: + case 18: + case 19: + /* Unsigned 16-bit */ + v[c.r.rt].value = 0; + v[c.r.rt].known = 0xffff0000; + v[c.r.rt].sign = 0; + break; + default: + /* 32-bit */ + v[c.r.rt].known = 0; + v[c.r.rt].sign = 0; + break; + } + break; + case OP_CP2_BASIC_CFC2: + switch (c.r.rd) { + case 4: + case 12: + case 20: + case 26: + case 27: + case 29: + case 30: + /* Signed 16-bit */ + v[c.r.rt].known = 0; + v[c.r.rt].sign = 0xffff8000; + break; + default: + /* 32-bit */ + v[c.r.rt].known = 0; + v[c.r.rt].sign = 0; + break; + } + break; + } + } + break; + case OP_LB: + v[c.i.rt].known = 0; + v[c.i.rt].sign = 0xffffff80; + break; + case OP_LH: + v[c.i.rt].known = 0; + v[c.i.rt].sign = 0xffff8000; + break; + case OP_LBU: + v[c.i.rt].value = 0; + v[c.i.rt].known = 0xffffff00; + v[c.i.rt].sign = 0; + break; + case OP_LHU: + v[c.i.rt].value = 0; + v[c.i.rt].known = 0xffff0000; + v[c.i.rt].sign = 0; + break; + case OP_LWL: + case OP_LWR: + /* LWL/LWR don't write the full register if the address is + * unaligned, so we only need to know the low 2 bits */ + if (v[c.i.rs].known & 0x3) { + imm = (v[c.i.rs].value & 0x3) * 8; + + if (c.i.op == OP_LWL) { + imm = BIT(24 - imm) - 1; + v[c.i.rt].sign &= ~imm; + } else { + imm = imm ? 
GENMASK(31, 32 - imm) : 0; + v[c.i.rt].sign = 0; + } + v[c.i.rt].known &= ~imm; + break; + } + fallthrough; + case OP_LW: + v[c.i.rt].known = 0; + v[c.i.rt].sign = 0; + break; + case OP_META_MOV: + v[c.r.rd] = v[c.r.rs]; + break; + case OP_META_EXTC: + v[c.i.rt].value = (s32)(s8)v[c.i.rs].value; + if (v[c.i.rs].known & BIT(7)) { + v[c.i.rt].known = v[c.i.rs].known | 0xffffff00; + v[c.i.rt].sign = 0; + } else { + v[c.i.rt].known = v[c.i.rs].known & 0x7f; + v[c.i.rt].sign = 0xffffff80; + } + break; + + case OP_META_EXTS: + v[c.i.rt].value = (s32)(s16)v[c.i.rs].value; + if (v[c.i.rs].known & BIT(15)) { + v[c.i.rt].known = v[c.i.rs].known | 0xffff0000; + v[c.i.rt].sign = 0; + } else { + v[c.i.rt].known = v[c.i.rs].known & 0x7fff; + v[c.i.rt].sign = 0xffff8000; + } + break; + + default: + break; + } + + /* Reset register 0 which may have been used as a target */ + v[0].value = 0; + v[0].sign = 0; + v[0].known = 0xffffffff; +} + +enum psx_map +lightrec_get_constprop_map(const struct lightrec_state *state, + const struct constprop_data *v, u8 reg, s16 imm) +{ + const struct lightrec_mem_map *map; + unsigned int i; + u32 min, max; + + min = get_min_value(&v[reg]) + imm; + max = get_max_value(&v[reg]) + imm; + + /* Handle the case where max + imm overflows */ + if ((min & 0xe0000000) != (max & 0xe0000000)) + return PSX_MAP_UNKNOWN; + + pr_debug("Min: 0x%08x max: 0x%08x Known: 0x%08x Sign: 0x%08x\n", + min, max, v[reg].known, v[reg].sign); + + min = kunseg(min); + max = kunseg(max); + + for (i = 0; i < state->nb_maps; i++) { + map = &state->maps[i]; + + if (min >= map->pc && min < map->pc + map->length + && max >= map->pc && max < map->pc + map->length) + return (enum psx_map) i; + } + + return PSX_MAP_UNKNOWN; +} diff --git a/deps/lightrec/constprop.h b/deps/lightrec/constprop.h new file mode 100644 index 00000000..cebf0b38 --- /dev/null +++ b/deps/lightrec/constprop.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* + * Copyright (C) 2022 Paul 
Cercueil + */ + +#ifndef __LIGHTREC_CONSTPROP_H__ +#define __LIGHTREC_CONSTPROP_H__ + +#include "lightrec.h" + +#define LIGHTREC_CONSTPROP_INITIALIZER { { 0, 0xffffffff, 0 }, } + +struct opcode; + +struct constprop_data { + u32 value; + u32 known; + u32 sign; +}; + +static inline _Bool is_known(const struct constprop_data *v, u8 reg) +{ + return v[reg].known == 0xffffffff; +} + +static inline _Bool bits_are_known_zero(const struct constprop_data *v, + u8 reg, u32 mask) +{ + return !(~v[reg].known & mask) && !(v[reg].value & mask); +} + +static inline _Bool is_known_zero(const struct constprop_data *v, u8 reg) +{ + return bits_are_known_zero(v, reg, 0xffffffff); +} + +void lightrec_consts_propagate(const struct opcode *list, + unsigned int idx, + struct constprop_data *v); + +enum psx_map +lightrec_get_constprop_map(const struct lightrec_state *state, + const struct constprop_data *v, u8 reg, s16 imm); + +#endif /* __LIGHTREC_CONSTPROP_H__ */ diff --git a/deps/lightrec/disassembler.c b/deps/lightrec/disassembler.c index 1a217bc2..bef95948 100644 --- a/deps/lightrec/disassembler.c +++ b/deps/lightrec/disassembler.c @@ -295,7 +295,7 @@ static int print_op_special(union code c, char *buf, size_t len, return snprintf(buf, len, "%s%s,%s", special_opcodes[c.r.op], lightrec_reg_name(c.r.rd), - lightrec_reg_name(c.r.rt)); + lightrec_reg_name(c.r.rs)); case OP_SPECIAL_SYSCALL: case OP_SPECIAL_BREAK: return snprintf(buf, len, "%s", special_opcodes[c.r.op]); diff --git a/deps/lightrec/emitter.c b/deps/lightrec/emitter.c index be50d6d8..0eff0ce3 100644 --- a/deps/lightrec/emitter.c +++ b/deps/lightrec/emitter.c @@ -21,6 +21,11 @@ static void rec_SPECIAL(struct lightrec_cstate *state, const struct block *block static void rec_REGIMM(struct lightrec_cstate *state, const struct block *block, u16 offset); static void rec_CP0(struct lightrec_cstate *state, const struct block *block, u16 offset); static void rec_CP2(struct lightrec_cstate *state, const struct block *block, u16 
offset); +static void rec_cp2_do_mtc2(struct lightrec_cstate *state, + const struct block *block, u16 offset, u8 reg, u8 in_reg); +static void rec_cp2_do_mfc2(struct lightrec_cstate *state, + const struct block *block, u16 offset, + u8 reg, u8 out_reg); static void unknown_opcode(struct lightrec_cstate *state, const struct block *block, u16 offset) { @@ -38,6 +43,16 @@ lightrec_jump_to_eob(struct lightrec_cstate *state, jit_state_t *_jit) jit_patch_abs(jit_jmpi(), state->state->eob_wrapper_func); } +static void update_ra_register(struct regcache *reg_cache, jit_state_t *_jit, + u8 ra_reg, u32 pc, u32 link) +{ + u8 link_reg; + + link_reg = lightrec_alloc_reg_out(reg_cache, _jit, ra_reg, 0); + lightrec_load_imm(reg_cache, _jit, link_reg, pc, link); + lightrec_free_reg(reg_cache, link_reg); +} + static void lightrec_emit_end_of_block(struct lightrec_cstate *state, const struct block *block, u16 offset, s8 reg_new_pc, u32 imm, u8 ra_reg, @@ -51,18 +66,19 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, jit_note(__FILE__, __LINE__); - if (link) { - /* Update the $ra register */ - u8 link_reg = lightrec_alloc_reg_out(reg_cache, _jit, ra_reg, 0); - jit_movi(link_reg, link); - lightrec_free_reg(reg_cache, link_reg); - } + if (link && ra_reg != reg_new_pc) + update_ra_register(reg_cache, _jit, ra_reg, block->pc, link); - if (reg_new_pc < 0) { - reg_new_pc = lightrec_alloc_reg(reg_cache, _jit, JIT_V0); - lightrec_lock_reg(reg_cache, _jit, reg_new_pc); + if (reg_new_pc < 0) + lightrec_load_next_pc_imm(reg_cache, _jit, block->pc, imm); + else + lightrec_load_next_pc(reg_cache, _jit, reg_new_pc); - jit_movi(reg_new_pc, imm); + if (link && ra_reg == reg_new_pc) { + /* Handle the special case: JALR $r0, $r0 + * In that case the target PC should be the old value of the + * register. 
*/ + update_ra_register(reg_cache, _jit, ra_reg, block->pc, link); } if (has_delay_slot(op->c) && @@ -77,8 +93,6 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, /* Clean the remaining registers */ lightrec_clean_regs(reg_cache, _jit); - jit_movr(JIT_V0, reg_new_pc); - if (cycles && update_cycles) { jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, cycles); pr_debug("EOB: %u cycles\n", cycles); @@ -95,40 +109,27 @@ void lightrec_emit_eob(struct lightrec_cstate *state, lightrec_clean_regs(reg_cache, _jit); - jit_movi(JIT_V0, block->pc + (offset << 2)); + lightrec_load_imm(reg_cache, _jit, JIT_V0, block->pc, + block->pc + (offset << 2)); jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, state->cycles); lightrec_jump_to_eob(state, _jit); } -static u8 get_jr_jalr_reg(struct lightrec_cstate *state, const struct block *block, u16 offset) -{ - struct regcache *reg_cache = state->reg_cache; - jit_state_t *_jit = block->_jit; - const struct opcode *op = &block->opcode_list[offset]; - u8 rs; - - rs = lightrec_request_reg_in(reg_cache, _jit, op->r.rs, JIT_V0); - lightrec_lock_reg(reg_cache, _jit, rs); - - return rs; -} - static void rec_special_JR(struct lightrec_cstate *state, const struct block *block, u16 offset) { - u8 rs = get_jr_jalr_reg(state, block, offset); + union code c = block->opcode_list[offset].c; _jit_name(block->_jit, __func__); - lightrec_emit_end_of_block(state, block, offset, rs, 0, 31, 0, true); + lightrec_emit_end_of_block(state, block, offset, c.r.rs, 0, 31, 0, true); } static void rec_special_JALR(struct lightrec_cstate *state, const struct block *block, u16 offset) { - u8 rs = get_jr_jalr_reg(state, block, offset); union code c = block->opcode_list[offset].c; _jit_name(block->_jit, __func__); - lightrec_emit_end_of_block(state, block, offset, rs, 0, c.r.rd, + lightrec_emit_end_of_block(state, block, offset, c.r.rs, 0, c.r.rd, get_branch_pc(block, offset, 2), true); } @@ -199,12 +200,12 @@ static void rec_b(struct lightrec_cstate 
*state, const struct block *block, u16 const struct opcode *op = &block->opcode_list[offset], *next = &block->opcode_list[offset + 1]; jit_node_t *addr; - u8 link_reg, rs, rt; bool is_forward = (s16)op->i.imm >= -1; int op_cycles = lightrec_cycles_of_opcode(op->c); u32 target_offset, cycles = state->cycles + op_cycles; bool no_indirection = false; u32 next_pc; + u8 rs, rt; jit_note(__FILE__, __LINE__); @@ -248,12 +249,8 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 if (!op_flag_no_ds(op->flags) && next->opcode) lightrec_rec_opcode(state, block, offset + 1); - if (link) { - /* Update the $ra register */ - link_reg = lightrec_alloc_reg_out(reg_cache, _jit, 31, 0); - jit_movi(link_reg, link); - lightrec_free_reg(reg_cache, link_reg); - } + if (link) + update_ra_register(reg_cache, _jit, 31, block->pc, link); /* Clean remaining registers */ lightrec_clean_regs(reg_cache, _jit); @@ -287,13 +284,8 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 lightrec_regcache_leave_branch(reg_cache, regs_backup); - if (bz && link) { - /* Update the $ra register */ - link_reg = lightrec_alloc_reg_out(reg_cache, _jit, - 31, REG_EXT); - jit_movi(link_reg, (s32)link); - lightrec_free_reg(reg_cache, link_reg); - } + if (bz && link) + update_ra_register(reg_cache, _jit, 31, block->pc, link); if (!op_flag_no_ds(op->flags) && next->opcode) lightrec_rec_opcode(state, block, offset + 1); @@ -942,11 +934,11 @@ static void rec_alu_div(struct lightrec_cstate *state, if (!op_flag_no_lo(flags)) { if (is_signed) { - jit_lti(lo, rs, 0); + jit_ltr(lo, rs, rt); jit_lshi(lo, lo, 1); jit_subi(lo, lo, 1); } else { - jit_movi(lo, 0xffffffff); + jit_subi(lo, rt, 1); } } @@ -1150,8 +1142,10 @@ static void rec_store_memory(struct lightrec_cstate *cstate, ((!state->mirrors_mapped && !no_mask) || (invalidate && ((imm & 0x3) || simm + lut_offt != (s16)(simm + lut_offt)))); bool need_tmp = !no_mask || addr_offset || add_imm || invalidate; + 
bool swc2 = c.i.op == OP_SWC2; + u8 in_reg = swc2 ? REG_CP2_TEMP : c.i.rt; - rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0); + rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); if (need_tmp) tmp = lightrec_alloc_reg_temp(reg_cache, _jit); @@ -1179,7 +1173,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate, addr_reg2 = addr_reg; } - if (is_big_endian() && swap_code && c.i.rt) { + if (is_big_endian() && swap_code && in_reg) { tmp3 = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp3, rt); @@ -1273,7 +1267,8 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_end; - u8 tmp, tmp2, rs, rt; + bool swc2 = c.i.op == OP_SWC2; + u8 tmp, tmp2, rs, rt, in_reg = swc2 ? REG_CP2_TEMP : c.i.rt; s16 imm; jit_note(__FILE__, __LINE__); @@ -1317,9 +1312,9 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, lightrec_free_reg(reg_cache, tmp2); } - rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0); + rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); - if (is_big_endian() && swap_code && c.i.rt) { + if (is_big_endian() && swap_code && in_reg) { tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp2, rt); @@ -1343,7 +1338,9 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_end; - u8 tmp, tmp2, tmp3, rs, rt; + bool swc2 = c.i.op == OP_SWC2; + u8 tmp, tmp2, tmp3, masked_reg, rs, rt; + u8 in_reg = swc2 ? 
REG_CP2_TEMP : c.i.rt; jit_note(__FILE__, __LINE__); @@ -1362,10 +1359,21 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block lightrec_free_reg(reg_cache, rs); tmp = lightrec_alloc_reg_temp(reg_cache, _jit); - to_not_ram = jit_bgti(tmp2, ram_size); + if (state->offset_ram != state->offset_scratch) { + to_not_ram = jit_bgti(tmp2, ram_size); + masked_reg = tmp2; + } else { + jit_lti_u(tmp, tmp2, ram_size); + jit_movnr(tmp, tmp2, tmp); + masked_reg = tmp; + } /* Compute the offset to the code LUT */ - jit_andi(tmp, tmp2, (RAM_SIZE - 1) & ~3); + if (c.i.op == OP_SW) + jit_andi(tmp, masked_reg, RAM_SIZE - 1); + else + jit_andi(tmp, masked_reg, (RAM_SIZE - 1) & ~3); + if (!lut_is_32bit(state)) jit_lshi(tmp, tmp, 1); jit_addr(tmp, LIGHTREC_REG_STATE, tmp); @@ -1380,10 +1388,9 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block jit_movi(tmp, state->offset_ram); to_end = jit_b(); + jit_patch(to_not_ram); } - jit_patch(to_not_ram); - if (state->offset_ram || state->offset_scratch) jit_movi(tmp, state->offset_scratch); @@ -1396,9 +1403,9 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block lightrec_free_reg(reg_cache, tmp); lightrec_free_reg(reg_cache, tmp3); - rt = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rt, 0); + rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); - if (is_big_endian() && swap_code && c.i.rt) { + if (is_big_endian() && swap_code && in_reg) { tmp = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp, rt); @@ -1418,10 +1425,26 @@ static void rec_store(struct lightrec_cstate *state, jit_code_t code, jit_code_t swap_code) { u32 flags = block->opcode_list[offset].flags; + u32 mode = LIGHTREC_FLAGS_GET_IO_MODE(flags); bool no_invalidate = op_flag_no_invalidate(flags) || state->state->invalidate_from_dma_only; + union code c = block->opcode_list[offset].c; + bool is_swc2 = c.i.op == OP_SWC2; + + if (is_swc2) { + switch (mode) { + case 
LIGHTREC_IO_RAM: + case LIGHTREC_IO_SCRATCH: + case LIGHTREC_IO_DIRECT: + case LIGHTREC_IO_DIRECT_HW: + rec_cp2_do_mfc2(state, block, offset, c.i.rt, REG_CP2_TEMP); + break; + default: + break; + } + } - switch (LIGHTREC_FLAGS_GET_IO_MODE(flags)) { + switch (mode) { case LIGHTREC_IO_RAM: rec_store_ram(state, block, offset, code, swap_code, !no_invalidate); @@ -1442,8 +1465,11 @@ static void rec_store(struct lightrec_cstate *state, break; default: rec_io(state, block, offset, true, false); - break; + return; } + + if (is_swc2) + lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP); } static void rec_SB(struct lightrec_cstate *state, @@ -1465,7 +1491,9 @@ static void rec_SW(struct lightrec_cstate *state, const struct block *block, u16 offset) { - _jit_name(block->_jit, __func__); + union code c = block->opcode_list[offset].c; + + _jit_name(block->_jit, c.i.op == OP_SWC2 ? "rec_SWC2" : "rec_SW"); rec_store(state, block, offset, jit_code_stxi_i, jit_code_bswapr_ui); } @@ -1484,13 +1512,6 @@ static void rec_SWR(struct lightrec_cstate *state, rec_io(state, block, offset, true, false); } -static void rec_SWC2(struct lightrec_cstate *state, - const struct block *block, u16 offset) -{ - _jit_name(block->_jit, __func__); - rec_io(state, block, offset, false, false); -} - static void rec_load_memory(struct lightrec_cstate *cstate, const struct block *block, u16 offset, jit_code_t code, jit_code_t swap_code, bool is_unsigned, @@ -1499,19 +1520,23 @@ static void rec_load_memory(struct lightrec_cstate *cstate, struct regcache *reg_cache = cstate->reg_cache; struct opcode *op = &block->opcode_list[offset]; jit_state_t *_jit = block->_jit; - u8 rs, rt, addr_reg, flags = REG_EXT; + u8 rs, rt, out_reg, addr_reg, flags = REG_EXT; bool no_mask = op_flag_no_mask(op->flags); union code c = op->c; s16 imm; - if (!c.i.rt) + if (c.i.op == OP_LWC2) + out_reg = REG_CP2_TEMP; + else if (c.i.rt) + out_reg = c.i.rt; + else return; if (is_unsigned) flags |= REG_ZEXT; rs = 
lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); - rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, flags); + rt = lightrec_alloc_reg_out(reg_cache, _jit, out_reg, flags); if (!cstate->state->mirrors_mapped && c.i.imm && !no_mask) { jit_addi(rt, rs, (s16)c.i.imm); @@ -1597,10 +1622,14 @@ static void rec_load_direct(struct lightrec_cstate *cstate, union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_not_bios, *to_end, *to_end2; - u8 tmp, rs, rt, addr_reg, flags = REG_EXT; + u8 tmp, rs, rt, out_reg, addr_reg, flags = REG_EXT; s16 imm; - if (!c.i.rt) + if (c.i.op == OP_LWC2) + out_reg = REG_CP2_TEMP; + else if (c.i.rt) + out_reg = c.i.rt; + else return; if (is_unsigned) @@ -1608,7 +1637,7 @@ static void rec_load_direct(struct lightrec_cstate *cstate, jit_note(__FILE__, __LINE__); rs = lightrec_alloc_reg_in(reg_cache, _jit, c.i.rs, 0); - rt = lightrec_alloc_reg_out(reg_cache, _jit, c.i.rt, flags); + rt = lightrec_alloc_reg_out(reg_cache, _jit, out_reg, flags); if ((state->offset_ram == state->offset_bios && state->offset_ram == state->offset_scratch && @@ -1700,7 +1729,8 @@ static void rec_load(struct lightrec_cstate *state, const struct block *block, u16 offset, jit_code_t code, jit_code_t swap_code, bool is_unsigned) { - u32 flags = block->opcode_list[offset].flags; + const struct opcode *op = &block->opcode_list[offset]; + u32 flags = op->flags; switch (LIGHTREC_FLAGS_GET_IO_MODE(flags)) { case LIGHTREC_IO_RAM: @@ -1720,7 +1750,12 @@ static void rec_load(struct lightrec_cstate *state, const struct block *block, break; default: rec_io(state, block, offset, false, true); - break; + return; + } + + if (op->i.op == OP_LWC2) { + rec_cp2_do_mtc2(state, block, offset, op->i.rt, REG_CP2_TEMP); + lightrec_discard_reg_if_loaded(state->reg_cache, REG_CP2_TEMP); } } @@ -1764,6 +1799,7 @@ static void rec_LWR(struct lightrec_cstate *state, const struct block *block, u1 static void rec_LW(struct lightrec_cstate *state, 
const struct block *block, u16 offset) { + union code c = block->opcode_list[offset].c; jit_code_t code; if (is_big_endian() && __WORDSIZE == 64) @@ -1771,16 +1807,10 @@ static void rec_LW(struct lightrec_cstate *state, const struct block *block, u16 else code = jit_code_ldxi_i; - _jit_name(block->_jit, __func__); + _jit_name(block->_jit, c.i.op == OP_LWC2 ? "rec_LWC2" : "rec_LW"); rec_load(state, block, offset, code, jit_code_bswapr_ui, false); } -static void rec_LWC2(struct lightrec_cstate *state, const struct block *block, u16 offset) -{ - _jit_name(block->_jit, __func__); - rec_io(state, block, offset, false, false); -} - static void rec_break_syscall(struct lightrec_cstate *state, const struct block *block, u16 offset, u32 exit_code) @@ -1826,7 +1856,9 @@ static void rec_mfc(struct lightrec_cstate *state, const struct block *block, u1 jit_state_t *_jit = block->_jit; jit_note(__FILE__, __LINE__); - lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, true); + + if (c.i.op != OP_SWC2) + lightrec_clean_reg_if_loaded(reg_cache, _jit, c.i.rt, true); call_to_c_wrapper(state, block, c.opcode, C_WRAPPER_MFC); } @@ -2031,15 +2063,14 @@ static unsigned int cp2c_s_offset(u8 reg) return cp2c_i_offset(reg) + is_big_endian() * 2; } -static void rec_cp2_basic_MFC2(struct lightrec_cstate *state, - const struct block *block, u16 offset) +static void rec_cp2_do_mfc2(struct lightrec_cstate *state, + const struct block *block, u16 offset, + u8 reg, u8 out_reg) { struct regcache *reg_cache = state->reg_cache; - const union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; const u32 zext_regs = 0x300f0080; u8 rt, tmp, tmp2, tmp3, out, flags; - u8 reg = c.r.rd == 15 ? 14 : c.r.rd; unsigned int i; _jit_name(block->_jit, __func__); @@ -2051,7 +2082,10 @@ static void rec_cp2_basic_MFC2(struct lightrec_cstate *state, } flags = (zext_regs & BIT(reg)) ? 
REG_ZEXT : REG_EXT; - rt = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rt, flags); + rt = lightrec_alloc_reg_out(reg_cache, _jit, out_reg, flags); + + if (reg == 15) + reg = 14; switch (reg) { case 1: @@ -2108,6 +2142,14 @@ static void rec_cp2_basic_MFC2(struct lightrec_cstate *state, lightrec_free_reg(reg_cache, rt); } +static void rec_cp2_basic_MFC2(struct lightrec_cstate *state, + const struct block *block, u16 offset) +{ + const union code c = block->opcode_list[offset].c; + + rec_cp2_do_mfc2(state, block, offset, c.r.rd, c.r.rt); +} + static void rec_cp2_basic_CFC2(struct lightrec_cstate *state, const struct block *block, u16 offset) { @@ -2144,11 +2186,11 @@ static void rec_cp2_basic_CFC2(struct lightrec_cstate *state, lightrec_free_reg(reg_cache, rt); } -static void rec_cp2_basic_MTC2(struct lightrec_cstate *state, - const struct block *block, u16 offset) +static void rec_cp2_do_mtc2(struct lightrec_cstate *state, + const struct block *block, u16 offset, + u8 reg, u8 in_reg) { struct regcache *reg_cache = state->reg_cache; - const union code c = block->opcode_list[offset].c; jit_state_t *_jit = block->_jit; jit_node_t *loop, *to_loop; u8 rt, tmp, tmp2, flags = 0; @@ -2161,15 +2203,15 @@ static void rec_cp2_basic_MTC2(struct lightrec_cstate *state, return; } - if (c.r.rd == 31) + if (reg == 31) return; - if (c.r.rd == 30) + if (reg == 30) flags |= REG_EXT; - rt = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rt, flags); + rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, flags); - switch (c.r.rd) { + switch (reg) { case 15: tmp = lightrec_alloc_reg_temp(reg_cache, _jit); jit_ldxi_i(tmp, LIGHTREC_REG_STATE, cp2d_i_offset(13)); @@ -2228,13 +2270,21 @@ static void rec_cp2_basic_MTC2(struct lightrec_cstate *state, lightrec_free_reg(reg_cache, tmp2); break; default: - jit_stxi_i(cp2d_i_offset(c.r.rd), LIGHTREC_REG_STATE, rt); + jit_stxi_i(cp2d_i_offset(reg), LIGHTREC_REG_STATE, rt); break; } lightrec_free_reg(reg_cache, rt); } +static void rec_cp2_basic_MTC2(struct 
lightrec_cstate *state, + const struct block *block, u16 offset) +{ + const union code c = block->opcode_list[offset].c; + + rec_cp2_do_mtc2(state, block, offset, c.r.rd, c.r.rt); +} + static void rec_cp2_basic_CTC2(struct lightrec_cstate *state, const struct block *block, u16 offset) { @@ -2343,24 +2393,46 @@ static void rec_meta_MOV(struct lightrec_cstate *state, const struct block *block, u16 offset) { struct regcache *reg_cache = state->reg_cache; - union code c = block->opcode_list[offset].c; + const struct opcode *op = &block->opcode_list[offset]; + union code c = op->c; jit_state_t *_jit = block->_jit; + bool unload_rd; u8 rs, rd; _jit_name(block->_jit, __func__); jit_note(__FILE__, __LINE__); - if (c.r.rs) + + unload_rd = OPT_EARLY_UNLOAD + && LIGHTREC_FLAGS_GET_RD(op->flags) == LIGHTREC_REG_UNLOAD; + + if (c.r.rs || unload_rd) rs = lightrec_alloc_reg_in(reg_cache, _jit, c.r.rs, 0); - rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT); - if (c.r.rs == 0) - jit_movi(rd, 0); - else - jit_extr_i(rd, rs); + if (unload_rd) { + /* If the destination register will be unloaded right after the + * MOV meta-opcode, we don't actually need to write any host + * register - we can just store the source register directly to + * the register cache, at the offset corresponding to the + * destination register. 
*/ + lightrec_discard_reg_if_loaded(reg_cache, c.r.rd); + + jit_stxi_i(offsetof(struct lightrec_state, regs.gpr) + + (c.r.rd << 2), LIGHTREC_REG_STATE, rs); /* byte offset of gpr[rd]: '+' binds tighter than '<<', so the shift must be parenthesized */ - if (c.r.rs) lightrec_free_reg(reg_cache, rs); - lightrec_free_reg(reg_cache, rd); + } else { + rd = lightrec_alloc_reg_out(reg_cache, _jit, c.r.rd, REG_EXT); + + if (c.r.rs == 0) + jit_movi(rd, 0); + else + jit_extr_i(rd, rs); + + lightrec_free_reg(reg_cache, rd); + } + + if (c.r.rs && !unload_rd) /* the unload_rd path already released rs above */ + lightrec_free_reg(reg_cache, rs); } static void rec_meta_EXTC_EXTS(struct lightrec_cstate *state, @@ -2483,8 +2555,8 @@ static const lightrec_rec_func_t rec_standard[64] = { [OP_SWL] = rec_SWL, [OP_SW] = rec_SW, [OP_SWR] = rec_SWR, - [OP_LWC2] = rec_LWC2, - [OP_SWC2] = rec_SWC2, + [OP_LWC2] = rec_LW, + [OP_SWC2] = rec_SW, [OP_META_MOV] = rec_meta_MOV, [OP_META_EXTC] = rec_meta_EXTC_EXTS, diff --git a/deps/lightrec/interpreter.c b/deps/lightrec/interpreter.c index 43bea83f..ea8098cd 100644 --- a/deps/lightrec/interpreter.c +++ b/deps/lightrec/interpreter.c @@ -500,7 +500,7 @@ static u32 int_ctc(struct interpreter *inter) struct lightrec_state *state = inter->state; const struct opcode *op = inter->op; - lightrec_mtc(state, op->c, state->regs.gpr[op->r.rt]); + lightrec_mtc(state, op->c, op->r.rd, state->regs.gpr[op->r.rt]); /* If we have a MTC0 or CTC0 to CP0 register 12 (Status) or 13 (Cause), * return early so that the emulator will be able to check software diff --git a/deps/lightrec/lightrec-private.h b/deps/lightrec/lightrec-private.h index 56032f50..e67d406f 100644 --- a/deps/lightrec/lightrec-private.h +++ b/deps/lightrec/lightrec-private.h @@ -81,6 +81,7 @@ #define REG_LO 32 #define REG_HI 33 +#define REG_CP2_TEMP (offsetof(struct lightrec_state, cp2_temp_reg) / sizeof(u32)) /* Definition of jit_state_t (avoids inclusion of ) */ struct jit_node; @@ -152,8 +153,9 @@ struct lightrec_cstate { struct lightrec_state { struct lightrec_registers regs; - uintptr_t wrapper_regs[NUM_TEMPS]; + u32 cp2_temp_reg; u32
next_pc; + uintptr_t wrapper_regs[NUM_TEMPS]; u32 current_cycle; u32 target_cycle; u32 exit_flags; @@ -188,9 +190,6 @@ void lightrec_free_block(struct lightrec_state *state, struct block *block); void remove_from_code_lut(struct blockcache *cache, struct block *block); -enum psx_map -lightrec_get_map_idx(struct lightrec_state *state, u32 kaddr); - const struct lightrec_mem_map * lightrec_get_map(struct lightrec_state *state, void **host, u32 kaddr); @@ -272,7 +271,7 @@ static inline u32 get_branch_pc(const struct block *block, u16 offset, s16 imm) return block->pc + (offset + imm << 2); } -void lightrec_mtc(struct lightrec_state *state, union code op, u32 data); +void lightrec_mtc(struct lightrec_state *state, union code op, u8 reg, u32 data); u32 lightrec_mfc(struct lightrec_state *state, union code op); void lightrec_rfe(struct lightrec_state *state); void lightrec_cp(struct lightrec_state *state, union code op); @@ -340,4 +339,14 @@ static inline u8 block_clear_flags(struct block *block, u8 mask) #endif } +static inline _Bool can_sign_extend(s32 value, u8 order) +{ + return (u32)(value >> order - 1) + 1 < 2; +} + +static inline _Bool can_zero_extend(u32 value, u8 order) +{ + return (value >> order) == 0; +} + #endif /* __LIGHTREC_PRIVATE_H__ */ diff --git a/deps/lightrec/lightrec.c b/deps/lightrec/lightrec.c index be4da10f..b9e82fb2 100644 --- a/deps/lightrec/lightrec.c +++ b/deps/lightrec/lightrec.c @@ -198,7 +198,7 @@ static void lightrec_invalidate_map(struct lightrec_state *state, } } -enum psx_map +static enum psx_map lightrec_get_map_idx(struct lightrec_state *state, u32 kaddr) { const struct lightrec_mem_map *map; @@ -428,7 +428,10 @@ u32 lightrec_mfc(struct lightrec_state *state, union code op) if (op.i.op == OP_CP0) return state->regs.cp0[op.r.rd]; - else if (op.r.rs == OP_CP2_BASIC_MFC2) + + if (op.i.op == OP_SWC2) { + val = lightrec_mfc2(state, op.i.rt); + } else if (op.r.rs == OP_CP2_BASIC_MFC2) val = lightrec_mfc2(state, op.r.rd); else { val = 
state->regs.cp2c[op.r.rd]; @@ -458,7 +461,9 @@ static void lightrec_mfc_cb(struct lightrec_state *state, union code op) { u32 rt = lightrec_mfc(state, op); - if (op.r.rt) + if (op.i.op == OP_SWC2) + state->cp2_temp_reg = rt; + else if (op.r.rt) state->regs.gpr[op.r.rt] = rt; } @@ -576,15 +581,15 @@ static void lightrec_ctc2(struct lightrec_state *state, u8 reg, u32 data) } } -void lightrec_mtc(struct lightrec_state *state, union code op, u32 data) +void lightrec_mtc(struct lightrec_state *state, union code op, u8 reg, u32 data) { if (op.i.op == OP_CP0) { - lightrec_mtc0(state, op.r.rd, data); + lightrec_mtc0(state, reg, data); } else { - if (op.r.rs == OP_CP2_BASIC_CTC2) - lightrec_ctc2(state, op.r.rd, data); + if (op.i.op == OP_LWC2 || op.r.rs != OP_CP2_BASIC_CTC2) + lightrec_mtc2(state, reg, data); else - lightrec_mtc2(state, op.r.rd, data); + lightrec_ctc2(state, reg, data); if (state->ops.cop2_notify) (*state->ops.cop2_notify)(state, op.opcode, data); @@ -594,8 +599,18 @@ void lightrec_mtc(struct lightrec_state *state, union code op, u32 data) static void lightrec_mtc_cb(struct lightrec_state *state, u32 arg) { union code op = (union code) arg; + u32 data; + u8 reg; + + if (op.i.op == OP_LWC2) { + data = state->cp2_temp_reg; + reg = op.i.rt; + } else { + data = state->regs.gpr[op.r.rt]; + reg = op.r.rd; + } - lightrec_mtc(state, op, state->regs.gpr[op.r.rt]); + lightrec_mtc(state, op, reg, data); } void lightrec_rfe(struct lightrec_state *state) @@ -671,7 +686,7 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc) void *func; int err; - for (;;) { + do { func = lut_read(state, lut_offset(pc)); if (func && func != state->get_next_block) break; @@ -740,11 +755,8 @@ static void * get_next_block_func(struct lightrec_state *state, u32 pc) } else { lightrec_recompiler_add(state->rec, block); } - - if (state->exit_flags != LIGHTREC_EXIT_NORMAL || - state->current_cycle >= state->target_cycle) - break; - } + } while (state->exit_flags == 
LIGHTREC_EXIT_NORMAL + && state->current_cycle < state->target_cycle); state->next_pc = pc; return func; @@ -847,6 +859,9 @@ static void * lightrec_emit_code(struct lightrec_state *state, *size = (unsigned int) new_code_size; + if (state->ops.code_inv) + state->ops.code_inv(code, new_code_size); + return code; } @@ -1009,6 +1024,8 @@ static struct block * generate_dispatcher(struct lightrec_state *state) jit_prolog(); jit_frame(256); + jit_getarg(LIGHTREC_REG_STATE, jit_arg()); + jit_getarg(JIT_V0, jit_arg()); jit_getarg(JIT_V1, jit_arg()); jit_getarg_i(LIGHTREC_REG_CYCLE, jit_arg()); @@ -1016,10 +1033,6 @@ static struct block * generate_dispatcher(struct lightrec_state *state) for (i = 0; i < NUM_REGS; i++) jit_movr(JIT_V(i + FIRST_REG), JIT_V(i + FIRST_REG)); - /* Pass lightrec_state structure to blocks, using the last callee-saved - * register that Lightning provides */ - jit_movi(LIGHTREC_REG_STATE, (intptr_t) state); - loop = jit_label(); /* Call the block's code */ @@ -1115,6 +1128,10 @@ static struct block * generate_dispatcher(struct lightrec_state *state) jit_movr(LIGHTREC_REG_CYCLE, JIT_V0); } + /* Reset JIT_V0 to the next PC */ + jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE, + offsetof(struct lightrec_state, next_pc)); + /* If we get non-NULL, loop */ jit_patch_at(jit_bnei(JIT_V1, 0), loop); @@ -1399,6 +1416,8 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, block->_jit = _jit; lightrec_regcache_reset(cstate->reg_cache); + lightrec_preload_pc(cstate->reg_cache); + cstate->cycles = 0; cstate->nb_local_branches = 0; cstate->nb_targets = 0; @@ -1603,7 +1622,7 @@ static void lightrec_print_info(struct lightrec_state *state) u32 lightrec_execute(struct lightrec_state *state, u32 pc, u32 target_cycle) { - s32 (*func)(void *, s32) = (void *)state->dispatcher->function; + s32 (*func)(struct lightrec_state *, u32, void *, s32) = (void *)state->dispatcher->function; void *block_trace; s32 cycles_delta; @@ -1620,7 +1639,8 @@ u32 lightrec_execute(struct 
lightrec_state *state, u32 pc, u32 target_cycle) if (block_trace) { cycles_delta = state->target_cycle - state->current_cycle; - cycles_delta = (*func)(block_trace, cycles_delta); + cycles_delta = (*func)(state, state->next_pc, + block_trace, cycles_delta); state->current_cycle = state->target_cycle - cycles_delta; } diff --git a/deps/lightrec/lightrec.h b/deps/lightrec/lightrec.h index 310036ce..9cd7f478 100644 --- a/deps/lightrec/lightrec.h +++ b/deps/lightrec/lightrec.h @@ -89,6 +89,7 @@ struct lightrec_ops { void (*cop2_op)(struct lightrec_state *state, u32 op); void (*enable_ram)(struct lightrec_state *state, _Bool enable); _Bool (*hw_direct)(u32 kaddr, _Bool is_write, u8 size); + void (*code_inv)(void *addr, u32 len); }; struct lightrec_registers { diff --git a/deps/lightrec/optimizer.c b/deps/lightrec/optimizer.c index 10067a7d..04d9d809 100644 --- a/deps/lightrec/optimizer.c +++ b/deps/lightrec/optimizer.c @@ -3,6 +3,7 @@ * Copyright (C) 2014-2021 Paul Cercueil */ +#include "constprop.h" #include "lightrec-config.h" #include "disassembler.h" #include "lightrec.h" @@ -268,15 +269,13 @@ static int find_next_reader(const struct opcode *list, unsigned int offset, u8 r for (i = offset; ; i++) { c = list[i].c; - if (opcode_reads_register(c, reg)) { - if (i > 0 && has_delay_slot(list[i - 1].c)) - break; - + if (opcode_reads_register(c, reg)) return i; - } - if (op_flag_sync(list[i].flags) || - has_delay_slot(c) || opcode_writes_register(c, reg)) + if (op_flag_sync(list[i].flags) + || (op_flag_no_ds(list[i].flags) && has_delay_slot(c)) + || is_delay_slot(list, i) + || opcode_writes_register(c, reg)) break; } @@ -287,7 +286,7 @@ static bool reg_is_dead(const struct opcode *list, unsigned int offset, u8 reg) { unsigned int i; - if (op_flag_sync(list[offset].flags)) + if (op_flag_sync(list[offset].flags) || is_delay_slot(list, offset)) return false; for (i = offset + 1; ; i++) { @@ -497,410 +496,127 @@ bool load_in_delay_slot(union code op) return false; } -static u32 
lightrec_propagate_consts(const struct opcode *op, - const struct opcode *prev, - u32 known, u32 *v) +static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset, + struct constprop_data *v) { - union code c = prev->c; - - /* Register $zero is always, well, zero */ - known |= BIT(0); - v[0] = 0; - - if (op_flag_sync(op->flags)) - return BIT(0); - - switch (c.i.op) { - case OP_SPECIAL: - switch (c.r.op) { - case OP_SPECIAL_SLL: - if (known & BIT(c.r.rt)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] << c.r.imm; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SRL: - if (known & BIT(c.r.rt)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] >> c.r.imm; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SRA: - if (known & BIT(c.r.rt)) { - known |= BIT(c.r.rd); - v[c.r.rd] = (s32)v[c.r.rt] >> c.r.imm; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SLLV: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] << (v[c.r.rs] & 0x1f); - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SRLV: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] >> (v[c.r.rs] & 0x1f); - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SRAV: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = (s32)v[c.r.rt] - >> (v[c.r.rs] & 0x1f); - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_ADD: - case OP_SPECIAL_ADDU: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = (s32)v[c.r.rt] + (s32)v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SUB: - case OP_SPECIAL_SUBU: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] - v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_AND: - if (known & BIT(c.r.rt) && known & 
BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] & v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_OR: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] | v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_XOR: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rt] ^ v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_NOR: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = ~(v[c.r.rt] | v[c.r.rs]); - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SLT: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = (s32)v[c.r.rs] < (s32)v[c.r.rt]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_SLTU: - if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rs] < v[c.r.rt]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_SPECIAL_MULT: - case OP_SPECIAL_MULTU: - case OP_SPECIAL_DIV: - case OP_SPECIAL_DIVU: - if (OPT_FLAG_MULT_DIV && c.r.rd) - known &= ~BIT(c.r.rd); - if (OPT_FLAG_MULT_DIV && c.r.imm) - known &= ~BIT(c.r.imm); - break; - case OP_SPECIAL_MFLO: - case OP_SPECIAL_MFHI: - known &= ~BIT(c.r.rd); - break; - default: - break; - } - break; - case OP_META_MULT2: - case OP_META_MULTU2: - if (OPT_FLAG_MULT_DIV && (known & BIT(c.r.rs))) { - if (c.r.rd) { - known |= BIT(c.r.rd); - - if (c.r.op < 32) - v[c.r.rd] = v[c.r.rs] << c.r.op; - else - v[c.r.rd] = 0; - } - - if (c.r.imm) { - known |= BIT(c.r.imm); - - if (c.r.op >= 32) - v[c.r.imm] = v[c.r.rs] << (c.r.op - 32); - else if (c.i.op == OP_META_MULT2) - v[c.r.imm] = (s32) v[c.r.rs] >> (32 - c.r.op); - else - v[c.r.imm] = v[c.r.rs] >> (32 - c.r.op); - } - } else { - if (OPT_FLAG_MULT_DIV && c.r.rd) - known &= ~BIT(c.r.rd); - if (OPT_FLAG_MULT_DIV && c.r.imm) - known &= 
~BIT(c.r.imm); - } - break; - case OP_REGIMM: - break; - case OP_ADDI: - case OP_ADDIU: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = v[c.i.rs] + (s32)(s16)c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_SLTI: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = (s32)v[c.i.rs] < (s32)(s16)c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_SLTIU: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = v[c.i.rs] < (u32)(s32)(s16)c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_ANDI: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = v[c.i.rs] & c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_ORI: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = v[c.i.rs] | c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_XORI: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = v[c.i.rs] ^ c.i.imm; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_LUI: - known |= BIT(c.i.rt); - v[c.i.rt] = c.i.imm << 16; - break; - case OP_CP0: - switch (c.r.rs) { - case OP_CP0_MFC0: - case OP_CP0_CFC0: - known &= ~BIT(c.r.rt); - break; - } - break; - case OP_CP2: - if (c.r.op == OP_CP2_BASIC) { - switch (c.r.rs) { - case OP_CP2_BASIC_MFC2: - case OP_CP2_BASIC_CFC2: - known &= ~BIT(c.r.rt); - break; - } - } - break; - case OP_LB: - case OP_LH: - case OP_LWL: - case OP_LW: - case OP_LBU: - case OP_LHU: - case OP_LWR: - case OP_LWC2: - known &= ~BIT(c.i.rt); - break; - case OP_META_MOV: - if (known & BIT(c.r.rs)) { - known |= BIT(c.r.rd); - v[c.r.rd] = v[c.r.rs]; - } else { - known &= ~BIT(c.r.rd); - } - break; - case OP_META_EXTC: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = (s32)(s8)v[c.i.rs]; - } else { - known &= ~BIT(c.i.rt); - } - break; - case OP_META_EXTS: - if (known & BIT(c.i.rs)) { - known |= BIT(c.i.rt); - v[c.i.rt] = (s32)(s16)v[c.i.rs]; - } else { - known &= ~BIT(c.i.rt); 
- } - break; - default: - break; - } - - return known; -} - -static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset) -{ - struct opcode *prev, *prev2 = NULL, *curr = &list[offset]; + struct opcode *ldop = NULL, *curr = &list[offset], *next; struct opcode *to_change, *to_nop; int idx, idx2; if (curr->r.imm != 24 && curr->r.imm != 16) return; - idx = find_prev_writer(list, offset, curr->r.rt); + if (is_delay_slot(list, offset)) + return; + + idx = find_next_reader(list, offset + 1, curr->r.rd); if (idx < 0) return; - prev = &list[idx]; + next = &list[idx]; - if (prev->i.op != OP_SPECIAL || prev->r.op != OP_SPECIAL_SLL || - prev->r.imm != curr->r.imm || prev->r.rd != curr->r.rt) + if (next->i.op != OP_SPECIAL || next->r.op != OP_SPECIAL_SRA || + next->r.imm != curr->r.imm || next->r.rt != curr->r.rd) return; - if (prev->r.rd != prev->r.rt && curr->r.rd != curr->r.rt) { + if (curr->r.rd != curr->r.rt && next->r.rd != next->r.rt) { /* sll rY, rX, 16 * ... - * srl rZ, rY, 16 */ + * sra rZ, rY, 16 */ - if (!reg_is_dead(list, offset, curr->r.rt) || - reg_is_read_or_written(list, idx, offset, curr->r.rd)) + if (!reg_is_dead(list, idx, curr->r.rd) || + reg_is_read_or_written(list, offset, idx, next->r.rd)) return; /* If rY is dead after the SRL, and rZ is not used after the SLL, * we can change rY to rZ */ pr_debug("Detected SLL/SRA with middle temp register\n"); - prev->r.rd = curr->r.rd; - curr->r.rt = prev->r.rd; + curr->r.rd = next->r.rd; + next->r.rt = curr->r.rd; } - /* We got a SLL/SRA combo. If imm #16, that's a cast to u16. - * If imm #24 that's a cast to u8. + /* We got a SLL/SRA combo. If imm #16, that's a cast to s16. + * If imm #24 that's a cast to s8. * * First of all, make sure that the target register of the SLL is not - * read before the SRA. */ + * read after the SRA. */ - if (prev->r.rd == prev->r.rt) { + if (curr->r.rd == curr->r.rt) { /* sll rX, rX, 16 * ... 
- * srl rY, rX, 16 */ - to_change = curr; - to_nop = prev; + * sra rY, rX, 16 */ + to_change = next; + to_nop = curr; /* rX is used after the SRA - we cannot convert it. */ - if (prev->r.rd != curr->r.rd && !reg_is_dead(list, offset, prev->r.rd)) + if (curr->r.rd != next->r.rd && !reg_is_dead(list, idx, curr->r.rd)) return; } else { /* sll rY, rX, 16 * ... - * srl rY, rY, 16 */ - to_change = prev; - to_nop = curr; + * sra rY, rY, 16 */ + to_change = curr; + to_nop = next; } - idx2 = find_prev_writer(list, idx, prev->r.rt); + idx2 = find_prev_writer(list, offset, curr->r.rt); if (idx2 >= 0) { /* Note that PSX games sometimes do casts after * a LHU or LBU; in this case we can change the * load opcode to a LH or LB, and the cast can * be changed to a MOV or a simple NOP. */ - prev2 = &list[idx2]; + ldop = &list[idx2]; - if (curr->r.rd != prev2->i.rt && - !reg_is_dead(list, offset, prev2->i.rt)) - prev2 = NULL; - else if (curr->r.imm == 16 && prev2->i.op == OP_LHU) - prev2->i.op = OP_LH; - else if (curr->r.imm == 24 && prev2->i.op == OP_LBU) - prev2->i.op = OP_LB; + if (next->r.rd != ldop->i.rt && + !reg_is_dead(list, idx, ldop->i.rt)) + ldop = NULL; + else if (curr->r.imm == 16 && ldop->i.op == OP_LHU) + ldop->i.op = OP_LH; + else if (curr->r.imm == 24 && ldop->i.op == OP_LBU) + ldop->i.op = OP_LB; else - prev2 = NULL; + ldop = NULL; - if (prev2) { - if (curr->r.rd == prev2->i.rt) { + if (ldop) { + if (next->r.rd == ldop->i.rt) { to_change->opcode = 0; - } else if (reg_is_dead(list, offset, prev2->i.rt) && - !reg_is_read_or_written(list, idx2 + 1, offset, curr->r.rd)) { + } else if (reg_is_dead(list, idx, ldop->i.rt) && + !reg_is_read_or_written(list, idx2 + 1, idx, next->r.rd)) { /* The target register of the SRA is dead after the * LBU/LHU; we can change the target register of the * LBU/LHU to the one of the SRA. 
*/ - prev2->i.rt = curr->r.rd; + v[ldop->i.rt].known = 0; + v[ldop->i.rt].sign = 0; + ldop->i.rt = next->r.rd; to_change->opcode = 0; } else { to_change->i.op = OP_META_MOV; - to_change->r.rd = curr->r.rd; - to_change->r.rs = prev2->i.rt; + to_change->r.rd = next->r.rd; + to_change->r.rs = ldop->i.rt; } if (to_nop->r.imm == 24) pr_debug("Convert LBU+SLL+SRA to LB\n"); else pr_debug("Convert LHU+SLL+SRA to LH\n"); + + v[ldop->i.rt].known = 0; + v[ldop->i.rt].sign = 0xffffff80 << 24 - curr->r.imm; } } - if (!prev2) { + if (!ldop) { pr_debug("Convert SLL/SRA #%u to EXT%c\n", - prev->r.imm, - prev->r.imm == 24 ? 'C' : 'S'); + curr->r.imm, curr->r.imm == 24 ? 'C' : 'S'); - if (to_change == prev) { - to_change->i.rs = prev->r.rt; - to_change->i.rt = curr->r.rd; + if (to_change == curr) { + to_change->i.rs = curr->r.rt; + to_change->i.rt = next->r.rd; } else { - to_change->i.rt = curr->r.rd; - to_change->i.rs = prev->r.rt; + to_change->i.rt = next->r.rd; + to_change->i.rs = curr->r.rt; } if (to_nop->r.imm == 24) @@ -912,21 +628,22 @@ static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset) to_nop->opcode = 0; } -static void lightrec_remove_useless_lui(struct block *block, unsigned int offset, - u32 known, u32 *values) +static void +lightrec_remove_useless_lui(struct block *block, unsigned int offset, + const struct constprop_data *v) { struct opcode *list = block->opcode_list, *op = &block->opcode_list[offset]; int reader; - if (!op_flag_sync(op->flags) && (known & BIT(op->i.rt)) && - values[op->i.rt] == op->i.imm << 16) { + if (!op_flag_sync(op->flags) && is_known(v, op->i.rt) && + v[op->i.rt].value == op->i.imm << 16) { pr_debug("Converting duplicated LUI to NOP\n"); op->opcode = 0x0; return; } - if (op->i.imm != 0 || op->i.rt == 0) + if (op->i.imm != 0 || op->i.rt == 0 || offset == block->nb_ops - 1) return; reader = find_next_reader(list, offset + 1, op->i.rt); @@ -1010,21 +727,142 @@ static inline bool is_power_of_two(u32 value) return 
popcount32(value) == 1; } +static void lightrec_patch_known_zero(struct opcode *op, + const struct constprop_data *v) +{ + switch (op->i.op) { + case OP_SPECIAL: + switch (op->r.op) { + case OP_SPECIAL_JR: + case OP_SPECIAL_JALR: + case OP_SPECIAL_MTHI: + case OP_SPECIAL_MTLO: + if (is_known_zero(v, op->r.rs)) + op->r.rs = 0; + break; + default: + if (is_known_zero(v, op->r.rs)) + op->r.rs = 0; + fallthrough; + case OP_SPECIAL_SLL: + case OP_SPECIAL_SRL: + case OP_SPECIAL_SRA: + if (is_known_zero(v, op->r.rt)) + op->r.rt = 0; + break; + case OP_SPECIAL_SYSCALL: + case OP_SPECIAL_BREAK: + case OP_SPECIAL_MFHI: + case OP_SPECIAL_MFLO: + break; + } + break; + case OP_CP0: + switch (op->r.rs) { + case OP_CP0_MTC0: + case OP_CP0_CTC0: + if (is_known_zero(v, op->r.rt)) + op->r.rt = 0; + break; + default: + break; + } + break; + case OP_CP2: + if (op->r.op == OP_CP2_BASIC) { + switch (op->r.rs) { + case OP_CP2_BASIC_MTC2: + case OP_CP2_BASIC_CTC2: + if (is_known_zero(v, op->r.rt)) + op->r.rt = 0; + break; + default: + break; + } + } + break; + case OP_BEQ: + case OP_BNE: + if (is_known_zero(v, op->i.rt)) + op->i.rt = 0; + fallthrough; + case OP_REGIMM: + case OP_BLEZ: + case OP_BGTZ: + case OP_ADDI: + case OP_ADDIU: + case OP_SLTI: + case OP_SLTIU: + case OP_ANDI: + case OP_ORI: + case OP_XORI: + case OP_META_MOV: + case OP_META_EXTC: + case OP_META_EXTS: + case OP_META_MULT2: + case OP_META_MULTU2: + if (is_known_zero(v, op->i.rs)) + op->i.rs = 0; + break; + case OP_SB: + case OP_SH: + case OP_SWL: + case OP_SW: + case OP_SWR: + if (is_known_zero(v, op->i.rt)) + op->i.rt = 0; + fallthrough; + case OP_LB: + case OP_LH: + case OP_LWL: + case OP_LW: + case OP_LBU: + case OP_LHU: + case OP_LWR: + case OP_LWC2: + case OP_SWC2: + if (is_known(v, op->i.rs) + && kunseg(v[op->i.rs].value) == 0) + op->i.rs = 0; + break; + default: + break; + } +} + +static void lightrec_reset_syncs(struct block *block) +{ + struct opcode *op, *list = block->opcode_list; + unsigned int i; + s32 
offset; + + for (i = 0; i < block->nb_ops; i++) + list[i].flags &= ~LIGHTREC_SYNC; + + for (i = 0; i < block->nb_ops; i++) { + op = &list[i]; + + if (op_flag_local_branch(op->flags) && has_delay_slot(op->c)) { + offset = i + 1 + (s16)op->i.imm; + list[offset].flags |= LIGHTREC_SYNC; + } + } +} + static int lightrec_transform_ops(struct lightrec_state *state, struct block *block) { - struct opcode *list = block->opcode_list; - struct opcode *prev, *op = NULL; - u32 known = BIT(0); - u32 values[32] = { 0 }; + struct opcode *op, *list = block->opcode_list; + struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER; unsigned int i; + bool local; u8 tmp; for (i = 0; i < block->nb_ops; i++) { - prev = op; op = &list[i]; - if (prev) - known = lightrec_propagate_consts(op, prev, known, values); + lightrec_consts_propagate(list, i, v); + + lightrec_patch_known_zero(op, v); /* Transform all opcodes detected as useless to real NOPs * (0x0: SLL r0, r0, #0) */ @@ -1039,9 +877,24 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl switch (op->i.op) { case OP_BEQ: - if (op->i.rs == op->i.rt) { + if (op->i.rs == op->i.rt || + (is_known(v, op->i.rs) && is_known(v, op->i.rt) && + v[op->i.rs].value == v[op->i.rt].value)) { + if (op->i.rs != op->i.rt) + pr_debug("Found always-taken BEQ\n"); + op->i.rs = 0; op->i.rt = 0; + } else if (v[op->i.rs].known & v[op->i.rt].known & + (v[op->i.rs].value ^ v[op->i.rt].value)) { + pr_debug("Found never-taken BEQ\n"); + + local = op_flag_local_branch(op->flags); + op->opcode = 0; + op->flags = 0; + + if (local) + lightrec_reset_syncs(block); } else if (op->i.rs == 0) { op->i.rs = op->i.rt; op->i.rt = 0; @@ -1049,16 +902,58 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl break; case OP_BNE: - if (op->i.rs == 0) { + if (v[op->i.rs].known & v[op->i.rt].known & + (v[op->i.rs].value ^ v[op->i.rt].value)) { + pr_debug("Found always-taken BNE\n"); + + op->i.op = OP_BEQ; + op->i.rs = 
0; + op->i.rt = 0; + } else if (is_known(v, op->i.rs) && is_known(v, op->i.rt) && + v[op->i.rs].value == v[op->i.rt].value) { + pr_debug("Found never-taken BNE\n"); + + local = op_flag_local_branch(op->flags); + op->opcode = 0; + op->flags = 0; + + if (local) + lightrec_reset_syncs(block); + } else if (op->i.rs == 0) { op->i.rs = op->i.rt; op->i.rt = 0; } break; + case OP_BLEZ: + if (v[op->i.rs].known & BIT(31) && + v[op->i.rs].value & BIT(31)) { + pr_debug("Found always-taken BLEZ\n"); + + op->i.op = OP_BEQ; + op->i.rs = 0; + op->i.rt = 0; + } + break; + + case OP_BGTZ: + if (v[op->i.rs].known & BIT(31) && + v[op->i.rs].value & BIT(31)) { + pr_debug("Found never-taken BGTZ\n"); + + local = op_flag_local_branch(op->flags); + op->opcode = 0; + op->flags = 0; + + if (local) + lightrec_reset_syncs(block); + } + break; + case OP_LUI: - if (!prev || !has_delay_slot(prev->c)) + if (i == 0 || !has_delay_slot(list[i - 1].c)) lightrec_modify_lui(block, i); - lightrec_remove_useless_lui(block, i, known, values); + lightrec_remove_useless_lui(block, i, v); break; /* Transform ORI/ADDI/ADDIU with imm #0 or ORR/ADD/ADDU/SUB/SUBU @@ -1072,8 +967,59 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl op->r.rd = op->i.rt; } break; + case OP_ANDI: + if (bits_are_known_zero(v, op->i.rs, ~op->i.imm)) { + pr_debug("Found useless ANDI 0x%x\n", op->i.imm); + + if (op->i.rs == op->i.rt) { + op->opcode = 0; + } else { + op->i.op = OP_META_MOV; + op->r.rd = op->i.rt; + } + } + break; + case OP_REGIMM: + switch (op->r.rt) { + case OP_REGIMM_BLTZ: + case OP_REGIMM_BGEZ: + if (!(v[op->r.rs].known & BIT(31))) + break; + + if (!!(v[op->r.rs].value & BIT(31)) + ^ (op->r.rt == OP_REGIMM_BGEZ)) { + pr_debug("Found always-taken BLTZ/BGEZ\n"); + op->i.op = OP_BEQ; + op->i.rs = 0; + op->i.rt = 0; + } else { + pr_debug("Found never-taken BLTZ/BGEZ\n"); + + local = op_flag_local_branch(op->flags); + op->opcode = 0; + op->flags = 0; + + if (local) + 
lightrec_reset_syncs(block); + } + break; + case OP_REGIMM_BLTZAL: + case OP_REGIMM_BGEZAL: + /* TODO: Detect always-taken and replace with JAL */ + break; + } + break; case OP_SPECIAL: switch (op->r.op) { + case OP_SPECIAL_SRAV: + if ((v[op->r.rs].known & 0x1f) != 0x1f) + break; + + pr_debug("Convert SRAV to SRA\n"); + op->r.imm = v[op->r.rs].value & 0x1f; + op->r.op = OP_SPECIAL_SRA; + + fallthrough; case OP_SPECIAL_SRA: if (op->r.imm == 0) { pr_debug("Convert SRA #0 to MOV\n"); @@ -1081,38 +1027,65 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl op->r.rs = op->r.rt; break; } - - lightrec_optimize_sll_sra(block->opcode_list, i); break; + + case OP_SPECIAL_SLLV: + if ((v[op->r.rs].known & 0x1f) != 0x1f) + break; + + pr_debug("Convert SLLV to SLL\n"); + op->r.imm = v[op->r.rs].value & 0x1f; + op->r.op = OP_SPECIAL_SLL; + + fallthrough; case OP_SPECIAL_SLL: + if (op->r.imm == 0) { + pr_debug("Convert SLL #0 to MOV\n"); + op->i.op = OP_META_MOV; + op->r.rs = op->r.rt; + } + + lightrec_optimize_sll_sra(block->opcode_list, i, v); + break; + + case OP_SPECIAL_SRLV: + if ((v[op->r.rs].known & 0x1f) != 0x1f) + break; + + pr_debug("Convert SRLV to SRL\n"); + op->r.imm = v[op->r.rs].value & 0x1f; + op->r.op = OP_SPECIAL_SRL; + + fallthrough; case OP_SPECIAL_SRL: if (op->r.imm == 0) { - pr_debug("Convert SLL/SRL #0 to MOV\n"); + pr_debug("Convert SRL #0 to MOV\n"); op->i.op = OP_META_MOV; op->r.rs = op->r.rt; } break; + case OP_SPECIAL_MULT: case OP_SPECIAL_MULTU: - if ((known & BIT(op->r.rs)) && - is_power_of_two(values[op->r.rs])) { + if (is_known(v, op->r.rs) && + is_power_of_two(v[op->r.rs].value)) { tmp = op->c.i.rs; op->c.i.rs = op->c.i.rt; op->c.i.rt = tmp; - } else if (!(known & BIT(op->r.rt)) || - !is_power_of_two(values[op->r.rt])) { + } else if (!is_known(v, op->r.rt) || + !is_power_of_two(v[op->r.rt].value)) { break; } pr_debug("Multiply by power-of-two: %u\n", - values[op->r.rt]); + v[op->r.rt].value); if (op->r.op == 
OP_SPECIAL_MULT) op->i.op = OP_META_MULT2; else op->i.op = OP_META_MULTU2; - op->r.op = ctz32(values[op->r.rt]); + op->r.op = ctz32(v[op->r.rt].value); break; case OP_SPECIAL_OR: case OP_SPECIAL_ADD: @@ -1218,8 +1191,7 @@ static int lightrec_switch_delay_slots(struct lightrec_state *state, struct bloc op.opcode == 0 || next_op.opcode == 0) continue; - if (i && has_delay_slot(block->opcode_list[i - 1].c) && - !op_flag_no_ds(block->opcode_list[i - 1].flags)) + if (is_delay_slot(block->opcode_list, i)) continue; if (op_flag_sync(next->flags)) @@ -1368,12 +1340,11 @@ static int lightrec_local_branches(struct lightrec_state *state, struct block *b continue; } - pr_debug("Adding sync at offset 0x%x\n", offset << 2); - - block->opcode_list[offset].flags |= LIGHTREC_SYNC; list->flags |= LIGHTREC_LOCAL_BRANCH; } + lightrec_reset_syncs(block); + return 0; } @@ -1401,6 +1372,13 @@ bool has_delay_slot(union code op) } } +bool is_delay_slot(const struct opcode *list, unsigned int offset) +{ + return offset > 0 + && !op_flag_no_ds(list[offset - 1].flags) + && has_delay_slot(list[offset - 1].c); +} + bool should_emulate(const struct opcode *list) { return op_flag_emulate_branch(list->flags) && has_delay_slot(list->c); @@ -1565,20 +1543,17 @@ static int lightrec_early_unload(struct lightrec_state *state, struct block *blo static int lightrec_flag_io(struct lightrec_state *state, struct block *block) { - struct opcode *prev = NULL, *list = NULL; + struct opcode *list; enum psx_map psx_map; - u32 known = BIT(0); - u32 values[32] = { 0 }; + struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER; unsigned int i; u32 val, kunseg_val; bool no_mask; for (i = 0; i < block->nb_ops; i++) { - prev = list; list = &block->opcode_list[i]; - if (prev) - known = lightrec_propagate_consts(list, prev, known, values); + lightrec_consts_propagate(block->opcode_list, i, v); switch (list->i.op) { case OP_SB: @@ -1601,10 +1576,10 @@ static int lightrec_flag_io(struct lightrec_state *state, struct 
block *block) /* Detect writes whose destination address is inside the * current block, using constant propagation. When these * occur, we mark the blocks as not compilable. */ - if ((known & BIT(list->i.rs)) && - kunseg(values[list->i.rs]) >= kunseg(block->pc) && - kunseg(values[list->i.rs]) < (kunseg(block->pc) + - block->nb_ops * 4)) { + if (is_known(v, list->i.rs) && + kunseg(v[list->i.rs].value) >= kunseg(block->pc) && + kunseg(v[list->i.rs].value) < (kunseg(block->pc) + + block->nb_ops * 4)) { pr_debug("Self-modifying block detected\n"); block_set_flags(block, BLOCK_NEVER_COMPILE); list->flags |= LIGHTREC_SMC; @@ -1622,13 +1597,22 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block) case OP_LWL: case OP_LWR: case OP_LWC2: - if (OPT_FLAG_IO && (known & BIT(list->i.rs))) { - val = values[list->i.rs] + (s16) list->i.imm; - kunseg_val = kunseg(val); - psx_map = lightrec_get_map_idx(state, kunseg_val); + if (OPT_FLAG_IO && + (v[list->i.rs].known | v[list->i.rs].sign)) { + psx_map = lightrec_get_constprop_map(state, v, + list->i.rs, + (s16) list->i.imm); + + if (psx_map != PSX_MAP_UNKNOWN && !is_known(v, list->i.rs)) + pr_debug("Detected map thanks to bit-level const propagation!\n"); list->flags &= ~LIGHTREC_IO_MASK; - no_mask = val == kunseg_val; + + val = v[list->i.rs].value + (s16) list->i.imm; + kunseg_val = kunseg(val); + + no_mask = (v[list->i.rs].known & ~v[list->i.rs].value + & 0xe0000000) == 0xe0000000; switch (psx_map) { case PSX_MAP_KERNEL_USER_RAM: @@ -1670,13 +1654,13 @@ static int lightrec_flag_io(struct lightrec_state *state, struct block *block) if (no_mask) list->flags |= LIGHTREC_NO_MASK; - break; + } else { + pr_debug("Flagging opcode %u as I/O access\n", + i); + list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW); } - fallthrough; + break; default: - pr_debug("Flagging opcode %u as I/O access\n", - i); - list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW); break; } } @@ -1870,18 +1854,15 @@ static bool 
lightrec_always_skip_div_check(void) static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *block) { - struct opcode *prev, *list = NULL; + struct opcode *list = NULL; + struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER; u8 reg_hi, reg_lo; unsigned int i; - u32 known = BIT(0); - u32 values[32] = { 0 }; for (i = 0; i < block->nb_ops - 1; i++) { - prev = list; list = &block->opcode_list[i]; - if (prev) - known = lightrec_propagate_consts(list, prev, known, values); + lightrec_consts_propagate(block->opcode_list, i, v); switch (list->i.op) { case OP_SPECIAL: @@ -1891,8 +1872,9 @@ static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block * /* If we are dividing by a non-zero constant, don't * emit the div-by-zero check. */ if (lightrec_always_skip_div_check() || - ((known & BIT(list->c.r.rt)) && values[list->c.r.rt])) + (v[list->r.rt].known & v[list->r.rt].value)) { list->flags |= LIGHTREC_NO_DIV_CHECK; + } fallthrough; case OP_SPECIAL_MULT: case OP_SPECIAL_MULTU: @@ -1909,7 +1891,7 @@ static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block * } /* Don't support opcodes in delay slots */ - if ((i && has_delay_slot(block->opcode_list[i - 1].c)) || + if (is_delay_slot(block->opcode_list, i) || op_flag_no_ds(list->flags)) { continue; } diff --git a/deps/lightrec/optimizer.h b/deps/lightrec/optimizer.h index c8290286..825042df 100644 --- a/deps/lightrec/optimizer.h +++ b/deps/lightrec/optimizer.h @@ -14,6 +14,7 @@ struct opcode; _Bool opcode_reads_register(union code op, u8 reg); _Bool opcode_writes_register(union code op, u8 reg); _Bool has_delay_slot(union code op); +_Bool is_delay_slot(const struct opcode *list, unsigned int offset); _Bool load_in_delay_slot(union code op); _Bool opcode_is_io(union code op); _Bool is_unconditional_jump(union code c); diff --git a/deps/lightrec/regcache.c b/deps/lightrec/regcache.c index 1f11d8a2..c62ba3d5 100644 --- a/deps/lightrec/regcache.c +++ 
b/deps/lightrec/regcache.c @@ -11,6 +11,8 @@ #include #include +#define REG_PC (offsetof(struct lightrec_state, next_pc) / sizeof(u32)) + enum reg_priority { REG_IS_TEMP, REG_IS_TEMP_VALUE, @@ -24,7 +26,7 @@ enum reg_priority { struct native_register { bool used, output, extend, extended, zero_extend, zero_extended, locked; - s8 emulated_register; + s16 emulated_register; intptr_t value; enum reg_priority prio; }; @@ -61,7 +63,7 @@ static inline bool lightrec_reg_is_zero(u8 jit_reg) return false; } -static inline s8 lightrec_get_hardwired_reg(u8 reg) +static inline s8 lightrec_get_hardwired_reg(u16 reg) { #if defined(__mips__) || defined(__alpha__) || defined(__riscv) if (reg == 0) @@ -146,7 +148,7 @@ static struct native_register * alloc_temp(struct regcache *cache) for (i = ARRAY_SIZE(cache->lightrec_regs); i; i--) { elm = &cache->lightrec_regs[i - 1]; - if (!elm->used && elm->prio < best) { + if (!elm->used && !elm->locked && elm->prio < best) { nreg = elm; best = elm->prio; @@ -159,7 +161,7 @@ static struct native_register * alloc_temp(struct regcache *cache) } static struct native_register * find_mapped_reg(struct regcache *cache, - u8 reg, bool out) + u16 reg, bool out) { unsigned int i; @@ -175,7 +177,7 @@ static struct native_register * find_mapped_reg(struct regcache *cache, } static struct native_register * alloc_in_out(struct regcache *cache, - u8 reg, bool out) + u16 reg, bool out) { struct native_register *elm, *nreg = NULL; enum reg_priority best = REG_NB_PRIORITIES; @@ -191,7 +193,7 @@ static struct native_register * alloc_in_out(struct regcache *cache, for (i = 0; i < ARRAY_SIZE(cache->lightrec_regs); i++) { elm = &cache->lightrec_regs[i]; - if (!elm->used && elm->prio < best) { + if (!elm->used && !elm->locked && elm->prio < best) { nreg = elm; best = elm->prio; @@ -237,21 +239,6 @@ void lightrec_unload_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg) lightning_reg_to_lightrec(cache, jit_reg), jit_reg); } -/* lightrec_lock_reg: the 
register will be cleaned if dirty, then locked. - * A locked register cannot only be used as input, not output. */ -void lightrec_lock_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg) -{ - struct native_register *reg; - - if (lightrec_reg_is_zero(jit_reg)) - return; - - reg = lightning_reg_to_lightrec(cache, jit_reg); - lightrec_clean_reg(cache, _jit, jit_reg); - - reg->locked = true; -} - u8 lightrec_alloc_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg) { struct native_register *reg; @@ -313,7 +300,7 @@ void lightrec_temp_set_value(struct regcache *cache, u8 jit_reg, intptr_t value) } u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 flags) + u16 reg, u8 flags) { struct native_register *nreg; u8 jit_reg; @@ -347,7 +334,7 @@ u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, } u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 flags) + u16 reg, u8 flags) { struct native_register *nreg; u8 jit_reg; @@ -416,33 +403,91 @@ u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, return jit_reg; } -u8 lightrec_request_reg_in(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 jit_reg) +static bool reg_pc_is_mapped(struct regcache *cache) { - struct native_register *nreg; + struct native_register *nreg = lightning_reg_to_lightrec(cache, JIT_V0); + + return nreg->prio == REG_IS_LOADED && nreg->emulated_register == REG_PC; +} + +void lightrec_load_imm(struct regcache *cache, + jit_state_t *_jit, u8 jit_reg, u32 pc, u32 imm) +{ + s32 delta = imm - pc; + + if (!reg_pc_is_mapped(cache) || !can_sign_extend(delta, 16)) + jit_movi(jit_reg, imm); + else if (jit_reg != JIT_V0 || delta) + jit_addi(jit_reg, JIT_V0, delta); +} + +void lightrec_load_next_pc_imm(struct regcache *cache, + jit_state_t *_jit, u32 pc, u32 imm) +{ + struct native_register *nreg = lightning_reg_to_lightrec(cache, JIT_V0); + + if (reg_pc_is_mapped(cache)) { + /* JIT_V0 contains next PC - so we 
can overwrite it */ + lightrec_load_imm(cache, _jit, JIT_V0, pc, imm); + } else { + /* JIT_V0 contains something else - invalidate it */ + lightrec_unload_reg(cache, _jit, JIT_V0); + + jit_movi(JIT_V0, imm); + } + + nreg->prio = REG_IS_LOADED; + nreg->emulated_register = -1; + nreg->locked = true; +} + +void lightrec_load_next_pc(struct regcache *cache, jit_state_t *_jit, u8 reg) +{ + struct native_register *nreg_v0, *nreg; u16 offset; + u8 jit_reg; + + /* Invalidate JIT_V0 if it is not mapped to 'reg' */ + nreg_v0 = lightning_reg_to_lightrec(cache, JIT_V0); + if (nreg_v0->prio >= REG_IS_LOADED && nreg_v0->emulated_register != reg) + lightrec_unload_nreg(cache, _jit, nreg_v0, JIT_V0); nreg = find_mapped_reg(cache, reg, false); - if (nreg) { - jit_reg = lightrec_reg_to_lightning(cache, nreg); - nreg->used = true; - return jit_reg; - } + if (!nreg) { + /* Not mapped - load the value from the register cache */ - nreg = lightning_reg_to_lightrec(cache, jit_reg); - lightrec_unload_nreg(cache, _jit, nreg, jit_reg); + offset = offsetof(struct lightrec_state, regs.gpr) + (reg << 2); + jit_ldxi_ui(JIT_V0, LIGHTREC_REG_STATE, offset); - /* Load previous value from register cache */ - offset = offsetof(struct lightrec_state, regs.gpr) + (reg << 2); - jit_ldxi_i(jit_reg, LIGHTREC_REG_STATE, offset); + nreg_v0->prio = REG_IS_LOADED; + nreg_v0->emulated_register = reg; - nreg->extended = true; - nreg->zero_extended = false; - nreg->used = true; - nreg->emulated_register = reg; - nreg->prio = REG_IS_LOADED; + } else if (nreg == nreg_v0) { + /* The target register 'reg' is mapped to JIT_V0 */ - return jit_reg; + if (!nreg->zero_extended) + jit_extr_ui(JIT_V0, JIT_V0); + + } else { + /* The target register 'reg' is mapped elsewhere. In that case, + * move the register's value to JIT_V0 and re-map it in the + * register cache. We can then safely discard the original + * mapped register (even if it was dirty). 
*/ + + jit_reg = lightrec_reg_to_lightning(cache, nreg); + if (nreg->zero_extended) + jit_movr(JIT_V0, jit_reg); + else + jit_extr_ui(JIT_V0, jit_reg); + + *nreg_v0 = *nreg; + lightrec_discard_nreg(nreg); + } + + lightrec_clean_reg(cache, _jit, JIT_V0); + + nreg_v0->zero_extended = true; + nreg_v0->locked = true; } static void free_reg(struct native_register *nreg) @@ -535,7 +580,7 @@ void lightrec_clean_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg) } void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit, - u8 reg, bool unload) + u16 reg, bool unload) { struct native_register *nreg; u8 jit_reg; @@ -551,7 +596,7 @@ void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit, } } -void lightrec_discard_reg_if_loaded(struct regcache *cache, u8 reg) +void lightrec_discard_reg_if_loaded(struct regcache *cache, u16 reg) { struct native_register *nreg; @@ -584,6 +629,17 @@ void lightrec_regcache_reset(struct regcache *cache) memset(&cache->lightrec_regs, 0, sizeof(cache->lightrec_regs)); } +void lightrec_preload_pc(struct regcache *cache) +{ + struct native_register *nreg; + + /* The block's PC is loaded in JIT_V0 at the start of the block */ + nreg = lightning_reg_to_lightrec(cache, JIT_V0); + nreg->emulated_register = REG_PC; + nreg->prio = REG_IS_LOADED; + nreg->zero_extended = true; +} + struct regcache * lightrec_regcache_init(struct lightrec_state *state) { struct regcache *cache; diff --git a/deps/lightrec/regcache.h b/deps/lightrec/regcache.h index cffbf053..d242c54b 100644 --- a/deps/lightrec/regcache.h +++ b/deps/lightrec/regcache.h @@ -41,12 +41,15 @@ struct regcache; u8 lightrec_alloc_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg); u8 lightrec_alloc_reg_temp(struct regcache *cache, jit_state_t *_jit); u8 lightrec_alloc_reg_out(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 flags); + u16 reg, u8 flags); u8 lightrec_alloc_reg_in(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 flags); 
+ u16 reg, u8 flags); -u8 lightrec_request_reg_in(struct regcache *cache, jit_state_t *_jit, - u8 reg, u8 jit_reg); +void lightrec_load_imm(struct regcache *cache, + jit_state_t *_jit, u8 jit_reg, u32 pc, u32 imm); +void lightrec_load_next_pc(struct regcache *cache, jit_state_t *_jit, u8 reg); +void lightrec_load_next_pc_imm(struct regcache *cache, + jit_state_t *_jit, u32 pc, u32 imm); s8 lightrec_get_reg_with_value(struct regcache *cache, intptr_t value); void lightrec_temp_set_value(struct regcache *cache, u8 jit_reg, intptr_t value); @@ -55,8 +58,8 @@ u8 lightrec_get_reg_in_flags(struct regcache *cache, u8 jit_reg); void lightrec_set_reg_out_flags(struct regcache *cache, u8 jit_reg, u8 flags); void lightrec_regcache_reset(struct regcache *cache); +void lightrec_preload_pc(struct regcache *cache); -void lightrec_lock_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg); void lightrec_free_reg(struct regcache *cache, u8 jit_reg); void lightrec_free_regs(struct regcache *cache); void lightrec_clean_reg(struct regcache *cache, jit_state_t *_jit, u8 jit_reg); @@ -66,11 +69,11 @@ void lightrec_storeback_regs(struct regcache *cache, jit_state_t *_jit); _Bool lightrec_has_dirty_regs(struct regcache *cache); void lightrec_clean_reg_if_loaded(struct regcache *cache, jit_state_t *_jit, - u8 reg, _Bool unload); -void lightrec_discard_reg_if_loaded(struct regcache *cache, u8 reg); + u16 reg, _Bool unload); +void lightrec_discard_reg_if_loaded(struct regcache *cache, u16 reg); u8 lightrec_alloc_reg_in_address(struct regcache *cache, - jit_state_t *_jit, u8 reg, s16 offset); + jit_state_t *_jit, u16 reg, s16 offset); struct native_register * lightrec_regcache_enter_branch(struct regcache *cache); void lightrec_regcache_leave_branch(struct regcache *cache,