From fde7a317052fd7e7111b0e003e65d0d1b8159cac Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 29 Jun 2024 01:02:05 +0300 Subject: [PATCH 01/16] standalone: some ppf loading mostly for testing, like game.chd.ppf would load game.chd and apply game.chd.ppf patch --- frontend/menu.c | 19 +++++++++++++++++-- libpcsxcore/misc.c | 2 +- libpcsxcore/ppf.c | 43 +++++++++++++++++++++++-------------------- libpcsxcore/ppf.h | 2 +- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/frontend/menu.c b/frontend/menu.c index 39f0a697..eb3237e3 100644 --- a/frontend/menu.c +++ b/frontend/menu.c @@ -40,6 +40,7 @@ #include "../libpcsxcore/cdrom.h" #include "../libpcsxcore/cdriso.h" #include "../libpcsxcore/cheat.h" +#include "../libpcsxcore/ppf.h" #include "../libpcsxcore/new_dynarec/new_dynarec.h" #include "../plugins/dfsound/spu_config.h" #include "psemu_plugin_defs.h" @@ -740,7 +741,7 @@ static const char *filter_exts[] = { #ifdef HAVE_CHD "chd", #endif - "bz", "znx", "pbp", "cbn", NULL + "bz", "znx", "pbp", "cbn", "ppf", NULL }; // rrrr rggg gggb bbbb @@ -2157,6 +2158,18 @@ static int run_exe(void) static int run_cd_image(const char *fname) { int autoload_state = g_autostateld_opt; + size_t fname_len = strlen(fname); + const char *ppfname = NULL; + char fname2[256]; + + // simle ppf handling, like game.chd.ppf + if (4 < fname_len && fname_len < sizeof(fname2) + && strcasecmp(fname + fname_len - 4, ".ppf") == 0) { + memcpy(fname2, fname, fname_len - 4); + fname2[fname_len - 4] = 0; + ppfname = fname; + fname = fname2; + } ready_to_go = 0; reload_plugins(fname); @@ -2170,6 +2183,8 @@ static int run_cd_image(const char *fname) menu_update_msg("unsupported/invalid CD image"); return -1; } + if (ppfname) + BuildPPFCache(ppfname); SysReset(); @@ -2185,7 +2200,7 @@ static int run_cd_image(const char *fname) if (autoload_state) { unsigned int newest = 0; - int time, slot, newest_slot = -1; + int time = 0, slot, newest_slot = -1; for (slot = 0; slot < 10; slot++) { if (emu_check_save_file(slot, &time)) { diff --git a/libpcsxcore/misc.c b/libpcsxcore/misc.c index 47a32cce..aafe5221 100644 --- a/libpcsxcore/misc.c +++ b/libpcsxcore/misc.c @@ -466,7 +466,7 @@ int CheckCdrom() { Apply_Hacks_Cdrom(); - BuildPPFCache(); + BuildPPFCache(NULL); return 0; } diff --git a/libpcsxcore/ppf.c b/libpcsxcore/ppf.c index f37687cc..6a88e053 100644 --- a/libpcsxcore/ppf.c +++ b/libpcsxcore/ppf.c @@ -181,7 +181,7 @@ static void AddToPPF(s32 ladr, s32 pos, s32 anz, unsigned char *ppfmem) { } } -void BuildPPFCache() { +void BuildPPFCache(const char *fname) { FILE *ppffile; char buffer[12]; char method, undo = 0, blockcheck = 0; @@ -196,23 +196,25 @@ void BuildPPFCache() { if (CdromId[0] == '\0') return; - // Generate filename in the format of SLUS_123.45 - buffer[0] = toupper(CdromId[0]); - buffer[1] = toupper(CdromId[1]); - buffer[2] = toupper(CdromId[2]); - buffer[3] = toupper(CdromId[3]); - buffer[4] = '_'; - buffer[5] = CdromId[4]; - buffer[6] = CdromId[5]; - buffer[7] = CdromId[6]; - buffer[8] = '.'; - buffer[9] = CdromId[7]; - buffer[10] = CdromId[8]; - buffer[11] = '\0'; - - sprintf(szPPF, "%s%s", Config.PatchesDir, buffer); - - ppffile = fopen(szPPF, "rb"); + if (!fname) { + // Generate filename in the format of SLUS_123.45 + buffer[0] = toupper(CdromId[0]); + buffer[1] = toupper(CdromId[1]); + buffer[2] = toupper(CdromId[2]); + buffer[3] = toupper(CdromId[3]); + buffer[4] = '_'; + buffer[5] = CdromId[4]; + buffer[6] = CdromId[5]; + buffer[7] = CdromId[6]; + buffer[8] = '.'; + buffer[9] = CdromId[7]; + buffer[10] = CdromId[8]; + buffer[11] = '\0'; + + sprintf(szPPF, "%s%s", Config.PatchesDir, buffer); + fname = szPPF; + } + ppffile = fopen(fname, "rb"); if (ppffile == NULL) return; memset(buffer, 0, 5); @@ -220,7 +222,7 @@ void BuildPPFCache() { goto fail_io; if (strcmp(buffer, "PPF") != 0) { - SysPrintf(_("Invalid PPF patch: %s.\n"), szPPF); + SysPrintf(_("Invalid PPF patch: %s.\n"), fname); fclose(ppffile); return; } @@ -346,7 +348,8 @@ void BuildPPFCache() { FillPPFCache(); // build address array - SysPrintf(_("Loaded PPF %d.0 patch: %s.\n"), method + 1, szPPF); + SysPrintf(_("Loaded PPF %d.0 patch: %s.\n"), method + 1, fname); + return; fail_io: #ifndef NDEBUG diff --git a/libpcsxcore/ppf.h b/libpcsxcore/ppf.h index a1b14751..e646e554 100644 --- a/libpcsxcore/ppf.h +++ b/libpcsxcore/ppf.h @@ -23,7 +23,7 @@ extern "C" { #endif -void BuildPPFCache(); +void BuildPPFCache(const char *fname); void FreePPFCache(); void CheckPPFCache(unsigned char *pB, unsigned char m, unsigned char s, unsigned char f); -- 2.39.2 From b27f55bea733c150ca50360b2eb807304253d1fc Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 29 Jun 2024 01:08:41 +0300 Subject: [PATCH 02/16] try some overscan display option --- frontend/libretro.c | 12 ++++++++++++ frontend/libretro_core_options.h | 15 +++++++++++++++ frontend/menu.c | 4 ++++ frontend/plugin_lib.h | 1 + plugins/gpulib/gpu.c | 13 ++++++++++--- plugins/gpulib/gpu.h | 1 + 6 files changed, 43 insertions(+), 3 deletions(-) diff --git a/frontend/libretro.c b/frontend/libretro.c index cd6924c1..8cbae419 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -2759,6 +2759,18 @@ static void update_variables(bool in_flight) pl_rearmed_cbs.screen_centering_y = atoi(var.value); } + var.value = NULL; + var.key = "pcsx_rearmed_show_overscan"; + if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) + { + if (strcmp(var.value, "auto") == 0) + pl_rearmed_cbs.show_overscan = 1; + else if (strcmp(var.value, "hack") == 0) + pl_rearmed_cbs.show_overscan = 2; + else + pl_rearmed_cbs.show_overscan = 0; + } + #ifdef THREAD_RENDERING var.key = "pcsx_rearmed_gpu_thread_rendering"; var.value = NULL; diff --git a/frontend/libretro_core_options.h b/frontend/libretro_core_options.h index 762bf276..e4995277 100644 --- a/frontend/libretro_core_options.h +++ b/frontend/libretro_core_options.h @@ -453,6 +453,21 @@ struct retro_core_option_v2_definition option_defs_us[] = { }, "auto", }, + { + "pcsx_rearmed_show_overscan", + "(GPU) Show horizontal overscan", + NULL, + "The PSX can display graphics way into the horizontal borders, even if most screens would crop it. This option tries to display all such graphics. Note that this may result in unusual resolutions that your device might not handle well. The 'Hack' option is intended for the widescreen hacks.", + NULL, + "video", + { + { "disabled", NULL }, + { "auto", "Auto" }, + { "hack", "Hack" }, + { NULL, NULL }, + }, + "disabled", + }, { "pcsx_rearmed_screen_centering", "(GPU) Screen centering", diff --git a/frontend/menu.c b/frontend/menu.c index eb3237e3..9b9af7c8 100644 --- a/frontend/menu.c +++ b/frontend/menu.c @@ -93,6 +93,7 @@ typedef enum MA_OPT_SCANLINES, MA_OPT_SCANLINE_LEVEL, MA_OPT_CENTERING, + MA_OPT_OVERSCAN, } menu_id; static int last_vout_w, last_vout_h, last_vout_bpp; @@ -467,6 +468,7 @@ static const struct { CE_INTVAL_P(screen_centering_type), CE_INTVAL_P(screen_centering_x), CE_INTVAL_P(screen_centering_y), + CE_INTVAL_P(show_overscan), CE_INTVAL(spu_config.iUseReverb), CE_INTVAL(spu_config.iXAPitch), CE_INTVAL(spu_config.iUseInterpolation), @@ -1280,6 +1282,7 @@ static const char *men_soft_filter[] = { "None", NULL }; static const char *men_dummy[] = { NULL }; static const char *men_centering[] = { "Auto", "Ingame", "Borderless", "Force", NULL }; +static const char *men_overscan[] = { "OFF", "Auto", "Hack", NULL }; static const char h_scaler[] = "int. 2x - scales w. or h. 2x if it fits on screen\n" "int. 4:3 - uses integer if possible, else fractional"; static const char h_cscaler[] = "Displays the scaler layer, you can resize it\n" @@ -1376,6 +1379,7 @@ static int menu_loop_cscaler(int id, int keys) static menu_entry e_menu_gfx_options[] = { mee_enum ("Screen centering", MA_OPT_CENTERING, pl_rearmed_cbs.screen_centering_type, men_centering), + mee_enum ("Show overscan", MA_OPT_OVERSCAN, pl_rearmed_cbs.show_overscan, men_overscan), mee_enum_h ("Scaler", MA_OPT_VARSCALER, g_scaler, men_scaler, h_scaler), mee_enum ("Video output mode", MA_OPT_VOUT_MODE, plat_target.vout_method, men_dummy), mee_onoff ("Software Scaling", MA_OPT_SCALER2, soft_scaling, 1), diff --git a/frontend/plugin_lib.h b/frontend/plugin_lib.h index 5733ca24..c7ca247e 100644 --- a/frontend/plugin_lib.h +++ b/frontend/plugin_lib.h @@ -119,6 +119,7 @@ struct rearmed_cbs { int screen_centering_type_default; int screen_centering_x; int screen_centering_y; + int show_overscan; }; extern struct rearmed_cbs pl_rearmed_cbs; diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index 306c9d2f..dd7d5f32 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -92,6 +92,11 @@ static noinline void update_width(void) x = (x + 1) & ~1; // blitter limitation sw /= hdiv; sw = (sw + 2) & ~3; // according to nocash + + if (gpu.state.show_overscan == 2) // widescreen hack + sw = (sw + 63) & ~63; + if (gpu.state.show_overscan && sw >= hres) + x = 0, hres = sw; switch (type) { case C_INGAME: break; @@ -116,8 +121,8 @@ static noinline void update_width(void) gpu.screen.w = sw; gpu.screen.hres = hres; gpu.state.dims_changed = 1; - //printf("xx %d %d -> %2d, %d / %d\n", - // gpu.screen.x1, gpu.screen.x2, x, sw, hres); + //printf("xx %d %d (%d) -> %2d, %d / %d\n", gpu.screen.x1, + // gpu.screen.x2, gpu.screen.x2 - gpu.screen.x1, x, sw, hres); } static noinline void update_height(void) @@ -979,10 +984,12 @@ void GPUrearmedCallbacks(const struct rearmed_cbs *cbs) gpu.state.screen_centering_type_default = cbs->screen_centering_type_default; if (gpu.state.screen_centering_type != cbs->screen_centering_type || gpu.state.screen_centering_x != cbs->screen_centering_x - || gpu.state.screen_centering_y != cbs->screen_centering_y) { + || gpu.state.screen_centering_y != cbs->screen_centering_y + || gpu.state.show_overscan != cbs->show_overscan) { gpu.state.screen_centering_type = cbs->screen_centering_type; gpu.state.screen_centering_x = cbs->screen_centering_x; gpu.state.screen_centering_y = cbs->screen_centering_y; + gpu.state.show_overscan = cbs->show_overscan; update_width(); update_height(); } diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h index 7625c412..bf420b9c 100644 --- a/plugins/gpulib/gpu.h +++ b/plugins/gpulib/gpu.h @@ -81,6 +81,7 @@ struct psx_gpu { uint32_t downscale_enable:1; uint32_t downscale_active:1; uint32_t dims_changed:1; + uint32_t show_overscan:2; uint32_t *frame_count; uint32_t *hcnt; /* hsync count */ struct { -- 2.39.2 From 4a1885e228806b41cf881ed1fc9860ad5f59f6e1 Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 30 Jun 2024 01:48:22 +0300 Subject: [PATCH 03/16] gpu_neon: fix sign extension unclear why it's added differently from sprites, might need to revisit libretro/pcsx_rearmed#833 --- plugins/gpu_neon/psx_gpu/psx_gpu_parse.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index d81b7078..53f33e4c 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -200,21 +200,14 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y, } } -#define sign_extend_12bit(value) \ - (((s32)((value) << 20)) >> 20) \ - #define sign_extend_11bit(value) \ (((s32)((value) << 21)) >> 21) \ -#define sign_extend_10bit(value) \ - (((s32)((value) << 22)) >> 22) \ - - #define get_vertex_data_xy(vertex_number, offset16) \ vertexes[vertex_number].x = \ - sign_extend_12bit(list_s16[offset16]) + psx_gpu->offset_x; \ + sign_extend_11bit(list_s16[offset16]) + psx_gpu->offset_x; \ vertexes[vertex_number].y = \ - sign_extend_12bit(list_s16[(offset16) + 1]) + psx_gpu->offset_y; \ + sign_extend_11bit(list_s16[(offset16) + 1]) + psx_gpu->offset_y; \ #define get_vertex_data_uv(vertex_number, offset16) \ vertexes[vertex_number].u = list_s16[offset16] & 0xFF; \ @@ -1746,10 +1739,8 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0xE5: { - s32 offset_x = list[0] << 21; - s32 offset_y = list[0] << 10; - psx_gpu->offset_x = offset_x >> 21; - psx_gpu->offset_y = offset_y >> 21; + psx_gpu->offset_x = sign_extend_11bit(list[0]); + psx_gpu->offset_y = sign_extend_11bit(list[0] >> 11); SET_Ex(5, list[0]); break; -- 2.39.2 From 459f02ad03fa10b5c403fed724d47fe5adfd5fb1 Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 30 Jun 2024 02:23:25 +0300 Subject: [PATCH 04/16] gpu_neon: revive the old tests --- plugins/gpu_neon/psx_gpu/psx_gpu_main.c | 15 ++++++++------- plugins/gpu_neon/psx_gpu/tests/Makefile | 5 ++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_main.c b/plugins/gpu_neon/psx_gpu/psx_gpu_main.c index 435c51a2..5f1f3834 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_main.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_main.c @@ -136,7 +136,8 @@ int main(int argc, char *argv[]) FILE *list_file; u32 no_display = 0; s32 dummy0 = 0; - u32 dummy1 = 0; + s32 dummy1 = 0; + u32 dummy2 = 0; if((argc != 3) && (argc != 4)) { @@ -180,7 +181,7 @@ int main(int argc, char *argv[]) u32 fbdev_handle = open("/dev/fb1", O_RDWR); vram_ptr = (mmap((void *)0x50000000, 1024 * 1024 * 2, PROT_READ | PROT_WRITE, MAP_SHARED | 0xA0000000, fbdev_handle, 0)); -#elif 1 +#elif 0 #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 /* arch specific */ #endif @@ -211,23 +212,23 @@ int main(int argc, char *argv[]) clear_stats(); -#ifdef NEON_BUILD +#ifdef CYCLE_COUNTER init_counter(); #endif - gpu_parse(psx_gpu, list, size, &dummy0, &dummy1); + gpu_parse(psx_gpu, list, size, &dummy0, &dummy1, &dummy2); flush_render_block_buffer(psx_gpu); clear_stats(); -#ifdef NEON_BUILD +#ifdef CYCLE_COUNTER u32 cycles = get_counter(); #endif - gpu_parse(psx_gpu, list, size, &dummy0, &dummy1); + gpu_parse(psx_gpu, list, size, &dummy0, &dummy1, &dummy2); flush_render_block_buffer(psx_gpu); -#ifdef NEON_BUILD +#ifdef CYCLE_COUNTER u32 cycles_elapsed = get_counter() - cycles; printf("%-64s: %d\n", argv[1], cycles_elapsed); diff --git a/plugins/gpu_neon/psx_gpu/tests/Makefile b/plugins/gpu_neon/psx_gpu/tests/Makefile index bb91a5a8..21d61558 100644 --- a/plugins/gpu_neon/psx_gpu/tests/Makefile +++ b/plugins/gpu_neon/psx_gpu/tests/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -ggdb CFLAGS += -fno-strict-aliasing CFLAGS += `sdl-config --cflags` -LDFLAGS += `sdl-config --libs` +LDLIBS += `sdl-config --libs` VPATH += .. @@ -12,6 +12,9 @@ ifdef NEON CFLAGS += -mcpu=cortex-a8 -mfpu=neon -DNEON_BUILD ASFLAGS = $(CFLAGS) OBJ += psx_gpu_arm_neon.o +else +CFLAGS += -DNEON_BUILD -DSIMD_BUILD +OBJ += psx_gpu_simd.o endif ifndef DEBUG CFLAGS += -O2 -DNDEBUG -- 2.39.2 From 5eecf06ddff70602526a937f6faebcd3039885bb Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 15 Jul 2024 22:10:13 +0200 Subject: [PATCH 05/16] git subrepo pull --force deps/lightrec subrepo: subdir: "deps/lightrec" merged: "601afca8e8" upstream: origin: "https://github.com/pcercuei/lightrec.git" branch: "master" commit: "601afca8e8" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo.git" commit: "110b9eb" --- deps/lightrec/.gitrepo | 4 +-- deps/lightrec/CMakeLists.txt | 4 +++ deps/lightrec/arch.h | 36 +++++++++++++++++++++++++ deps/lightrec/emitter.c | 21 ++++++++++----- deps/lightrec/lightning-wrapper.h | 10 +++++++ deps/lightrec/lightrec-config.h.cmakein | 2 ++ deps/lightrec/lightrec-private.h | 10 +++++++ deps/lightrec/lightrec.c | 6 ++--- deps/lightrec/optimizer.c | 12 ++++----- deps/lightrec/regcache.h | 10 +++++-- 10 files changed, 94 insertions(+), 21 deletions(-) create mode 100644 deps/lightrec/arch.h diff --git a/deps/lightrec/.gitrepo b/deps/lightrec/.gitrepo index 9e04deb8..69811196 100644 --- a/deps/lightrec/.gitrepo +++ b/deps/lightrec/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/pcercuei/lightrec.git branch = master - commit = bd0b82792284f22566bbfc78d8882e1e91b10516 - parent = 1229a4ea3dea3e1e47c46cd7afed38860fd91a57 + commit = 601afca8e889bdda7040ff5c64f7bbd20d1d5f2c + parent = 459f02ad03fa10b5c403fed724d47fe5adfd5fb1 method = merge cmdver = 0.4.6 diff --git a/deps/lightrec/CMakeLists.txt b/deps/lightrec/CMakeLists.txt index c3eb6f8f..6f3d53e7 100644 --- a/deps/lightrec/CMakeLists.txt +++ b/deps/lightrec/CMakeLists.txt @@ -85,6 +85,10 @@ option(OPT_FLAG_MULT_DIV "(optimization) Flag MULT/DIV that only use one of HI/L option(OPT_EARLY_UNLOAD "(optimization) Unload registers early" ON) option(OPT_PRELOAD_PC "(optimization) Preload PC value into register" ON) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "SH4|sh4") + option(OPT_SH4_USE_GBR "(SH4 optimization) Use GBR register for the state pointer" OFF) +endif() + target_include_directories(lightrec PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) if (CMAKE_C_COMPILER_ID MATCHES "GNU|Clang") diff --git a/deps/lightrec/arch.h b/deps/lightrec/arch.h new file mode 100644 index 00000000..7df9e754 --- /dev/null +++ b/deps/lightrec/arch.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* + * Copyright (C) 2024 Paul Cercueil + */ + +#ifndef __LIGHTREC_ARCH_H__ +#define __LIGHTREC_ARCH_H__ + +#include +#include + +static bool arch_has_fast_mask(void) +{ +#if __mips_isa_rev >= 2 + /* On MIPS32 >= r2, we can use extr / ins instructions */ + return true; +#endif +#ifdef __powerpc__ + /* On PowerPC, we can use the RLWINM instruction */ + return true; +#endif +#ifdef __aarch64__ + /* Aarch64 can use the UBFX instruction */ + return true; +#endif +#if defined(__x86__) || defined(__x86_64__) + /* x86 doesn't have enough registers, using cached values make + * little sense. Using jit_andi() will give a better result as it will + * use bit-shifts for low/high masks. */ + return true; +#endif + + return false; +} + +#endif /* __LIGHTREC_ARCH_H__ */ diff --git a/deps/lightrec/emitter.c b/deps/lightrec/emitter.c index 8612119f..a59ff1d7 100644 --- a/deps/lightrec/emitter.c +++ b/deps/lightrec/emitter.c @@ -3,6 +3,7 @@ * Copyright (C) 2014-2021 Paul Cercueil */ +#include "arch.h" #include "blockcache.h" #include "debug.h" #include "disassembler.h" @@ -103,7 +104,7 @@ static void lightrec_emit_end_of_block(struct lightrec_cstate *state, if (cycles && update_cycles) { jit_subi(LIGHTREC_REG_CYCLE, LIGHTREC_REG_CYCLE, cycles); - pr_debug("EOB: %u cycles\n", cycles); + pr_debug("EOB: %"PRIu32" cycles\n", cycles); } if (has_ds && op_flag_load_delay(ds->flags) @@ -247,11 +248,11 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 struct lightrec_branch *branch; const struct opcode *op = &block->opcode_list[offset], *ds = get_delay_slot(block->opcode_list, offset); - jit_node_t *addr; bool is_forward = (s16)op->i.imm >= 0; int op_cycles = lightrec_cycles_of_opcode(state->state, op->c); u32 target_offset, cycles = state->cycles + op_cycles; bool no_indirection = false; + jit_node_t *addr = NULL; u32 next_pc; u8 rs, rt; @@ -308,7 +309,7 @@ static void rec_b(struct lightrec_cstate *state, const struct block *block, u16 target_offset = offset + 1 + (s16)op->i.imm - !!op_flag_no_ds(op->flags); - pr_debug("Adding local branch to offset 0x%x\n", + pr_debug("Adding local branch to offset 0x%"PRIx32"\n", target_offset << 2); branch = &state->local_branches[ state->nb_local_branches++]; @@ -941,7 +942,7 @@ static void rec_alu_mult(struct lightrec_cstate *state, u8 reg_lo = get_mult_div_lo(c); u8 reg_hi = get_mult_div_hi(c); jit_state_t *_jit = block->_jit; - u8 lo, hi, rs, rt, rflags = 0; + u8 lo, hi = 0, rs, rt, rflags = 0; bool no_lo = op_flag_no_lo(flags); bool no_hi = op_flag_no_hi(flags); @@ -1276,10 +1277,16 @@ static void rec_and_mask(struct lightrec_cstate *cstate, struct regcache *reg_cache = cstate->reg_cache; u8 reg_imm; - reg_imm = lightrec_alloc_reg_temp_with_value(reg_cache, _jit, mask); - jit_andr(reg_out, reg_in, reg_imm); + if (arch_has_fast_mask() + && (is_low_mask(mask) || is_high_mask(mask))) { + jit_andi(reg_out, reg_in, mask); + } else { + reg_imm = lightrec_alloc_reg_temp_with_value(reg_cache, _jit, + mask); + jit_andr(reg_out, reg_in, reg_imm); - lightrec_free_reg(reg_cache, reg_imm); + lightrec_free_reg(reg_cache, reg_imm); + } } static void rec_store_memory(struct lightrec_cstate *cstate, diff --git a/deps/lightrec/lightning-wrapper.h b/deps/lightrec/lightning-wrapper.h index 88e93e06..cbf3edc3 100644 --- a/deps/lightrec/lightning-wrapper.h +++ b/deps/lightrec/lightning-wrapper.h @@ -8,6 +8,8 @@ #include +#include "lightrec-config.h" + #if __WORDSIZE == 32 #define jit_ldxi_ui(u,v,w) jit_ldxi_i(u,v,w) @@ -21,6 +23,14 @@ #define jit_b() jit_beqr(0, 0) +#if defined(__sh__) && OPT_SH4_USE_GBR +#define jit_add_state(u,v) \ + do { \ + jit_new_node_ww(jit_code_movr,_R0,LIGHTREC_REG_STATE); \ + jit_new_node_www(jit_code_addr,u,v,_R0); \ + } while (0) +#else #define jit_add_state(u,v) jit_addr(u,v,LIGHTREC_REG_STATE) +#endif #endif /* __LIGHTNING_WRAPPER_H__ */ diff --git a/deps/lightrec/lightrec-config.h.cmakein b/deps/lightrec/lightrec-config.h.cmakein index 9086a7ae..f92509ad 100644 --- a/deps/lightrec/lightrec-config.h.cmakein +++ b/deps/lightrec/lightrec-config.h.cmakein @@ -25,5 +25,7 @@ #cmakedefine01 OPT_EARLY_UNLOAD #cmakedefine01 OPT_PRELOAD_PC +#cmakedefine01 OPT_SH4_USE_GBR + #endif /* __LIGHTREC_CONFIG_H__ */ diff --git a/deps/lightrec/lightrec-private.h b/deps/lightrec/lightrec-private.h index 920008c2..5e047aa8 100644 --- a/deps/lightrec/lightrec-private.h +++ b/deps/lightrec/lightrec-private.h @@ -372,6 +372,16 @@ static inline _Bool can_zero_extend(u32 value, u8 order) return (value >> order) == 0; } +static inline _Bool is_low_mask(u32 imm) +{ + return imm & 1 ? popcount32(imm + 1) <= 1 : 0; +} + +static inline _Bool is_high_mask(u32 imm) +{ + return imm ? popcount32(imm + BIT(ctz32(imm))) == 0 : 0; +} + static inline const struct opcode * get_delay_slot(const struct opcode *list, u16 i) { diff --git a/deps/lightrec/lightrec.c b/deps/lightrec/lightrec.c index ec26bff7..ae170531 100644 --- a/deps/lightrec/lightrec.c +++ b/deps/lightrec/lightrec.c @@ -959,7 +959,7 @@ static struct block * generate_wrapper(struct lightrec_state *state) jit_tramp(256); /* Load pointer to C wrapper */ - jit_addr(JIT_R1, JIT_R1, LIGHTREC_REG_STATE); + jit_add_state(JIT_R1, JIT_R1); jit_ldxi(JIT_R1, JIT_R1, lightrec_offset(c_wrappers)); jit_epilog(); @@ -1046,7 +1046,7 @@ static u32 lightrec_memset(struct lightrec_state *state) return 0; } - pr_debug("Calling host memset, "PC_FMT" (host address 0x%"PRIxPTR") for %u bytes\n", + pr_debug("Calling host memset, "PC_FMT" (host address 0x%"PRIxPTR") for %"PRIu32" bytes\n", kunseg_pc, (uintptr_t)host, length); memset(host, 0, length); @@ -1624,7 +1624,7 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, for (i = 0; i < cstate->nb_local_branches; i++) { struct lightrec_branch *branch = &cstate->local_branches[i]; - pr_debug("Patch local branch to offset 0x%x\n", + pr_debug("Patch local branch to offset 0x%"PRIx32"\n", branch->target << 2); if (branch->target == 0) { diff --git a/deps/lightrec/optimizer.c b/deps/lightrec/optimizer.c index 0a3655b9..991ef778 100644 --- a/deps/lightrec/optimizer.c +++ b/deps/lightrec/optimizer.c @@ -1172,7 +1172,7 @@ static int lightrec_transform_ops(struct lightrec_state *state, struct block *bl break; } - pr_debug("Multiply by power-of-two: %u\n", + pr_debug("Multiply by power-of-two: %"PRIu32"\n", v[op->r.rt].value); if (op->r.op == OP_SPECIAL_MULT) @@ -1440,14 +1440,12 @@ static int lightrec_swap_load_delays(struct lightrec_state *state, switch (next.i.op) { case OP_LWL: case OP_LWR: - case OP_REGIMM: - case OP_BEQ: - case OP_BNE: - case OP_BLEZ: - case OP_BGTZ: continue; } + if (has_delay_slot(next)) + continue; + if (opcode_reads_register(next, c.i.rt) && !opcode_writes_register(next, c.i.rs)) { pr_debug("Swapping opcodes at offset 0x%x to " @@ -1481,7 +1479,7 @@ static int lightrec_local_branches(struct lightrec_state *state, struct block *b offset = i + 1 + (s16)list->c.i.imm; - pr_debug("Found local branch to offset 0x%x\n", offset << 2); + pr_debug("Found local branch to offset 0x%"PRIx32"\n", offset << 2); ds = get_delay_slot(block->opcode_list, i); if (op_flag_load_delay(ds->flags) && opcode_is_load(ds->c)) { diff --git a/deps/lightrec/regcache.h b/deps/lightrec/regcache.h index 4b7cb89c..23a775ce 100644 --- a/deps/lightrec/regcache.h +++ b/deps/lightrec/regcache.h @@ -7,9 +7,15 @@ #define __REGCACHE_H__ #include "lightning-wrapper.h" +#include "lightrec-config.h" -#define NUM_REGS (JIT_V_NUM - 1) -#define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1)) +#if defined(__sh__) && OPT_SH4_USE_GBR +# define NUM_REGS JIT_V_NUM +# define LIGHTREC_REG_STATE _GBR +#else +# define NUM_REGS (JIT_V_NUM - 1) +# define LIGHTREC_REG_STATE (JIT_V(JIT_V_NUM - 1)) +#endif #if defined(__powerpc__) # define NUM_TEMPS JIT_R_NUM -- 2.39.2 From c1bb478fffeaa627509a85edeab9280b0d1636c6 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 15 Jul 2024 22:12:44 +0200 Subject: [PATCH 06/16] Update lightrec-config.h to latest version Signed-off-by: Paul Cercueil --- include/lightrec/lightrec-config.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/lightrec/lightrec-config.h b/include/lightrec/lightrec-config.h index b4fff53a..3d4b81e6 100644 --- a/include/lightrec/lightrec-config.h +++ b/include/lightrec/lightrec-config.h @@ -25,5 +25,7 @@ #define OPT_EARLY_UNLOAD 1 #define OPT_PRELOAD_PC 1 +#define OPT_SH4_USE_GBR 0 + #endif /* __LIGHTREC_CONFIG_H__ */ -- 2.39.2 From 19f1a7d2de3a136810a84341db77d4f5eb8f3361 Mon Sep 17 00:00:00 2001 From: notaz Date: Thu, 25 Jul 2024 02:34:23 +0300 Subject: [PATCH 07/16] cdrom: more hacks for more timing issues libretro/pcsx_rearmed#840 --- libpcsxcore/cdrom.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/libpcsxcore/cdrom.c b/libpcsxcore/cdrom.c index 4ef0a237..335c2dc5 100644 --- a/libpcsxcore/cdrom.c +++ b/libpcsxcore/cdrom.c @@ -835,9 +835,11 @@ void cdrInterrupt(void) { { for (i = 0; i < 3; i++) set_loc[i] = btoi(cdr.Param[i]); - cdr.RetryDetected = msfiEq(cdr.SetSector, set_loc) - && !cdr.SetlocPending; - //cdr.RetryDetected |= msfiEq(cdr.Param, cdr.Transfer); + if ((msfiEq(cdr.SetSector, set_loc)) //|| msfiEq(cdr.Param, cdr.Transfer)) + && !cdr.SetlocPending) + cdr.RetryDetected++; + else + cdr.RetryDetected = 0; memcpy(cdr.SetSector, set_loc, 3); cdr.SetSector[3] = 0; cdr.SetlocPending = 1; @@ -991,6 +993,7 @@ void cdrInterrupt(void) { Hokuto no Ken 2 InuYasha - Feudal Fairy Tale Dance Dance Revolution Konamix + Digimon Rumble Arena ... */ if (!(cdr.StatP & (STATUS_PLAY | STATUS_READ))) @@ -999,7 +1002,9 @@ void cdrInterrupt(void) { } else { - second_resp_time = 2 * 1097107; + second_resp_time = 2100011; + // a hack to try to avoid weird cmd vs irq1 races causing games to retry + second_resp_time += (cdr.RetryDetected & 15) * 100001; } SetPlaySeekRead(cdr.StatP, 0); DriveStateOld = cdr.DriveState; -- 2.39.2 From d0fb0abd0ef0c9ed8aa3ccae56cb9250938b3de7 Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 16 Aug 2024 03:05:46 +0300 Subject: [PATCH 08/16] avoid double prints with bios+Config.PsxOut --- libpcsxcore/psxbios.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libpcsxcore/psxbios.c b/libpcsxcore/psxbios.c index 1f2e37a2..64a04b85 100644 --- a/libpcsxcore/psxbios.c +++ b/libpcsxcore/psxbios.c @@ -3615,7 +3615,8 @@ void psxBiosInit() { biosA0[0x03] = biosB0[0x35] = psxBios_write_psxout; biosA0[0x3c] = biosB0[0x3d] = psxBios_putchar_psxout; biosA0[0x3e] = biosB0[0x3f] = psxBios_puts_psxout; - biosA0[0x3f] = psxBios_printf_psxout; + // calls putchar() internally so no need to override + //biosA0[0x3f] = psxBios_printf_psxout; if (!Config.HLE) { char verstr[0x24+1]; -- 2.39.2 From 3382c20ff058f5918ab9b88f291eb1748c50128f Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 17 Aug 2024 01:44:00 +0300 Subject: [PATCH 09/16] gpulib: adjust masking no idea if anything uses this, but tested on hw --- plugins/gpulib/gpu.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index dd7d5f32..57823843 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -412,21 +412,31 @@ const unsigned char cmd_lengths[256] = #define VRAM_MEM_XY(x, y) &gpu.vram[(y) * 1024 + (x)] -static void cpy_msb(uint16_t *dst, const uint16_t *src, int l, uint16_t msb) +// this isn't very useful so should be rare +static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6) { int i; - for (i = 0; i < l; i++) - dst[i] = src[i] | msb; + if (r6 == 1) { + for (i = 0; i < l; i++) + dst[i] = src[i] | 0x8000; + } + else { + uint16_t msb = r6 << 15; + for (i = 0; i < l; i++) { + uint16_t mask = (int16_t)dst[i] >> 15; + dst[i] = (dst[i] & mask) | ((src[i] | msb) & ~mask); + } + } } static inline void do_vram_line(int x, int y, uint16_t *mem, int l, - int is_read, uint16_t msb) + int is_read, uint32_t r6) { uint16_t *vram = VRAM_MEM_XY(x, y); if (unlikely(is_read)) memcpy(mem, vram, l * 2); - else if (unlikely(msb)) - cpy_msb(vram, mem, l, msb); + else if (unlikely(r6)) + cpy_mask(vram, mem, l, r6); else memcpy(vram, mem, l * 2); } @@ -434,7 +444,7 @@ static inline void do_vram_line(int x, int y, uint16_t *mem, int l, static int do_vram_io(uint32_t *data, int count, int is_read) { int count_initial = count; - uint16_t msb = gpu.ex_regs[6] << 15; + uint32_t r6 = gpu.ex_regs[6] & 3; uint16_t *sdata = (uint16_t *)data; int x = gpu.dma.x, y = gpu.dma.y; int w = gpu.dma.w, h = gpu.dma.h; @@ -449,7 +459,7 @@ static int do_vram_io(uint32_t *data, int count, int is_read) if (count < l) l = count; - do_vram_line(x + o, y, sdata, l, is_read, msb); + do_vram_line(x + o, y, sdata, l, is_read, r6); if (o + l < w) o += l; @@ -464,13 +474,13 @@ static int do_vram_io(uint32_t *data, int count, int is_read) for (; h > 0 && count >= w; sdata += w, count -= w, y++, h--) { y &= 511; - do_vram_line(x, y, sdata, w, is_read, msb); + do_vram_line(x, y, sdata, w, is_read, r6); } if (h > 0) { if (count > 0) { y &= 511; - do_vram_line(x, y, sdata, count, is_read, msb); + do_vram_line(x, y, sdata, count, is_read, r6); o = count; count = 0; } -- 2.39.2 From 89a8e88a616301c0cec4cbfebf96301ce9d5244c Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 17 Aug 2024 01:38:45 +0300 Subject: [PATCH 10/16] gpu_neon: fix wrong mask bit for fills Fixes notaz/pcsx_rearmed#344 --- plugins/gpu_neon/psx_gpu/psx_gpu.c | 6 +-- plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S | 46 +-------------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index b671a757..a59e9cdc 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -4810,8 +4810,7 @@ void render_block_fill(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y, u32 r = color & 0xFF; u32 g = (color >> 8) & 0xFF; u32 b = (color >> 16) & 0xFF; - u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) | - psx_gpu->mask_msb; + u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10); u32 color_32bpp = color_16bpp | (color_16bpp << 16); u32 *vram_ptr = (u32 *)(psx_gpu->vram_out_ptr + x + (y * 1024)); @@ -4863,8 +4862,7 @@ void render_block_fill_enh(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y, u32 r = color & 0xFF; u32 g = (color >> 8) & 0xFF; u32 b = (color >> 16) & 0xFF; - u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) | - psx_gpu->mask_msb; + u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10); u32 color_32bpp = color_16bpp | (color_16bpp << 16); u32 *vram_ptr = (u32 *)(psx_gpu->vram_out_ptr + x + (y * 1024)); diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index ffbea043..1ba562b5 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -4386,51 +4386,6 @@ function(warmup) #undef vram_ptr #undef color -#undef width -#undef height -#undef pitch - -#define vram_ptr r0 -#define color r1 -#define width r2 -#define height r3 - -#define pitch r1 - -#define num_width r12 - -#undef colors_a -#undef colors_b - -#define colors_a q0 -#define colors_b q1 - -.align 3 - -function(render_block_fill_body) - vdup.u16 colors_a, color - mov pitch, #2048 - - vmov colors_b, colors_a - sub pitch, pitch, width, lsl #1 - - mov num_width, width - - 0: - vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]! - - subs num_width, num_width, #16 - bne 0b - - add vram_ptr, vram_ptr, pitch - mov num_width, width - - subs height, height, #1 - bne 0b - - bx lr - - #undef x #undef y #undef width @@ -4523,6 +4478,7 @@ function(render_block_fill_body) #define texels_wide_high d15 #define texels_wide q7 +.align 3 setup_sprite_flush_blocks: vpush { q1 - q5 } -- 2.39.2 From 33a6c6e81fc79096a52211ade443750b7371b003 Mon Sep 17 00:00:00 2001 From: notaz Date: Thu, 22 Aug 2024 00:38:45 +0300 Subject: [PATCH 11/16] cdriso: log main cd img size --- libpcsxcore/cdriso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libpcsxcore/cdriso.c b/libpcsxcore/cdriso.c index 4a794261..27ac760d 100644 --- a/libpcsxcore/cdriso.c +++ b/libpcsxcore/cdriso.c @@ -1718,7 +1718,7 @@ static long CALLBACK ISOopen(void) { } } - SysPrintf("%s.\n", image_str); + SysPrintf("%s (%lld bytes).\n", image_str, (long long)size_main); PrintTracks(); -- 2.39.2 From 69b52008ef99ef97fc875e396f2b4ba4cba3578f Mon Sep 17 00:00:00 2001 From: notaz Date: Tue, 20 Aug 2024 23:49:45 +0300 Subject: [PATCH 12/16] gpu_neon: rework enh. res. texturing hack libretro/pcsx_rearmed#841 --- plugins/gpu_neon/psx_gpu/common.h | 2 - plugins/gpu_neon/psx_gpu/psx_gpu.c | 63 +++++++++++- plugins/gpu_neon/psx_gpu/psx_gpu.h | 13 ++- plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S | 59 +++++++++++- plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h | 1 + .../gpu_neon/psx_gpu/psx_gpu_offsets_update.c | 1 + plugins/gpu_neon/psx_gpu/psx_gpu_parse.c | 95 +++++-------------- plugins/gpu_neon/psx_gpu/psx_gpu_simd.c | 36 +++++++ 8 files changed, 191 insertions(+), 79 deletions(-) diff --git a/plugins/gpu_neon/psx_gpu/common.h b/plugins/gpu_neon/psx_gpu/common.h index 820dfbef..5881e2a0 100644 --- a/plugins/gpu_neon/psx_gpu/common.h +++ b/plugins/gpu_neon/psx_gpu/common.h @@ -9,7 +9,5 @@ #include "vector_types.h" #include "psx_gpu.h" -#define unlikely(x) __builtin_expect((x), 0) - #endif diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index a59e9cdc..19f1c199 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -560,8 +560,9 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu) y##set##_b.e[1] = vertex->b \ -void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a, - vertex_struct *b, vertex_struct *c) +void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, + const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b, + const vertex_struct * __restrict__ c) { u32 triangle_area = psx_gpu->triangle_area; u32 winding_mask_scalar; @@ -1163,6 +1164,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, setup_spans_set_x4(alternate, down, alternate_active); \ height -= 4; \ } while(height > 0); \ + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \ + span_uvrg_offset[height - 1].low = span_uvrg_offset[height - 2].low; \ } \ @@ -1216,6 +1219,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, setup_spans_set_x4(alternate, up, alternate_active); \ height -= 4; \ } \ + if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \ + psx_gpu->span_uvrg_offset[0].low = psx_gpu->span_uvrg_offset[1].low; \ } \ #define index_left 0 @@ -1452,6 +1457,11 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_spans_set_x4(none, down, no); height_minor_b -= 4; } + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) + { + span_uvrg_offset[height_minor_b - 1].low = + span_uvrg_offset[height_minor_b - 2].low; + } } left_split_triangles++; @@ -1459,6 +1469,41 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #endif +// this is some hacky mess, can this be improved somehow? +// ideally change things to not have to do this hack at all +void __attribute__((noinline)) +setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block, + edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset) +{ + size_t span_i = span_uvrg_offset - psx_gpu->span_uvrg_offset; + if (span_i != 0 && span_i != psx_gpu->num_spans - 1 + && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U)) + return; + u32 num_blocks = span_edge_data->num_blocks - 1; + s32 offset = __builtin_ctz(span_edge_data->right_mask | 0x100) - 1; + s32 toffset = 8 * num_blocks + offset - 1; + if (toffset < 0 && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U)) + return; + + toffset += span_edge_data->left_x; + s32 u_dx = psx_gpu->uvrg_dx.low.e[0]; + s32 v_dx = psx_gpu->uvrg_dx.low.e[1]; + u32 u = span_uvrg_offset->low.e[0]; + u32 v = span_uvrg_offset->low.e[1]; + u += u_dx * toffset; + v += v_dx * toffset; + u = (u >> 16) & psx_gpu->texture_mask_width; + v = (v >> 16) & psx_gpu->texture_mask_height; + if (!(psx_gpu->render_state_base & (TEXTURE_MODE_16BPP << 8))) { + // 4bpp 8bpp are swizzled + u32 u_ = u; + u = (u & 0x0f) | ((v & 0x0f) << 4); + v = (v & 0xf0) | (u_ >> 4); + } + assert(offset >= 0); + //assert(block->uv.e[offset] == ((v << 8) | u)); + block->uv.e[offset] = (v << 8) | u; +} #define dither_table_entry_normal(value) \ (value) \ @@ -1868,6 +1913,14 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \ +#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \ + +#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \ +{ \ + u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \ + if (unlikely(psx_gpu->hacks_active & m_)) \ + setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, uvrg_offset); \ +} \ #define setup_blocks_add_blocks_indirect() \ num_blocks += span_num_blocks; \ @@ -1938,6 +1991,8 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target( \ setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \ setup_blocks_store_draw_mask_##texturing##_##target(block, \ span_edge_data->right_mask); \ + setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \ + span_uvrg_offset); \ \ block++; \ } \ @@ -5016,8 +5071,10 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram) psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN; psx_gpu->saved_hres = 256; + psx_gpu->hacks_active = 0; - // check some offset + // check some offsets, asm relies on these + psx_gpu->reserved_a[(offsetof(psx_gpu_struct, test_mask) == 0) - 1] = 0; psx_gpu->reserved_a[(offsetof(psx_gpu_struct, blocks) == psx_gpu_blocks_offset) - 1] = 0; } diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h index 6964a629..2d0f7b12 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.h @@ -21,10 +21,17 @@ #define SPAN_DATA_BLOCKS_SIZE 32 +#define AHACK_TEXTURE_ADJ_U (1 << 0) +#define AHACK_TEXTURE_ADJ_V (1 << 1) + #ifndef __ASSEMBLER__ #include "vector_types.h" +#ifndef unlikely +#define unlikely(x) __builtin_expect((x), 0) +#endif + typedef enum { PRIMITIVE_TYPE_TRIANGLE = 0, @@ -189,6 +196,7 @@ typedef struct // enhancement stuff u16 *enhancement_buf_ptr; // main alloc u16 *enhancement_current_buf_ptr; // offset into above, 4 bufs + u32 hacks_active; // AHACK_TEXTURE_ADJ_U ... u32 saved_hres; s16 saved_viewport_start_x; s16 saved_viewport_start_y; @@ -205,7 +213,7 @@ typedef struct // Align up to 64 byte boundary to keep the upcoming buffers cache line // aligned, also make reachable with single immediate addition - u8 reserved_a[184 + 9*4 - 9*sizeof(void *)]; + u8 reserved_a[184 + 8*4 - 9*sizeof(void *)]; // 8KB block_struct blocks[MAX_BLOCKS_PER_ROW]; @@ -257,6 +265,9 @@ u32 texture_region_mask(s32 x1, s32 y1, s32 x2, s32 y2); void update_texture_8bpp_cache(psx_gpu_struct *psx_gpu); void flush_render_block_buffer(psx_gpu_struct *psx_gpu); +void setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block, + edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset); + void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram); u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command); diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index 1ba562b5..82738855 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -223,6 +223,7 @@ #ifdef __MACH__ #define flush_render_block_buffer _flush_render_block_buffer #define update_texture_8bpp_cache _update_texture_8bpp_cache +#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack #endif @ r0: psx_gpu @@ -543,6 +544,7 @@ function(compute_all_gradients) #define uvrg q14 #define uvrg_dy q15 +#define uv d28 #define alternate_x_16 d4 @@ -925,6 +927,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 1f; \ + add temp, span_uvrg_offset, height, lsl #4; \ + vldr uv, [temp, #(-16*2)]; \ + vstr uv, [temp, #(-16)]; \ + \ 1: \ @@ -986,6 +996,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #AHACK_TEXTURE_ADJ_V; \ + beq 1f; \ + add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ + vldr uv, [temp, #16]; \ + vstr uv, [temp, #0]; \ + \ 1: \ @@ -1216,6 +1234,14 @@ function(setup_spans_up_down) subs height_minor_b, height_minor_b, #4 bhi 2b + nop + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset] + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V) + beq 1f + add temp, span_uvrg_offset, height, lsl #4 + vldr uv, [temp, #(-16*2)] + vstr uv, [temp, #(-16)] + 1: setup_spans_epilogue() @@ -1256,6 +1282,7 @@ function(setup_spans_up_down) #define uvrg_dx_ptr r2 #define texture_mask_ptr r3 +#define hacks_active r6 #define dither_shift r8 #define dither_row r10 @@ -1273,6 +1300,7 @@ function(setup_spans_up_down) #define color_b r5 #undef uvrg +#undef uv #define u_block q0 #define v_block q1 @@ -1350,6 +1378,26 @@ function(setup_spans_up_down) #define setup_blocks_texture_unswizzled() \ +#define setup_blocks_uv_adj_hack_textured(hacks_active) \ + tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 91f; \ + /* see flush_render_block_buffer below for a reg saving note */ \ + vpush { texture_mask }; \ + vpush { uvrg_dx4 }; \ + \ + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + mov r12, span_uvrg_offset; \ + sub r1, block_ptr_a, #64; \ + mov r2, span_edge_data; \ + mov r3, r12; \ + bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */ \ + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + \ + vpop { uvrg_dx4 }; \ + vpop { texture_mask }; \ + vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ +91: \ + #define setup_blocks_shaded_textured_builder(swizzling) \ .align 3; \ @@ -1575,6 +1623,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1585,6 +1634,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ @@ -1599,7 +1650,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ + /* this callee-save reg saving may look unnecessary but it actually is */ \ + /* because the callee violates the ABI */ \ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ @@ -1776,6 +1828,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1786,6 +1839,8 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_edge_data, span_edge_data, #8; \ @@ -1798,7 +1853,6 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ @@ -2334,7 +2388,6 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { rg_dx4 }; \ \ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h index 2f8a6463..7c21d31c 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h @@ -36,6 +36,7 @@ #define psx_gpu_texture_mask_width_offset 0xfa #define psx_gpu_texture_mask_height_offset 0xfb #define psx_gpu_reciprocal_table_ptr_offset 0x108 +#define psx_gpu_hacks_active_offset 0x114 #define psx_gpu_blocks_offset 0x200 #define psx_gpu_span_uvrg_offset_offset 0x2200 #define psx_gpu_span_edge_data_offset 0x4200 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c index 9b378482..740df981 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c @@ -76,6 +76,7 @@ int main() //WRITE_OFFSET(f, clut_settings); //WRITE_OFFSET(f, texture_settings); WRITE_OFFSET(f, reciprocal_table_ptr); + WRITE_OFFSET(f, hacks_active); WRITE_OFFSET(f, blocks); WRITE_OFFSET(f, span_uvrg_offset); WRITE_OFFSET(f, span_edge_data); diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index 53f33e4c..f398695d 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -903,6 +903,7 @@ static void select_enhancement_buf(psx_gpu_struct *psx_gpu) psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \ psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \ psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \ + psx_gpu->hacks_active = 0; \ psx_gpu->uvrgb_phase = 0x8000; \ } @@ -917,7 +918,7 @@ static int enhancement_enable(psx_gpu_struct *psx_gpu) psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1; if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024) psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023; - psx_gpu->uvrgb_phase = 0x7fff; + //psx_gpu->uvrgb_phase = 0x7fff; return 1; } @@ -1018,73 +1019,29 @@ static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y) return 1; } -static int is_in_array(int val, int array[], int len) +static u32 uv_hack(psx_gpu_struct *psx_gpu, const vertex_struct *vertex_ptrs) { - int i; - for (i = 0; i < len; i++) - if (array[i] == val) - return 1; - return 0; -} - -static int make_members_unique(int array[], int len) -{ - int i, j; - for (i = j = 1; i < len; i++) - if (!is_in_array(array[i], array, j)) - array[j++] = array[i]; - - if (array[0] > array[1]) { - i = array[0]; array[0] = array[1]; array[1] = i; - } - return j; -} - -static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new) -{ - int i; - for (i = 0; i < count; i++) - if (vertex_ptrs[i].u == old) - vertex_ptrs[i].u = new; -} - -static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new) -{ - int i; - for (i = 0; i < count; i++) - if (vertex_ptrs[i].v == old) - vertex_ptrs[i].v = new; -} - -// this sometimes does more harm than good, like in PE2 -static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count) -{ - int i, u[4], v[4]; - - for (i = 0; i < vertex_count; i++) { - u[i] = vertex_ptrs[i].u; - v[i] = vertex_ptrs[i].v; - } - if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) { - if ((u[0] & 7) == 7) { - patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1); - //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]); - } - else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) { - patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1); - //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1); - } - } - if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) { - if ((v[0] & 7) == 7) { - patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1); - //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]); - } - else if ((v[1] & 7) == 0) { - patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1); - //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1); - } + int i, have_right_edge = 0, have_bottom_edge = 0, bad_u = 0, bad_v = 0; + u32 hacks = 0; + + for (i = 0; i < 3; i++) { + int j = (i + 1) % 3, k = (i + 2) % 3; + int du = abs((int)vertex_ptrs[i].u - (int)vertex_ptrs[j].u); + int dv = abs((int)vertex_ptrs[i].v - (int)vertex_ptrs[j].v); + if (du && (du & 7) != 7) + bad_u = 1; + if (dv && (dv & 7) != 7) + bad_v = 1; + if (vertex_ptrs[i].x == vertex_ptrs[j].x && vertex_ptrs[k].x < vertex_ptrs[j].x) + have_right_edge = 1; + if (vertex_ptrs[i].y == vertex_ptrs[j].y)// && vertex_ptrs[k].y < vertex_ptrs[j].y) + have_bottom_edge = 1; } + if (have_right_edge && bad_u) + hacks |= AHACK_TEXTURE_ADJ_U; + if (have_bottom_edge && bad_v) + hacks |= AHACK_TEXTURE_ADJ_V; + return hacks; } static void do_triangle_enhanced(psx_gpu_struct *psx_gpu, @@ -1104,6 +1061,8 @@ static void do_triangle_enhanced(psx_gpu_struct *psx_gpu, if (!enhancement_enable(psx_gpu)) return; + if ((current_command & RENDER_FLAGS_TEXTURE_MAP) && psx_gpu->hack_texture_adj) + psx_gpu->hacks_active |= uv_hack(psx_gpu, vertexes); shift_vertices3(vertex_ptrs); shift_triangle_area(); render_triangle_p(psx_gpu, vertex_ptrs, current_command); @@ -1314,8 +1273,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, get_vertex_data_xy_uv(2, 10); get_vertex_data_xy_uv(3, 14); - if (psx_gpu->hack_texture_adj) - uv_hack(vertexes, 4); do_quad_enhanced(psx_gpu, vertexes, current_command); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t()); break; @@ -1368,8 +1325,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, get_vertex_data_xy_uv_rgb(2, 12); get_vertex_data_xy_uv_rgb(3, 18); - if (psx_gpu->hack_texture_adj) - uv_hack(vertexes, 4); do_quad_enhanced(psx_gpu, vertexes, current_command); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt()); break; diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c index b5274362..174e61d2 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c @@ -192,6 +192,7 @@ typedef union #define gvld1_u8(d, s) d.u8 = vld1_u8(s) #define gvld1_u32(d, s) d.u32 = vld1_u32((const u32 *)(s)) +#define gvld1_u64(d, s) d.u64 = vld1_u64((const u64 *)(s)) #define gvld1q_u8(d, s) d.u8 = vld1q_u8(s) #define gvld1q_u16(d, s) d.u16 = vld1q_u16(s) #define gvld1q_u32(d, s) d.u32 = vld1q_u32((const u32 *)(s)) @@ -206,6 +207,8 @@ typedef union #define gvst1_u8(v, p) \ vst1_u8(p, v.u8) +#define gvst1_u64(v, p) \ + vst1_u64((u64 *)(p), v.u64) #define gvst1q_u16(v, p) \ vst1q_u16(p, v.u16) #define gvst1q_inc_u32(v, p, i) { \ @@ -388,10 +391,14 @@ typedef union #define gvld1_u8(d, s) d.m = _mm_loadu_si64(s) #define gvld1_u32 gvld1_u8 +#define gvld1_u64 gvld1_u8 #define gvld1q_u8(d, s) d.m = _mm_loadu_si128((__m128i *)(s)) #define gvld1q_u16 gvld1q_u8 #define gvld1q_u32 gvld1q_u8 +#define gvst1_u8(v, p) _mm_storeu_si64(p, v.m) +#define gvst1_u64 gvst1_u8 + #define gvst4_4_inc_u32(v0, v1, v2, v3, p, i) { \ __m128i t0 = _mm_unpacklo_epi32(v0.m, v1.m); \ __m128i t1 = _mm_unpacklo_epi32(v2.m, v3.m); \ @@ -1401,6 +1408,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, setup_spans_set_x4(alternate, down, alternate_active); \ height -= 4; \ } while(height > 0); \ + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \ + { \ + vec_2x32u tmp; \ + gvld1_u64(tmp, &span_uvrg_offset[height - 2]); \ + gvst1_u64(tmp, &span_uvrg_offset[height - 1]); \ + } \ } \ @@ -1452,6 +1465,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, setup_spans_set_x4(alternate, up, alternate_active); \ height -= 4; \ } \ + if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \ + { \ + vec_2x32u tmp; \ + gvld1_u64(tmp, &psx_gpu->span_uvrg_offset[1]); \ + gvst1_u64(tmp, &psx_gpu->span_uvrg_offset[0]); \ + } \ } \ #define half_left lo @@ -1714,6 +1733,12 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_spans_set_x4(none, down, no); height_minor_b -= 4; } + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) + { + vec_2x32u tmp; + gvld1_u64(tmp, &span_uvrg_offset[height_minor_b - 2]); + gvst1_u64(tmp, &span_uvrg_offset[height_minor_b - 1]); + } } } @@ -2152,6 +2177,15 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \ +#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \ + +#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \ +{ \ + u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \ + if (unlikely(psx_gpu->hacks_active & m_)) \ + setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, (void *)uvrg_offset); \ +} \ + #define setup_blocks_add_blocks_indirect() \ num_blocks += span_num_blocks; \ \ @@ -2211,6 +2245,8 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \ setup_blocks_store_draw_mask_##texturing##_##target(block, \ span_edge_data->right_mask); \ + setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \ + span_uvrg_offset); \ \ block++; \ } \ -- 2.39.2 From 8847df50c67c19c605f60a109d30556b74d08eee Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 23 Aug 2024 00:09:55 +0300 Subject: [PATCH 13/16] gpu_neon: enable tex hack by default --- frontend/libretro.c | 2 +- frontend/libretro_core_options.h | 6 +++--- frontend/main.c | 1 + frontend/menu.c | 8 +++++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/frontend/libretro.c b/frontend/libretro.c index 8cbae419..80b1739b 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -2453,7 +2453,7 @@ static void update_variables(bool in_flight) } var.value = NULL; - var.key = "pcsx_rearmed_neon_enhancement_tex_adj"; + var.key = "pcsx_rearmed_neon_enhancement_tex_adj_v2"; if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) { diff --git a/frontend/libretro_core_options.h b/frontend/libretro_core_options.h index e4995277..bcc34706 100644 --- a/frontend/libretro_core_options.h +++ b/frontend/libretro_core_options.h @@ -558,10 +558,10 @@ struct retro_core_option_v2_definition option_defs_us[] = { "disabled", }, { - "pcsx_rearmed_neon_enhancement_tex_adj", + "pcsx_rearmed_neon_enhancement_tex_adj_v2", "(GPU) Enhanced Resolution Texture Adjustment", "Enhanced Resolution Texture Adjustment", - "('Enhanced Resolution' Hack) Attempts to solve some texturing issues in some games, but causes new ones in others.", + "('Enhanced Resolution' Hack) Solves some texturing issues in some games in Enhanced Resolution mode. May cause a small performance hit.", NULL, "gpu_neon", { @@ -569,7 +569,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { { "enabled", NULL }, { NULL, NULL }, }, - "disabled", + "enabled", }, #endif /* GPU_NEON */ #ifdef GPU_PEOPS diff --git a/frontend/main.c b/frontend/main.c index 1318e195..4c051e2c 100644 --- a/frontend/main.c +++ b/frontend/main.c @@ -143,6 +143,7 @@ void emu_set_default_config(void) pl_rearmed_cbs.gpu_neon.allow_interlace = 2; // auto pl_rearmed_cbs.gpu_neon.enhancement_enable = pl_rearmed_cbs.gpu_neon.enhancement_no_main = 0; + pl_rearmed_cbs.gpu_neon.enhancement_tex_adj = 1; pl_rearmed_cbs.gpu_peops.iUseDither = 0; pl_rearmed_cbs.gpu_peops.dwActFixes = 1<<7; pl_rearmed_cbs.gpu_unai.ilace_force = 0; diff --git a/frontend/menu.c b/frontend/menu.c index 9b9af7c8..15034a90 100644 --- a/frontend/menu.c +++ b/frontend/menu.c @@ -454,7 +454,7 @@ static const struct { CE_INTVAL_P(gpu_neon.allow_interlace), CE_INTVAL_P(gpu_neon.enhancement_enable), CE_INTVAL_P(gpu_neon.enhancement_no_main), - CE_INTVAL_P(gpu_neon.enhancement_tex_adj), + CE_INTVAL_PV(gpu_neon.enhancement_tex_adj, 2), CE_INTVAL_P(gpu_peopsgl.bDrawDither), CE_INTVAL_P(gpu_peopsgl.iFilterType), CE_INTVAL_P(gpu_peopsgl.iFrameTexType), @@ -1411,10 +1411,12 @@ static int menu_loop_gfx_options(int id, int keys) static const char h_gpu_neon[] = "Configure built-in NEON GPU plugin"; static const char h_gpu_neon_enhanced[] = - "Renders in double resolution at the cost of lower performance\n" + "Renders in double resolution at perf. cost\n" "(not available for high resolution games)"; static const char h_gpu_neon_enhanced_hack[] = "Speed hack for above option (glitches some games)"; +static const char h_gpu_neon_enhanced_texadj[] = + "Solves some Enh. res. texture issues, some perf hit"; static const char *men_gpu_interlace[] = { "Off", "On", "Auto", NULL }; static menu_entry e_menu_plugin_gpu_neon[] = @@ -1422,7 +1424,7 @@ static menu_entry e_menu_plugin_gpu_neon[] = mee_enum ("Enable interlace mode", 0, pl_rearmed_cbs.gpu_neon.allow_interlace, men_gpu_interlace), mee_onoff_h ("Enhanced resolution", 0, pl_rearmed_cbs.gpu_neon.enhancement_enable, 1, h_gpu_neon_enhanced), mee_onoff_h ("Enhanced res. speed hack", 0, pl_rearmed_cbs.gpu_neon.enhancement_no_main, 1, h_gpu_neon_enhanced_hack), - mee_onoff ("Enh. res. texture adjust", 0, pl_rearmed_cbs.gpu_neon.enhancement_tex_adj, 1), + mee_onoff_h ("Enh. res. texture adjust", 0, pl_rearmed_cbs.gpu_neon.enhancement_tex_adj, 1, h_gpu_neon_enhanced_texadj), mee_end, }; -- 2.39.2 From 3107c849c014f9f26676ac7f8bc2592cb5c5de04 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 2 Sep 2024 12:54:37 +0200 Subject: [PATCH 14/16] git subrepo pull --force deps/lightrec subrepo: subdir: "deps/lightrec" merged: "ea20362c95" upstream: origin: "https://github.com/pcercuei/lightrec.git" branch: "master" commit: "ea20362c95" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo.git" commit: "110b9eb" --- deps/lightrec/.gitrepo | 4 +-- deps/lightrec/constprop.c | 11 +++++- deps/lightrec/emitter.c | 76 ++++++++++++++++++--------------------- deps/lightrec/lightrec.c | 7 ++++ deps/lightrec/regcache.c | 12 +++++++ deps/lightrec/regcache.h | 2 ++ 6 files changed, 68 insertions(+), 44 deletions(-) diff --git a/deps/lightrec/.gitrepo b/deps/lightrec/.gitrepo index 69811196..0d3c14bf 100644 --- a/deps/lightrec/.gitrepo +++ b/deps/lightrec/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/pcercuei/lightrec.git branch = master - commit = 601afca8e889bdda7040ff5c64f7bbd20d1d5f2c - parent = 459f02ad03fa10b5c403fed724d47fe5adfd5fb1 + commit = ea20362c9542f12fb6a0f27aa7df66b2af06b84d + parent = 8847df50c67c19c605f60a109d30556b74d08eee method = merge cmdver = 0.4.6 diff --git a/deps/lightrec/constprop.c b/deps/lightrec/constprop.c index 97670bcf..d5002a82 100644 --- a/deps/lightrec/constprop.c +++ b/deps/lightrec/constprop.c @@ -59,7 +59,7 @@ static void lightrec_propagate_addi(u32 rs, u32 rd, const struct constprop_data *d, struct constprop_data *v) { - u32 end, bit, sum, min, mask, imm, value; + u32 end, bit, sum, min, max, mask, imm, value; struct constprop_data result = { .value = v[rd].value, .known = v[rd].known, @@ -110,6 +110,15 @@ static void lightrec_propagate_addi(u32 rs, u32 rd, * sign bits are known. */ min = get_min_value(&v[rs]) + get_min_value(d); + max = get_max_value(&v[rs]) + + get_max_value(d); + + /* The sum may have less sign bits */ + if ((s32)min < 0) + mask &= min & max; + else + mask &= ~(min | mask); + result.value = (min & mask) | (result.value & ~mask); result.known |= mask << carry; diff --git a/deps/lightrec/emitter.c b/deps/lightrec/emitter.c index a59ff1d7..f84f049f 100644 --- a/deps/lightrec/emitter.c +++ b/deps/lightrec/emitter.c @@ -1300,7 +1300,7 @@ static void rec_store_memory(struct lightrec_cstate *cstate, struct opcode *op = &block->opcode_list[offset]; jit_state_t *_jit = block->_jit; union code c = op->c; - u8 rs, rt, tmp = 0, tmp2 = 0, tmp3, addr_reg, addr_reg2; + u8 rs, rt, tmp = 0, tmp2 = 0, tmp3, addr_reg, addr_reg2, src_reg; s16 imm = (s16)c.i.imm; s32 simm = (s32)imm << (1 - lut_is_32bit(state)); s32 lut_offt = lightrec_offset(code_lut); @@ -1342,25 +1342,23 @@ static void rec_store_memory(struct lightrec_cstate *cstate, } rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); + src_reg = rt; if (is_big_endian() && swap_code && in_reg) { tmp3 = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp3, rt); - if (c.i.op == OP_META_SWU) - jit_unstr(addr_reg2, tmp3, LIGHTNING_UNALIGNED_32BIT); - else - jit_new_node_www(code, imm, addr_reg2, tmp3); - - lightrec_free_reg(reg_cache, tmp3); - } else if (c.i.op == OP_META_SWU) { - jit_unstr(addr_reg2, rt, LIGHTNING_UNALIGNED_32BIT); - } else { - jit_new_node_www(code, imm, addr_reg2, rt); + lightrec_free_reg(reg_cache, rt); + src_reg = tmp3; } - lightrec_free_reg(reg_cache, rt); + if (c.i.op == OP_META_SWU) + jit_unstr(addr_reg2, src_reg, LIGHTNING_UNALIGNED_32BIT); + else + jit_new_node_www(code, imm, addr_reg2, src_reg); + + lightrec_free_reg(reg_cache, src_reg); if (invalidate) { tmp3 = lightrec_alloc_reg_in(reg_cache, _jit, 0, 0); @@ -1445,7 +1443,7 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_end; bool swc2 = c.i.op == OP_SWC2; - u8 addr_reg, tmp, tmp2 = 0, rs, rt, in_reg = swc2 ? REG_TEMP : c.i.rt; + u8 addr_reg, tmp, tmp2 = 0, rs, rt, src_reg, in_reg = swc2 ? REG_TEMP : c.i.rt; s16 imm; jit_note(__FILE__, __LINE__); @@ -1489,25 +1487,23 @@ static void rec_store_direct_no_invalidate(struct lightrec_cstate *cstate, } rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); + src_reg = rt; if (is_big_endian() && swap_code && in_reg) { tmp2 = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp2, rt); + src_reg = tmp2; - if (c.i.op == OP_META_SWU) - jit_unstr(tmp, tmp2, LIGHTNING_UNALIGNED_32BIT); - else - jit_new_node_www(code, imm, tmp, tmp2); - - lightrec_free_reg(reg_cache, tmp2); - } else if (c.i.op == OP_META_SWU) { - jit_unstr(tmp, rt, LIGHTNING_UNALIGNED_32BIT); - } else { - jit_new_node_www(code, imm, tmp, rt); + lightrec_free_reg(reg_cache, rt); } - lightrec_free_reg(reg_cache, rt); + if (c.i.op == OP_META_SWU) + jit_unstr(tmp, src_reg, LIGHTNING_UNALIGNED_32BIT); + else + jit_new_node_www(code, imm, tmp, src_reg); + + lightrec_free_reg(reg_cache, src_reg); lightrec_free_reg(reg_cache, tmp); } @@ -1521,7 +1517,7 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block jit_state_t *_jit = block->_jit; jit_node_t *to_not_ram, *to_end; bool swc2 = c.i.op == OP_SWC2; - u8 addr_reg, tmp, tmp2, tmp3, rs, rt, reg_imm; + u8 src_reg, addr_reg, tmp, tmp2, tmp3, rs, rt, reg_imm; u8 in_reg = swc2 ? REG_TEMP : c.i.rt; u32 mask; bool different_offsets = state->offset_ram != state->offset_scratch; @@ -1602,25 +1598,23 @@ static void rec_store_direct(struct lightrec_cstate *cstate, const struct block lightrec_free_reg(reg_cache, reg_imm); rt = lightrec_alloc_reg_in(reg_cache, _jit, in_reg, 0); + src_reg = rt; if (is_big_endian() && swap_code && in_reg) { tmp = lightrec_alloc_reg_temp(reg_cache, _jit); jit_new_node_ww(swap_code, tmp, rt); + src_reg = tmp; - if (c.i.op == OP_META_SWU) - jit_unstr(tmp2, tmp, LIGHTNING_UNALIGNED_32BIT); - else - jit_new_node_www(code, 0, tmp2, tmp); - - lightrec_free_reg(reg_cache, tmp); - } else if (c.i.op == OP_META_SWU) { - jit_unstr(tmp2, rt, LIGHTNING_UNALIGNED_32BIT); - } else { - jit_new_node_www(code, 0, tmp2, rt); + lightrec_free_reg(reg_cache, rt); } - lightrec_free_reg(reg_cache, rt); + if (c.i.op == OP_META_SWU) + jit_unstr(tmp2, src_reg, LIGHTNING_UNALIGNED_32BIT); + else + jit_new_node_www(code, 0, tmp2, src_reg); + + lightrec_free_reg(reg_cache, src_reg); lightrec_free_reg(reg_cache, tmp2); } @@ -1882,19 +1876,19 @@ static void rec_load_direct(struct lightrec_cstate *cstate, else addr_mask = 0x1fffffff; - reg_imm = lightrec_alloc_reg_temp_with_value(reg_cache, _jit, - addr_mask); if (!state->mirrors_mapped) { + reg_imm = lightrec_alloc_reg_temp_with_value(reg_cache, _jit, + addr_mask); jit_andi(tmp, addr_reg, BIT(28)); jit_rshi_u(tmp, tmp, 28 - 22); jit_orr(tmp, tmp, reg_imm); jit_andr(rt, addr_reg, tmp); + + lightrec_free_reg(reg_cache, reg_imm); } else { - jit_andr(rt, addr_reg, reg_imm); + rec_and_mask(cstate, _jit, rt, addr_reg, addr_mask); } - lightrec_free_reg(reg_cache, reg_imm); - if (state->offset_ram) { offt_reg = lightrec_get_reg_with_value(reg_cache, state->offset_ram); diff --git a/deps/lightrec/lightrec.c b/deps/lightrec/lightrec.c index ae170531..5f6a8712 100644 --- a/deps/lightrec/lightrec.c +++ b/deps/lightrec/lightrec.c @@ -3,6 +3,7 @@ * Copyright (C) 2014-2021 Paul Cercueil */ +#include "arch.h" #include "blockcache.h" #include "debug.h" #include "disassembler.h" @@ -1138,6 +1139,9 @@ static struct block * generate_dispatcher(struct lightrec_state *state) loop = jit_label(); + if (!arch_has_fast_mask()) + jit_movi(JIT_R1, 0x1fffffff); + /* Call the block's code */ jit_jmpr(JIT_V1); @@ -1582,6 +1586,9 @@ int lightrec_compile_block(struct lightrec_cstate *cstate, if (OPT_PRELOAD_PC && (block->flags & BLOCK_PRELOAD_PC)) lightrec_preload_pc(cstate->reg_cache, _jit); + if (!arch_has_fast_mask()) + lightrec_preload_imm(cstate->reg_cache, _jit, JIT_R1, 0x1fffffff); + cstate->cycles = 0; cstate->nb_local_branches = 0; cstate->nb_targets = 0; diff --git a/deps/lightrec/regcache.c b/deps/lightrec/regcache.c index 41d37789..51ce9b9a 100644 --- a/deps/lightrec/regcache.c +++ b/deps/lightrec/regcache.c @@ -699,6 +699,18 @@ void lightrec_preload_pc(struct regcache *cache, jit_state_t *_jit) jit_live(JIT_V0); } +void lightrec_preload_imm(struct regcache *cache, jit_state_t *_jit, + u8 jit_reg, u32 imm) +{ + struct native_register *nreg; + + nreg = lightning_reg_to_lightrec(cache, jit_reg); + nreg->prio = REG_IS_TEMP_VALUE; + nreg->value = imm; + + jit_live(jit_reg); +} + struct regcache * lightrec_regcache_init(struct lightrec_state *state) { struct regcache *cache; diff --git a/deps/lightrec/regcache.h b/deps/lightrec/regcache.h index 23a775ce..a8db39f3 100644 --- a/deps/lightrec/regcache.h +++ b/deps/lightrec/regcache.h @@ -70,6 +70,8 @@ void lightrec_set_reg_out_flags(struct regcache *cache, u8 jit_reg, u8 flags); void lightrec_regcache_reset(struct regcache *cache); void lightrec_preload_pc(struct regcache *cache, jit_state_t *_jit); +void lightrec_preload_imm(struct regcache *cache, jit_state_t *_jit, + u8 jit_reg, u32 imm); void lightrec_free_reg(struct regcache *cache, u8 jit_reg); void lightrec_free_regs(struct regcache *cache); -- 2.39.2 From eb2ea89a883237a5a348c8ce742a401d24132cd7 Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 4 Sep 2024 00:17:32 +0300 Subject: [PATCH 15/16] libretro: use shorter option names to fit narrow frontends --- frontend/libretro_core_options.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/libretro_core_options.h b/frontend/libretro_core_options.h index bcc34706..47a3ed4e 100644 --- a/frontend/libretro_core_options.h +++ b/frontend/libretro_core_options.h @@ -151,7 +151,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { }, { "pcsx_rearmed_memcard2", - "Enable Second Memory Card (Shared)", + "Second Memory Card (Shared)", NULL, "Emulate a second memory card in slot 2. This will be shared by all games.", NULL, @@ -455,7 +455,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { }, { "pcsx_rearmed_show_overscan", - "(GPU) Show horizontal overscan", + "(GPU) Horizontal overscan", NULL, "The PSX can display graphics way into the horizontal borders, even if most screens would crop it. This option tries to display all such graphics. Note that this may result in unusual resolutions that your device might not handle well. The 'Hack' option is intended for the widescreen hacks.", NULL, @@ -487,7 +487,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { #define V(x) { #x, NULL } { "pcsx_rearmed_screen_centering_x", - "(GPU) Manual screen centering X", + "(GPU) Manual position X", NULL, "X offset of the frame buffer. Only effective when 'Screen centering' is set to 'Manual'.", NULL, @@ -500,7 +500,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { }, { "pcsx_rearmed_screen_centering_y", - "(GPU) Manual screen centering Y", + "(GPU) Manual position Y", NULL, "Y offset of the frame buffer. Only effective when 'Screen centering' is set to 'Manual'.", NULL, @@ -546,7 +546,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { { "pcsx_rearmed_neon_enhancement_no_main", "(GPU) Enhanced Resolution Speed Hack", - "Enhanced Resolution Speed Hack", + "Enh. Res. Speed Hack", "('Enhanced Resolution' Hack) Improves performance but reduces compatibility and may cause rendering errors.", NULL, "gpu_neon", @@ -560,7 +560,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { { "pcsx_rearmed_neon_enhancement_tex_adj_v2", "(GPU) Enhanced Resolution Texture Adjustment", - "Enhanced Resolution Texture Adjustment", + "Enh. Res. Texture Fixup", "('Enhanced Resolution' Hack) Solves some texturing issues in some games in Enhanced Resolution mode. May cause a small performance hit.", NULL, "gpu_neon", -- 2.39.2 From 237887e817e23800997466632deb8ba63797a4cb Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 4 Sep 2024 00:19:25 +0300 Subject: [PATCH 16/16] libretro: improve retro_memory_map libretro/pcsx_rearmed#845 --- frontend/libretro.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/frontend/libretro.c b/frontend/libretro.c index 80b1739b..1ad39b51 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -1851,17 +1851,21 @@ strcasestr(const char *s, const char *find) static void set_retro_memmap(void) { -#ifndef NDEBUG + uint64_t flags_ram = RETRO_MEMDESC_SYSTEM_RAM; struct retro_memory_map retromap = { 0 }; - struct retro_memory_descriptor mmap = { - 0, psxM, 0, 0, 0, 0, 0x200000 + struct retro_memory_descriptor descs[] = { + { flags_ram, psxM, 0, 0x00000000, 0x5fe00000, 0, 0x200000 }, + { flags_ram, psxH, 0, 0x1f800000, 0x7ffffc00, 0, 0x000400 }, + // not ram but let the frontend patch it if it wants; should be last + { flags_ram, psxR, 0, 0x1fc00000, 0x5ff80000, 0, 0x080000 }, }; - retromap.descriptors = &mmap; - retromap.num_descriptors = 1; + retromap.descriptors = descs; + retromap.num_descriptors = sizeof(descs) / sizeof(descs[0]); + if (Config.HLE) + retromap.num_descriptors--; environ_cb(RETRO_ENVIRONMENT_SET_MEMORY_MAPS, &retromap); -#endif } static void show_notification(const char *msg_str, -- 2.39.2