From: notaz Date: Mon, 18 Nov 2024 01:15:32 +0000 (+0200) Subject: rect quad optimizations X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f060f4be678b55bc3e1c3f3d5b3cf4baff84ef3a;p=pcsx_rearmed.git rect quad optimizations --- diff --git a/Makefile b/Makefile index 39b5fbaf..83559310 100644 --- a/Makefile +++ b/Makefile @@ -229,7 +229,7 @@ plugins/dfsound/out.o: CFLAGS += -DHAVE_LIBRETRO endif # builtin gpu -OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o +OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o plugins/gpulib/prim.o ifeq "$(BUILTIN_GPU)" "neon" CFLAGS += -DGPU_NEON OBJS += plugins/gpu_neon/psx_gpu_if.o diff --git a/include/compiler_features.h b/include/compiler_features.h index 21549ddf..77114efb 100644 --- a/include/compiler_features.h +++ b/include/compiler_features.h @@ -2,6 +2,7 @@ #ifdef __GNUC__ # define likely(x) __builtin_expect((x),1) # define unlikely(x) __builtin_expect((x),0) +# define preload __builtin_prefetch # ifdef __clang__ # define noinline __attribute__((noinline)) # else @@ -11,6 +12,7 @@ #else # define likely(x) (x) # define unlikely(x) (x) +# define preload(...) /* no-op: no prefetch builtin outside GNU C */ # define noinline # define attr_unused #endif diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index e78feaf2..1fa06a15 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -16,6 +16,7 @@ #include "common.h" #include "../../gpulib/gpu_timing.h" +#include "../../gpulib/gpu.h" #ifndef command_lengths const u8 command_lengths[256] = @@ -245,12 +246,27 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y, #define SET_Ex(r, v) #endif +static void textured_sprite(psx_gpu_struct *psx_gpu, const u32 *list, + s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) +{ + s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); + s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); + u8 v = (list[2] >> 8) & 0xff; + u8 u = list[2] & 
0xff; + + set_clut(psx_gpu, list[2] >> 16); + + render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); + gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); +} + u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command) { vertex_struct vertexes[4] __attribute__((aligned(16))) = {}; u32 current_command = 0, command_length; u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last; + u32 siplified_prim[4*4]; u32 *list_start = list; u32 *list_end = list + (size / 4); @@ -328,8 +344,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x2C ... 0x2F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[9]); + u32 i, simplified_count; + set_texture(psx_gpu, list[4] >> 16); + if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); set_triangle_color(psx_gpu, list[0] & 0xFFFFFF); get_vertex_data_xy_uv(0, 2); @@ -383,8 +410,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x3C ... 
0x3F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[11]); + u32 i, simplified_count; + set_texture(psx_gpu, list[5] >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); get_vertex_data_xy_uv_rgb(0, 0); get_vertex_data_xy_uv_rgb(1, 6); @@ -525,23 +563,12 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); break; } - - case 0x64 ... 0x67: - { - u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = list_s16[6] & 0x3FF; - s32 height = list_s16[7] & 0x1FF; - - set_clut(psx_gpu, list_s16[5]); - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + case 0x64 ... 0x67: + textured_sprite(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF, + &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x68 ... 0x6B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -565,22 +592,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); break; } - - case 0x74 ... 0x77: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = 8, height = 8; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + case 0x74 ... 
0x77: + textured_sprite(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x78 ... 0x7B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -594,19 +610,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, } case 0x7C ... 0x7F: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = 16, height = 16; - - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + textured_sprite(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles); break; - } #ifdef PCSX case 0x1F: // irq? @@ -1155,12 +1160,31 @@ static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y, } #endif +static void textured_sprite_enh(psx_gpu_struct *psx_gpu, const u32 *list, + s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) +{ + s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); + s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); + s32 width_b = width, height_b = height; + u8 v = (list[2] >> 8) & 0xff; + u8 u = list[2] & 0xff; + + set_clut(psx_gpu, list[2] >> 16); + + render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); + gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); + + if (check_enhanced_range(psx_gpu, x, x + width)) + do_sprite_enhanced(psx_gpu, x, y, u, v, width_b, height_b, list[0]); +} + u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command) { vertex_struct vertexes[4] __attribute__((aligned(16))) = {}; u32 current_command = 0, command_length; u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last; + u32 siplified_prim[4*4]; u32 *list_start = list; u32 *list_end = list + (size / 4); @@ -1265,8 +1289,19 @@ u32 
gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x2C ... 0x2F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[9]); + u32 i, simplified_count; + set_texture(psx_gpu, list[4] >> 16); + if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); set_triangle_color(psx_gpu, list[0] & 0xFFFFFF); get_vertex_data_xy_uv(0, 2); @@ -1318,8 +1353,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x3C ... 0x3F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[11]); + u32 i, simplified_count; + set_texture(psx_gpu, list[5] >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); get_vertex_data_xy_uv_rgb(0, 0); get_vertex_data_xy_uv_rgb(1, 6); @@ -1475,30 +1521,12 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, } break; } - - case 0x64 ... 
0x67: - { - u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = list_s16[6] & 0x3FF; - s32 height = list_s16[7] & 0x1FF; - - set_clut(psx_gpu, list_s16[5]); - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + width)) { - width = list_s16[6] & 0x3FF; - height = list_s16[7] & 0x1FF; - do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]); - } + case 0x64 ... 0x67: + textured_sprite_enh(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF, + &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x68 ... 0x6B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -1528,26 +1556,11 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]); break; } - - case 0x74 ... 0x77: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = 8, height = 8; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + 8)) - do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]); + case 0x74 ... 0x77: + textured_sprite_enh(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x78 ... 0x7B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -1562,25 +1575,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]); break; } - - case 0x7C ... 
0x7F: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = 16, height = 16; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + 16)) - do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]); + case 0x7C ... 0x7F: + textured_sprite_enh(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles); break; - } case 0x80 ... 0x9F: // vid -> vid case 0xA0 ... 0xBF: // sys -> vid diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp index c7169dd6..0f124f80 100644 --- a/plugins/gpu_unai/gpulib_if.cpp +++ b/plugins/gpu_unai/gpulib_if.cpp @@ -375,6 +375,36 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word) #endif #include "../gpulib/gpu_timing.h" + +static inline void textured_sprite(int &cpu_cycles_sum, int &cpu_cycles) +{ + u32 PRIM = le32_to_u32(gpu_unai.PacketBuffer.U4[0]) >> 24; + gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + s32 w = 0, h = 0; + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + // This fixes Silent Hill running animation on loading screens: + // (On PSX, color values 0x00-0x7F darken the source texture's color, + // 0x81-FF lighten textures (ultimately clamped to 0x1F), + // 0x80 leaves source texture color unchanged, HOWEVER, + // gpu_unai uses a simple lighting LUT whereby only the upper + // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as + // 0x80. + // + // NOTE: I've changed all textured sprite draw commands here and + // elsewhere to use proper behavior, but left poly commands + // alone, I don't want to slow rendering down too much. 
(TODO) + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) + driver_idx |= Lighting; + PS driver = gpuSpriteDrivers[driver_idx]; + PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer }; + gpuDrawS(packet, driver, &w, &h); + gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); +} + extern const unsigned char cmd_lengths[256]; int do_cmd_list(u32 *list_, int list_len, @@ -468,8 +498,20 @@ int do_cmd_list(u32 *list_, int list_len, case 0x2D: case 0x2E: case 0x2F: { // Textured 4-pt poly - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); + u32 simplified_count; gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[4]) >> 16); + if ((simplified_count = prim_try_simplify_quad_t(gpu_unai.PacketBuffer.U4, + gpu_unai.PacketBuffer.U4))) + { + for (i = 0;; ) { + textured_sprite(cpu_cycles_sum, cpu_cycles); + if (++i >= simplified_count) + break; + memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16); + } + break; + } + gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); u32 driver_idx = //(gpu_unai.blit_mask?1024:0) | @@ -542,8 +584,20 @@ int do_cmd_list(u32 *list_, int list_len, case 0x3D: case 0x3E: case 0x3F: { // Gouraud-shaded, textured 4-pt poly - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16); + u32 simplified_count; + gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(gpu_unai.PacketBuffer.U4, + gpu_unai.PacketBuffer.U4))) + { + for (i = 0;; ) { + textured_sprite(cpu_cycles_sum, cpu_cycles); + if (++i >= simplified_count) + break; + memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16); + } + break; + } + 
gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); PP driver = gpuPolySpanDrivers[ //(gpu_unai.blit_mask?1024:0) | Dithering | @@ -651,31 +705,9 @@ int do_cmd_list(u32 *list_, int list_len, case 0x64: case 0x65: case 0x66: - case 0x67: { // Textured rectangle (variable size) - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - - //senquack - Only color 808080h-878787h allows skipping lighting calculation: - // This fixes Silent Hill running animation on loading screens: - // (On PSX, color values 0x00-0x7F darken the source texture's color, - // 0x81-FF lighten textures (ultimately clamped to 0x1F), - // 0x80 leaves source texture color unchanged, HOWEVER, - // gpu_unai uses a simple lighting LUT whereby only the upper - // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as - // 0x80. - // - // NOTE: I've changed all textured sprite draw commands here and - // elsewhere to use proper behavior, but left poly commands - // alone, I don't want to slow rendering down too much. 
(TODO) - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); - } break; + case 0x67: // Textured rectangle (variable size) + textured_sprite(cpu_cycles_sum, cpu_cycles); + break; case 0x68: case 0x69: @@ -704,18 +736,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x76: case 0x77: { // Textured rectangle (8x8) gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00080008); - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - - //senquack - Only color 808080h-878787h allows skipping lighting calculation: - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); + textured_sprite(cpu_cycles_sum, cpu_cycles); } break; case 0x78: @@ -734,17 +755,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x7E: case 0x7F: { // Textured rectangle (16x16) gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010); - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - //senquack - Only color 808080h-878787h allows skipping lighting 
calculation: - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); + textured_sprite(cpu_cycles_sum, cpu_cycles); } break; #ifdef TEST diff --git a/plugins/gpulib/Makefile b/plugins/gpulib/Makefile index cff61410..53aaa886 100644 --- a/plugins/gpulib/Makefile +++ b/plugins/gpulib/Makefile @@ -5,7 +5,7 @@ endif include ../../config.mak -OBJS += gpu.o +OBJS += gpu.o prim.o ifeq "$(ARCH)" "arm" OBJS += vout_pl.o diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index df1c1c6c..88aa6704 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -17,23 +17,11 @@ #include "gpu_timing.h" #include "../../libpcsxcore/gpu.h" // meh #include "../../frontend/plugin_lib.h" +#include "../../include/compiler_features.h" #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif -#ifdef __GNUC__ -# define unlikely(x) __builtin_expect((x), 0) -# define preload __builtin_prefetch -# ifndef __clang__ -# define noinline __attribute__((noinline,noclone)) -# else -# define noinline __attribute__((noinline)) -# endif -#else -# define unlikely(x) -# define preload(...) -# define noinline -#endif //#define log_io gpu_log #define log_io(...) 
diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h index e654500d..570d8421 100644 --- a/plugins/gpulib/gpu.h +++ b/plugins/gpulib/gpu.h @@ -147,6 +147,9 @@ void vout_update(void); void vout_blank(void); void vout_set_config(const struct rearmed_cbs *config); +int prim_try_simplify_quad_t (void *simplified, const void *prim); +int prim_try_simplify_quad_gt(void *simplified, const void *prim); + /* listing these here for correct linkage if rasterizer uses c++ */ struct GPUFreeze; diff --git a/plugins/gpulib/prim.c b/plugins/gpulib/prim.c new file mode 100644 index 00000000..d6294641 --- /dev/null +++ b/plugins/gpulib/prim.c @@ -0,0 +1,249 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "../../include/compiler_features.h" +#include "gpu.h" + +// retain neon's ability to sample textures pixel-perfectly +#ifdef GPU_NEON +#define STRICT +#endif + +struct vert_t +{ + union { + struct { + int16_t x, y; + }; + uint32_t xy; + }; + union { + struct { + uint8_t u, v; + int16_t clut; + }; + uint32_t uvclut; + }; +}; + +// gt ~ gouraud textured +struct vert_gt +{ + uint32_t rgb; + struct vert_t t; +}; + +struct quad_t +{ + uint32_t rgb_c; + struct vert_t v[4]; +}; + +struct quad_gt +{ + struct vert_gt v[4]; +}; + +struct sprite +{ + uint32_t rgb_c; + union { + struct { + int16_t x, y; + }; + uint32_t xy; + }; + union { + struct { + uint8_t u, v; + int16_t clut; + }; + uint32_t uvclut; + }; + int16_t w, h; +}; + +// debug +#if 0 +static void log_quad_t(const struct quad_t *q, int ret) +{ +#if 1 + printf("quad_t %08x", q->rgb_c); + int i; + for (i = 0; i < 4; i++) + printf(" | %3d,%3d %3d,%3d", + q->v[i].x, q->v[i].y, q->v[i].u, q->v[i].v); + printf(" -> %d\n", ret); +#endif +} + +static void log_quad_gt(const struct vert_gt *v, int ret) +{ +#if 1 + printf("quad_gt %02x", v[0].rgb >> 24); + int i; + for (i = 0; i < 4; i++) + printf(" | %3d,%3d %3d,%3d %06x", + v[i].t.x, v[i].t.y, v[i].t.u, v[i].t.v, v[i].rgb & 0xffffff); + printf(" -> %d\n", ret); +#endif +} + +int 
prim_try_simplify_quad_t_(void *simplified, const void *prim_); +int prim_try_simplify_quad_t(void *simplified, const void *prim_) +{ + struct quad_t prim = *(struct quad_t *)prim_; + int ret = prim_try_simplify_quad_t_(simplified, prim_); + #define prim_try_simplify_quad_t prim_try_simplify_quad_t_ + ///if (!ret) + log_quad_t(&prim, ret); + return ret; +} + +int prim_try_simplify_quad_gt_(void *simplified, const void *prim_); +int prim_try_simplify_quad_gt(void *simplified, const void *prim_) +{ + struct quad_gt prim = *(struct quad_gt *)prim_; + int ret = prim_try_simplify_quad_gt_(simplified, prim_); + #define prim_try_simplify_quad_gt prim_try_simplify_quad_gt_ + ///if (!ret) + log_quad_gt(prim.v, ret); + return ret; +} +#endif // debug + +static noinline int simplify_quad_t(void *simplified, const struct vert_t *v, + int xd, int ud, int yd, int vd, uint32_t rgb_c, uint16_t clut) +{ + struct sprite *s = simplified; + int ret = 1; + rgb_c &= HTOLE32(0x03ffffff); + rgb_c |= HTOLE32(0x64000000); + xd = abs(xd); + ud = abs(ud); + s[0].rgb_c = rgb_c; + s[0].xy = v->xy; + s[0].u = v->u; + s[0].v = v->v; + s[0].clut = clut; + s[0].w = HTOLE16(xd); + s[0].h = HTOLE16(yd); +#ifndef STRICT + if (xd != ud) { + int mid = xd / 2; + s[0].w = HTOLE16(mid); + s[1].rgb_c = rgb_c; + s[1].x = HTOLE16(LE16TOH(s[0].x) + mid); + s[1].y = s[0].y; + s[1].u = s[0].u + mid + ud - xd; + s[1].v = s[0].v; + s[1].clut = clut; + s[1].w = HTOLE16(xd - mid); + s[1].h = s[0].h; + ret = 2; + } + if (yd != vd) { + int i, mid = yd / 2, y = LE16TOH(s[0].y); + memcpy(s + ret, s, sizeof(s[0]) * ret); + for (i = 0; i < ret; i++) { + s[i].h = HTOLE16(mid); + s[ret+i].y = HTOLE16(y + mid); + s[ret+i].h = HTOLE16(yd - mid); + s[ret+i].v = s[0].v + mid + vd - yd; + } + ret *= 2; + } +#endif + return ret; +} + +// this is split to reduce gcc spilling +static noinline int prim_try_simplify_quad_t2(void *simplified, + const struct vert_t *v, uint32_t rgb_c) +{ + do { + int yd = LE16TOH(v[2].y) - 
LE16TOH(v[0].y); + int xd, ud, vd; + if (yd < 0) + break; + xd = LE16TOH(v[1].x) - LE16TOH(v[0].x); + ud = LE16TOH(v[1].u) - LE16TOH(v[0].u); + vd = LE16TOH(v[2].v) - LE16TOH(v[0].v); +#ifdef STRICT + if (xd != ud || yd != vd) +#else + if (abs(xd - ud) > 1 || abs(yd - vd) > 1) +#endif + break; + return simplify_quad_t(simplified, xd < 0 ? &v[1] : &v[0], + xd, ud, yd, vd, rgb_c, v[0].clut); + } + while (0); + return 0; +} + +static noinline int prim_try_simplify_quad_gt2(void *simplified, + const struct vert_gt *v) +{ + do { + int yd = LE16TOH(v[2].t.y) - LE16TOH(v[0].t.y); + int xd, ud, vd; + if (yd < 0) + break; + xd = LE16TOH(v[1].t.x) - LE16TOH(v[0].t.x); + ud = LE16TOH(v[1].t.u) - LE16TOH(v[0].t.u); + vd = LE16TOH(v[2].t.v) - LE16TOH(v[0].t.v); +#ifdef STRICT + if (xd != ud || yd != vd) +#else + if (abs(xd - ud) > 1 || abs(yd - vd) > 1) +#endif + break; + if (!(v[0].rgb & HTOLE32(1 << 24))) { // modulation/"lighting" + uint32_t i, xor = 0, rgb0 = v[0].rgb; + for (i = 1; i < 4; i++) + xor |= rgb0 ^ v[i].rgb; + if (xor & HTOLE32(0xf8f8f8)) + break; + } + return simplify_quad_t(simplified, xd < 0 ? 
&v[1].t : &v[0].t, + xd, ud, yd, vd, v[0].rgb, v[0].t.clut); + } + while (0); + return 0; +} + +// 2c-2f +int prim_try_simplify_quad_t(void *simplified, const void *prim_) +{ + const struct quad_t *prim = prim_; + const struct vert_t *v = prim->v; + int ret = 0; + do { + if (v[0].y != v[1].y || v[0].x != v[2].x || v[2].y != v[3].y || v[1].x != v[3].x) + break; + if (v[0].v != v[1].v || v[0].u != v[2].u || v[2].v != v[3].v || v[1].u != v[3].u) + break; + ret = prim_try_simplify_quad_t2(simplified, v, prim->rgb_c); + } + while (0); + return ret; +} + +// 3c-3f +int prim_try_simplify_quad_gt(void *simplified, const void *prim) +{ + const struct vert_gt *v = prim; + int ret = 0; + do { + if (v[0].t.y != v[1].t.y || v[0].t.x != v[2].t.x || v[2].t.y != v[3].t.y || v[1].t.x != v[3].t.x) + break; + if (v[0].t.v != v[1].t.v || v[0].t.u != v[2].t.u || v[2].t.v != v[3].t.v || v[1].t.u != v[3].t.u) + break; + ret = prim_try_simplify_quad_gt2(simplified, v); + } + while (0); + return ret; +} + +// vim:shiftwidth=2:expandtab