From: notaz Date: Tue, 20 Aug 2024 20:49:45 +0000 (+0300) Subject: gpu_neon: rework enh. res. texturing hack X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ee060c582cd44ba9cda6626466414c318a09b697;p=pcsx_rearmed.git gpu_neon: rework enh. res. texturing hack libretro/pcsx_rearmed#841 --- diff --git a/plugins/gpu_neon/psx_gpu/common.h b/plugins/gpu_neon/psx_gpu/common.h index 820dfbef..5881e2a0 100644 --- a/plugins/gpu_neon/psx_gpu/common.h +++ b/plugins/gpu_neon/psx_gpu/common.h @@ -9,7 +9,5 @@ #include "vector_types.h" #include "psx_gpu.h" -#define unlikely(x) __builtin_expect((x), 0) - #endif diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index a59e9cdc..19f1c199 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -560,8 +560,9 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu) y##set##_b.e[1] = vertex->b \ -void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a, - vertex_struct *b, vertex_struct *c) +void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, + const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b, + const vertex_struct * __restrict__ c) { u32 triangle_area = psx_gpu->triangle_area; u32 winding_mask_scalar; @@ -1163,6 +1164,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, setup_spans_set_x4(alternate, down, alternate_active); \ height -= 4; \ } while(height > 0); \ + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \ + span_uvrg_offset[height - 1].low = span_uvrg_offset[height - 2].low; \ } \ @@ -1216,6 +1219,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, setup_spans_set_x4(alternate, up, alternate_active); \ height -= 4; \ } \ + if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \ + psx_gpu->span_uvrg_offset[0].low = psx_gpu->span_uvrg_offset[1].low; \ } \ #define index_left 0 @@ -1452,6 +1457,11 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_spans_set_x4(none, down, no); height_minor_b -= 4; } + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) + { + span_uvrg_offset[height_minor_b - 1].low = + span_uvrg_offset[height_minor_b - 2].low; + } } left_split_triangles++; @@ -1459,6 +1469,41 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #endif +// this is some hacky mess, can this be improved somehow? +// ideally change things to not have to do this hack at all +void __attribute__((noinline)) +setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block, + edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset) +{ + size_t span_i = span_uvrg_offset - psx_gpu->span_uvrg_offset; + if (span_i != 0 && span_i != psx_gpu->num_spans - 1 + && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U)) + return; + u32 num_blocks = span_edge_data->num_blocks - 1; + s32 offset = __builtin_ctz(span_edge_data->right_mask | 0x100) - 1; + s32 toffset = 8 * num_blocks + offset - 1; + if (toffset < 0 && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U)) + return; + + toffset += span_edge_data->left_x; + s32 u_dx = psx_gpu->uvrg_dx.low.e[0]; + s32 v_dx = psx_gpu->uvrg_dx.low.e[1]; + u32 u = span_uvrg_offset->low.e[0]; + u32 v = span_uvrg_offset->low.e[1]; + u += u_dx * toffset; + v += v_dx * toffset; + u = (u >> 16) & psx_gpu->texture_mask_width; + v = (v >> 16) & psx_gpu->texture_mask_height; + if (!(psx_gpu->render_state_base & (TEXTURE_MODE_16BPP << 8))) { + // 4bpp 8bpp are swizzled + u32 u_ = u; + u = (u & 0x0f) | ((v & 0x0f) << 4); + v = (v & 0xf0) | (u_ >> 4); + } + assert(offset >= 0); + //assert(block->uv.e[offset] == ((v << 8) | u)); + block->uv.e[offset] = (v << 8) | u; +} #define dither_table_entry_normal(value) \ (value) \ @@ -1868,6 +1913,14 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \ +#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \ + +#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \ +{ \ + u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \ + if (unlikely(psx_gpu->hacks_active & m_)) \ + setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, uvrg_offset); \ +} \ #define setup_blocks_add_blocks_indirect() \ num_blocks += span_num_blocks; \ @@ -1938,6 +1991,8 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target( \ setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \ setup_blocks_store_draw_mask_##texturing##_##target(block, \ span_edge_data->right_mask); \ + setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \ + span_uvrg_offset); \ \ block++; \ } \ @@ -5016,8 +5071,10 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram) psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN; psx_gpu->saved_hres = 256; + psx_gpu->hacks_active = 0; - // check some offset + // check some offsets, asm relies on these + psx_gpu->reserved_a[(offsetof(psx_gpu_struct, test_mask) == 0) - 1] = 0; psx_gpu->reserved_a[(offsetof(psx_gpu_struct, blocks) == psx_gpu_blocks_offset) - 1] = 0; } diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h index 2539521b..f65351cf 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.h @@ -21,10 +21,17 @@ #define SPAN_DATA_BLOCKS_SIZE 32 +#define AHACK_TEXTURE_ADJ_U (1 << 0) +#define AHACK_TEXTURE_ADJ_V (1 << 1) + #ifndef __ASSEMBLER__ #include "vector_types.h" +#ifndef unlikely +#define unlikely(x) __builtin_expect((x), 0) +#endif + typedef enum { PRIMITIVE_TYPE_TRIANGLE = 0, @@ -189,6 +196,7 @@ typedef struct // enhancement stuff u16 *enhancement_buf_ptr; // main alloc u16 *enhancement_current_buf_ptr; // offset into above, 4 bufs + u32 hacks_active; // AHACK_TEXTURE_ADJ_U ... u32 saved_hres; s16 saved_viewport_start_x; s16 saved_viewport_start_y; @@ -205,7 +213,7 @@ typedef struct // Align up to 64 byte boundary to keep the upcoming buffers cache line // aligned, also make reachable with single immediate addition - u8 reserved_a[184 + 9*4 - 9*sizeof(void *)]; + u8 reserved_a[184 + 8*4 - 9*sizeof(void *)]; // 8KB block_struct blocks[MAX_BLOCKS_PER_ROW]; @@ -256,6 +264,9 @@ u32 texture_region_mask(s32 x1, s32 y1, s32 x2, s32 y2); void update_texture_8bpp_cache(psx_gpu_struct *psx_gpu); void flush_render_block_buffer(psx_gpu_struct *psx_gpu); +void setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block, + edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset); + void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram); u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command); diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index 1ba562b5..82738855 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -223,6 +223,7 @@ #ifdef __MACH__ #define flush_render_block_buffer _flush_render_block_buffer #define update_texture_8bpp_cache _update_texture_8bpp_cache +#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack #endif @ r0: psx_gpu @@ -543,6 +544,7 @@ function(compute_all_gradients) #define uvrg q14 #define uvrg_dy q15 +#define uv d28 #define alternate_x_16 d4 @@ -925,6 +927,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 1f; \ + add temp, span_uvrg_offset, height, lsl #4; \ + vldr uv, [temp, #(-16*2)]; \ + vstr uv, [temp, #(-16)]; \ + \ 1: \ @@ -986,6 +996,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #AHACK_TEXTURE_ADJ_V; \ + beq 1f; \ + add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ + vldr uv, [temp, #16]; \ + vstr uv, [temp, #0]; \ + \ 1: \ @@ -1216,6 +1234,14 @@ function(setup_spans_up_down) subs height_minor_b, height_minor_b, #4 bhi 2b + nop + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset] + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V) + beq 1f + add temp, span_uvrg_offset, height, lsl #4 + vldr uv, [temp, #(-16*2)] + vstr uv, [temp, #(-16)] + 1: setup_spans_epilogue() @@ -1256,6 +1282,7 @@ function(setup_spans_up_down) #define uvrg_dx_ptr r2 #define texture_mask_ptr r3 +#define hacks_active r6 #define dither_shift r8 #define dither_row r10 @@ -1273,6 +1300,7 @@ function(setup_spans_up_down) #define color_b r5 #undef uvrg +#undef uv #define u_block q0 #define v_block q1 @@ -1350,6 +1378,26 @@ function(setup_spans_up_down) #define setup_blocks_texture_unswizzled() \ +#define setup_blocks_uv_adj_hack_textured(hacks_active) \ + tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 91f; \ + /* see flush_render_block_buffer below for a reg saving note */ \ + vpush { texture_mask }; \ + vpush { uvrg_dx4 }; \ + \ + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + mov r12, span_uvrg_offset; \ + sub r1, block_ptr_a, #64; \ + mov r2, span_edge_data; \ + mov r3, r12; \ + bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */ \ + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + \ + vpop { uvrg_dx4 }; \ + vpop { texture_mask }; \ + vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ +91: \ + #define setup_blocks_shaded_textured_builder(swizzling) \ .align 3; \ @@ -1575,6 +1623,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1585,6 +1634,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ @@ -1599,7 +1650,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ + /* this callee-save reg saving may look unnecessary but it actually is */ \ + /* because the callee violates the ABI */ \ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ @@ -1776,6 +1828,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1786,6 +1839,8 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_edge_data, span_edge_data, #8; \ @@ -1798,7 +1853,6 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ @@ -2334,7 +2388,6 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { rg_dx4 }; \ \ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h index 2f8a6463..7c21d31c 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h @@ -36,6 +36,7 @@ #define psx_gpu_texture_mask_width_offset 0xfa #define psx_gpu_texture_mask_height_offset 0xfb #define psx_gpu_reciprocal_table_ptr_offset 0x108 +#define psx_gpu_hacks_active_offset 0x114 #define psx_gpu_blocks_offset 0x200 #define psx_gpu_span_uvrg_offset_offset 0x2200 #define psx_gpu_span_edge_data_offset 0x4200 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c index 9b378482..740df981 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c @@ -76,6 +76,7 @@ int main() //WRITE_OFFSET(f, clut_settings); //WRITE_OFFSET(f, texture_settings); WRITE_OFFSET(f, reciprocal_table_ptr); + WRITE_OFFSET(f, hacks_active); WRITE_OFFSET(f, blocks); WRITE_OFFSET(f, span_uvrg_offset); WRITE_OFFSET(f, span_edge_data); diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index 53f33e4c..f398695d 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -903,6 +903,7 @@ static void select_enhancement_buf(psx_gpu_struct *psx_gpu) psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \ psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \ psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \ + psx_gpu->hacks_active = 0; \ psx_gpu->uvrgb_phase = 0x8000; \ } @@ -917,7 +918,7 @@ static int enhancement_enable(psx_gpu_struct *psx_gpu) psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1; if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024) psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023; - psx_gpu->uvrgb_phase = 0x7fff; + //psx_gpu->uvrgb_phase = 0x7fff; return 1; } @@ -1018,73 +1019,29 @@ static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y) return 1; } -static int is_in_array(int val, int array[], int len) +static u32 uv_hack(psx_gpu_struct *psx_gpu, const vertex_struct *vertex_ptrs) { - int i; - for (i = 0; i < len; i++) - if (array[i] == val) - return 1; - return 0; -} - -static int make_members_unique(int array[], int len) -{ - int i, j; - for (i = j = 1; i < len; i++) - if (!is_in_array(array[i], array, j)) - array[j++] = array[i]; - - if (array[0] > array[1]) { - i = array[0]; array[0] = array[1]; array[1] = i; - } - return j; -} - -static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new) -{ - int i; - for (i = 0; i < count; i++) - if (vertex_ptrs[i].u == old) - vertex_ptrs[i].u = new; -} - -static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new) -{ - int i; - for (i = 0; i < count; i++) - if (vertex_ptrs[i].v == old) - vertex_ptrs[i].v = new; -} - -// this sometimes does more harm than good, like in PE2 -static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count) -{ - int i, u[4], v[4]; - - for (i = 0; i < vertex_count; i++) { - u[i] = vertex_ptrs[i].u; - v[i] = vertex_ptrs[i].v; - } - if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) { - if ((u[0] & 7) == 7) { - patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1); - //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]); - } - else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) { - patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1); - //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1); - } - } - if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) { - if ((v[0] & 7) == 7) { - patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1); - //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]); - } - else if ((v[1] & 7) == 0) { - patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1); - //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1); - } + int i, have_right_edge = 0, have_bottom_edge = 0, bad_u = 0, bad_v = 0; + u32 hacks = 0; + + for (i = 0; i < 3; i++) { + int j = (i + 1) % 3, k = (i + 2) % 3; + int du = abs((int)vertex_ptrs[i].u - (int)vertex_ptrs[j].u); + int dv = abs((int)vertex_ptrs[i].v - (int)vertex_ptrs[j].v); + if (du && (du & 7) != 7) + bad_u = 1; + if (dv && (dv & 7) != 7) + bad_v = 1; + if (vertex_ptrs[i].x == vertex_ptrs[j].x && vertex_ptrs[k].x < vertex_ptrs[j].x) + have_right_edge = 1; + if (vertex_ptrs[i].y == vertex_ptrs[j].y)// && vertex_ptrs[k].y < vertex_ptrs[j].y) + have_bottom_edge = 1; } + if (have_right_edge && bad_u) + hacks |= AHACK_TEXTURE_ADJ_U; + if (have_bottom_edge && bad_v) + hacks |= AHACK_TEXTURE_ADJ_V; + return hacks; } static void do_triangle_enhanced(psx_gpu_struct *psx_gpu, @@ -1104,6 +1061,8 @@ static void do_triangle_enhanced(psx_gpu_struct *psx_gpu, if (!enhancement_enable(psx_gpu)) return; + if ((current_command & RENDER_FLAGS_TEXTURE_MAP) && psx_gpu->hack_texture_adj) + psx_gpu->hacks_active |= uv_hack(psx_gpu, vertexes); shift_vertices3(vertex_ptrs); shift_triangle_area(); render_triangle_p(psx_gpu, vertex_ptrs, current_command); @@ -1314,8 +1273,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, get_vertex_data_xy_uv(2, 10); get_vertex_data_xy_uv(3, 14); - if (psx_gpu->hack_texture_adj) - uv_hack(vertexes, 4); do_quad_enhanced(psx_gpu, vertexes, current_command); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t()); break; @@ -1368,8 +1325,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, get_vertex_data_xy_uv_rgb(2, 12); get_vertex_data_xy_uv_rgb(3, 18); - if (psx_gpu->hack_texture_adj) - uv_hack(vertexes, 4); do_quad_enhanced(psx_gpu, vertexes, current_command); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt()); break; diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c index b5274362..174e61d2 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c @@ -192,6 +192,7 @@ typedef union #define gvld1_u8(d, s) d.u8 = vld1_u8(s) #define gvld1_u32(d, s) d.u32 = vld1_u32((const u32 *)(s)) +#define gvld1_u64(d, s) d.u64 = vld1_u64((const u64 *)(s)) #define gvld1q_u8(d, s) d.u8 = vld1q_u8(s) #define gvld1q_u16(d, s) d.u16 = vld1q_u16(s) #define gvld1q_u32(d, s) d.u32 = vld1q_u32((const u32 *)(s)) @@ -206,6 +207,8 @@ typedef union #define gvst1_u8(v, p) \ vst1_u8(p, v.u8) +#define gvst1_u64(v, p) \ + vst1_u64((u64 *)(p), v.u64) #define gvst1q_u16(v, p) \ vst1q_u16(p, v.u16) #define gvst1q_inc_u32(v, p, i) { \ @@ -388,10 +391,14 @@ typedef union #define gvld1_u8(d, s) d.m = _mm_loadu_si64(s) #define gvld1_u32 gvld1_u8 +#define gvld1_u64 gvld1_u8 #define gvld1q_u8(d, s) d.m = _mm_loadu_si128((__m128i *)(s)) #define gvld1q_u16 gvld1q_u8 #define gvld1q_u32 gvld1q_u8 +#define gvst1_u8(v, p) _mm_storeu_si64(p, v.m) +#define gvst1_u64 gvst1_u8 + #define gvst4_4_inc_u32(v0, v1, v2, v3, p, i) { \ __m128i t0 = _mm_unpacklo_epi32(v0.m, v1.m); \ __m128i t1 = _mm_unpacklo_epi32(v2.m, v3.m); \ @@ -1401,6 +1408,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, setup_spans_set_x4(alternate, down, alternate_active); \ height -= 4; \ } while(height > 0); \ + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \ + { \ + vec_2x32u tmp; \ + gvld1_u64(tmp, &span_uvrg_offset[height - 2]); \ + gvst1_u64(tmp, &span_uvrg_offset[height - 1]); \ + } \ } \ @@ -1452,6 +1465,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, setup_spans_set_x4(alternate, up, alternate_active); \ height -= 4; \ } \ + if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \ + { \ + vec_2x32u tmp; \ + gvld1_u64(tmp, &psx_gpu->span_uvrg_offset[1]); \ + gvst1_u64(tmp, &psx_gpu->span_uvrg_offset[0]); \ + } \ } \ #define half_left lo @@ -1714,6 +1733,12 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_spans_set_x4(none, down, no); height_minor_b -= 4; } + if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) + { + vec_2x32u tmp; + gvld1_u64(tmp, &span_uvrg_offset[height_minor_b - 2]); + gvst1_u64(tmp, &span_uvrg_offset[height_minor_b - 1]); + } } } @@ -2152,6 +2177,15 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, #define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \ +#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \ + +#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \ +{ \ + u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \ + if (unlikely(psx_gpu->hacks_active & m_)) \ + setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, (void *)uvrg_offset); \ +} \ + #define setup_blocks_add_blocks_indirect() \ num_blocks += span_num_blocks; \ \ @@ -2211,6 +2245,8 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a, setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \ setup_blocks_store_draw_mask_##texturing##_##target(block, \ span_edge_data->right_mask); \ + setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \ + span_uvrg_offset); \ \ block++; \ } \