gpu_neon: rework enh. res. texturing hack
authornotaz <notasas@gmail.com>
Tue, 20 Aug 2024 20:49:45 +0000 (23:49 +0300)
committernotaz <notasas@gmail.com>
Thu, 22 Aug 2024 23:15:52 +0000 (02:15 +0300)
libretro/pcsx_rearmed#841

plugins/gpu_neon/psx_gpu/common.h
plugins/gpu_neon/psx_gpu/psx_gpu.c
plugins/gpu_neon/psx_gpu/psx_gpu.h
plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h
plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c
plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
plugins/gpu_neon/psx_gpu/psx_gpu_simd.c

index 820dfbe..5881e2a 100644 (file)
@@ -9,7 +9,5 @@
 #include "vector_types.h"
 #include "psx_gpu.h"
 
-#define unlikely(x) __builtin_expect((x), 0)
-
 #endif
 
index a59e9cd..19f1c19 100644 (file)
@@ -560,8 +560,9 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu)
   y##set##_b.e[1] = vertex->b                                                  \
   
 
-void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
- vertex_struct *b, vertex_struct *c)
+void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
+ const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b,
+ const vertex_struct * __restrict__ c)
 {
   u32 triangle_area = psx_gpu->triangle_area;
   u32 winding_mask_scalar;
@@ -1163,6 +1164,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu,
       setup_spans_set_x4(alternate, down, alternate_active);                   \
       height -= 4;                                                             \
     } while(height > 0);                                                       \
+    if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))   \
+      span_uvrg_offset[height - 1].low = span_uvrg_offset[height - 2].low;     \
   }                                                                            \
 
 
@@ -1216,6 +1219,8 @@ static void setup_spans_debug_check(psx_gpu_struct *psx_gpu,
       setup_spans_set_x4(alternate, up, alternate_active);                     \
       height -= 4;                                                             \
     }                                                                          \
+    if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V)                           \
+      psx_gpu->span_uvrg_offset[0].low = psx_gpu->span_uvrg_offset[1].low;     \
   }                                                                            \
 
 #define index_left  0
@@ -1452,6 +1457,11 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
       setup_spans_set_x4(none, down, no);
       height_minor_b -= 4;
     }
+    if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))
+    {
+      span_uvrg_offset[height_minor_b - 1].low =
+        span_uvrg_offset[height_minor_b - 2].low;
+    }
   }
 
   left_split_triangles++;
@@ -1459,6 +1469,41 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
 
 #endif
 
+// this is some hacky mess, can this be improved somehow?
+// ideally change things to not have to do this hack at all
+void __attribute__((noinline))
+setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block,
+    edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset)
+{
+  size_t span_i = span_uvrg_offset - psx_gpu->span_uvrg_offset;
+  if (span_i != 0 && span_i != psx_gpu->num_spans - 1
+      && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U))
+    return;
+  u32 num_blocks = span_edge_data->num_blocks - 1;
+  s32 offset = __builtin_ctz(span_edge_data->right_mask | 0x100) - 1;
+  s32 toffset = 8 * num_blocks + offset - 1;
+  if (toffset < 0 && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U))
+    return;
+
+  toffset += span_edge_data->left_x;
+  s32 u_dx = psx_gpu->uvrg_dx.low.e[0];
+  s32 v_dx = psx_gpu->uvrg_dx.low.e[1];
+  u32 u = span_uvrg_offset->low.e[0];
+  u32 v = span_uvrg_offset->low.e[1];
+  u += u_dx * toffset;
+  v += v_dx * toffset;
+  u = (u >> 16) & psx_gpu->texture_mask_width;
+  v = (v >> 16) & psx_gpu->texture_mask_height;
+  if (!(psx_gpu->render_state_base & (TEXTURE_MODE_16BPP << 8))) {
+    // 4bpp 8bpp are swizzled
+    u32 u_ = u;
+    u = (u & 0x0f) | ((v & 0x0f) << 4);
+    v = (v & 0xf0) | (u_ >> 4);
+  }
+  assert(offset >= 0);
+  //assert(block->uv.e[offset] == ((v << 8) | u));
+  block->uv.e[offset] = (v << 8) | u;
+}
 
 #define dither_table_entry_normal(value)                                       \
   (value)                                                                      \
@@ -1868,6 +1913,14 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
 
 #define setup_blocks_store_draw_mask_untextured_direct(_block, bits)           \
 
+#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset)    \
+
+#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset)      \
+{                                                                              \
+  u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V;                          \
+  if (unlikely(psx_gpu->hacks_active & m_))                                    \
+    setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, uvrg_offset);         \
+}                                                                              \
 
 #define setup_blocks_add_blocks_indirect()                                     \
   num_blocks += span_num_blocks;                                               \
@@ -1938,6 +1991,8 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target(     \
       setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \
       setup_blocks_store_draw_mask_##texturing##_##target(block,               \
        span_edge_data->right_mask);                                            \
+      setup_blocks_uv_adj_hack_##texturing(block, span_edge_data,              \
+          span_uvrg_offset);                                                   \
                                                                                \
       block++;                                                                 \
     }                                                                          \
@@ -5016,8 +5071,10 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN;
 
   psx_gpu->saved_hres = 256;
+  psx_gpu->hacks_active = 0;
 
-  // check some offset
+  // check some offsets, asm relies on these
+  psx_gpu->reserved_a[(offsetof(psx_gpu_struct, test_mask) == 0) - 1] = 0;
   psx_gpu->reserved_a[(offsetof(psx_gpu_struct, blocks) == psx_gpu_blocks_offset) - 1] = 0;
 }
 
index 6964a62..2d0f7b1 100644 (file)
 
 #define SPAN_DATA_BLOCKS_SIZE 32
 
+#define AHACK_TEXTURE_ADJ_U   (1 << 0)
+#define AHACK_TEXTURE_ADJ_V   (1 << 1)
+
 #ifndef __ASSEMBLER__
 
 #include "vector_types.h"
 
+#ifndef unlikely
+#define unlikely(x) __builtin_expect((x), 0)
+#endif
+
 typedef enum
 {
   PRIMITIVE_TYPE_TRIANGLE = 0,
@@ -189,6 +196,7 @@ typedef struct
   // enhancement stuff
   u16 *enhancement_buf_ptr;          // main alloc
   u16 *enhancement_current_buf_ptr;  // offset into above, 4 bufs
+  u32 hacks_active;                  // AHACK_TEXTURE_ADJ_U ...
   u32 saved_hres;
   s16 saved_viewport_start_x;
   s16 saved_viewport_start_y;
@@ -205,7 +213,7 @@ typedef struct
 
   // Align up to 64 byte boundary to keep the upcoming buffers cache line
   // aligned, also make reachable with single immediate addition
-  u8 reserved_a[184 + 9*4 - 9*sizeof(void *)];
+  u8 reserved_a[184 + 8*4 - 9*sizeof(void *)];
 
   // 8KB
   block_struct blocks[MAX_BLOCKS_PER_ROW];
@@ -257,6 +265,9 @@ u32 texture_region_mask(s32 x1, s32 y1, s32 x2, s32 y2);
 void update_texture_8bpp_cache(psx_gpu_struct *psx_gpu);
 void flush_render_block_buffer(psx_gpu_struct *psx_gpu);
 
+void setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block,
+    edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset);
+
 void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram);
 u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
  s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command);
index 1ba562b..8273885 100644 (file)
 #ifdef __MACH__
 #define flush_render_block_buffer _flush_render_block_buffer
 #define update_texture_8bpp_cache _update_texture_8bpp_cache
+#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack
 #endif
 
 @ r0: psx_gpu
@@ -543,6 +544,7 @@ function(compute_all_gradients)
 
 #define uvrg                                     q14
 #define uvrg_dy                                  q15
+#define uv                                       d28
 
 #define alternate_x_16                           d4
 
@@ -925,6 +927,14 @@ function(compute_all_gradients)
   subs height, height, #4;                                                     \
   bhi 2b;                                                                      \
                                                                                \
+  nop;                                                                         \
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
+  tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);                      \
+  beq 1f;                                                                      \
+  add temp, span_uvrg_offset, height, lsl #4;                                  \
+  vldr uv, [temp, #(-16*2)];                                                   \
+  vstr uv, [temp, #(-16)];                                                     \
+                                                                               \
  1:                                                                            \
 
 
@@ -986,6 +996,14 @@ function(compute_all_gradients)
   subs height, height, #4;                                                     \
   bhi 2b;                                                                      \
                                                                                \
+  nop;                                                                         \
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
+  tst temp, #AHACK_TEXTURE_ADJ_V;                                              \
+  beq 1f;                                                                      \
+  add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset;                         \
+  vldr uv, [temp, #16];                                                        \
+  vstr uv, [temp, #0];                                                         \
+                                                                               \
  1:                                                                            \
 
 
@@ -1216,6 +1234,14 @@ function(setup_spans_up_down)
   subs height_minor_b, height_minor_b, #4
   bhi 2b
 
+  nop
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]
+  tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)
+  beq 1f
+  add temp, span_uvrg_offset, height, lsl #4
+  vldr uv, [temp, #(-16*2)]
+  vstr uv, [temp, #(-16)]
+
  1:
   setup_spans_epilogue()
 
@@ -1256,6 +1282,7 @@ function(setup_spans_up_down)
 
 #define uvrg_dx_ptr                              r2
 #define texture_mask_ptr                         r3
+#define hacks_active                             r6
 #define dither_shift                             r8
 #define dither_row                               r10
 
@@ -1273,6 +1300,7 @@ function(setup_spans_up_down)
 #define color_b                                  r5
 
 #undef uvrg
+#undef uv
 
 #define u_block                                  q0
 #define v_block                                  q1
@@ -1350,6 +1378,26 @@ function(setup_spans_up_down)
 
 #define setup_blocks_texture_unswizzled()                                      \
 
+#define setup_blocks_uv_adj_hack_textured(hacks_active)                        \
+  tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);              \
+  beq 91f;                                                                     \
+  /* see flush_render_block_buffer below for a reg saving note */              \
+  vpush { texture_mask };                                                      \
+  vpush { uvrg_dx4 };                                                          \
+                                                                               \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                         \
+  mov r12, span_uvrg_offset;                                                   \
+  sub r1, block_ptr_a, #64;                                                    \
+  mov r2, span_edge_data;                                                      \
+  mov r3, r12;                                                                 \
+  bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */                                \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                         \
+                                                                               \
+  vpop { uvrg_dx4 };                                                           \
+  vpop { texture_mask };                                                       \
+  vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
+91:                                                                            \
+
 
 #define setup_blocks_shaded_textured_builder(swizzling)                        \
 .align 3;                                                                      \
@@ -1575,6 +1623,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
   vdup.u8 draw_mask, right_mask;                                               \
                                                                                \
+  ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
   vzip.u8 u_whole_8, v_whole_8;                                                \
@@ -1585,6 +1634,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
                                                                                \
+  setup_blocks_uv_adj_hack_textured(hacks_active);                             \
+                                                                               \
  1:                                                                            \
   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
   add span_b_offset, span_b_offset, #4;                                        \
@@ -1599,7 +1650,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
+  /* this callee-save reg saving may look unnecessary but it actually is */    \
+  /* because the callee violates the ABI */                                    \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
@@ -1776,6 +1828,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
   vdup.u8 draw_mask, right_mask;                                               \
                                                                                \
+  ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
   vzip.u8 u_whole_8, v_whole_8;                                                \
@@ -1786,6 +1839,8 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
                                                                                \
+  setup_blocks_uv_adj_hack_textured(hacks_active);                             \
+                                                                               \
  1:                                                                            \
   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
   add span_edge_data, span_edge_data, #8;                                      \
@@ -1798,7 +1853,6 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
@@ -2334,7 +2388,6 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
   vpush { rg_dx4 };                                                            \
                                                                                \
   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
index 2f8a646..7c21d31 100644 (file)
@@ -36,6 +36,7 @@
 #define psx_gpu_texture_mask_width_offset                 0xfa
 #define psx_gpu_texture_mask_height_offset                0xfb
 #define psx_gpu_reciprocal_table_ptr_offset               0x108
+#define psx_gpu_hacks_active_offset                       0x114
 #define psx_gpu_blocks_offset                             0x200
 #define psx_gpu_span_uvrg_offset_offset                   0x2200
 #define psx_gpu_span_edge_data_offset                     0x4200
index 9b37848..740df98 100644 (file)
@@ -76,6 +76,7 @@ int main()
        //WRITE_OFFSET(f, clut_settings);
        //WRITE_OFFSET(f, texture_settings);
        WRITE_OFFSET(f, reciprocal_table_ptr);
+       WRITE_OFFSET(f, hacks_active);
        WRITE_OFFSET(f, blocks);
        WRITE_OFFSET(f, span_uvrg_offset);
        WRITE_OFFSET(f, span_edge_data);
index 53f33e4..f398695 100644 (file)
@@ -903,6 +903,7 @@ static void select_enhancement_buf(psx_gpu_struct *psx_gpu)
   psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \
   psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \
   psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \
+  psx_gpu->hacks_active = 0; \
   psx_gpu->uvrgb_phase = 0x8000; \
 }
 
@@ -917,7 +918,7 @@ static int enhancement_enable(psx_gpu_struct *psx_gpu)
   psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1;
   if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024)
     psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023;
-  psx_gpu->uvrgb_phase = 0x7fff;
+  //psx_gpu->uvrgb_phase = 0x7fff;
   return 1;
 }
 
@@ -1018,73 +1019,29 @@ static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y)
   return 1;
 }
 
-static int is_in_array(int val, int array[], int len)
+static u32 uv_hack(psx_gpu_struct *psx_gpu, const vertex_struct *vertex_ptrs)
 {
-  int i;
-  for (i = 0; i < len; i++)
-    if (array[i] == val)
-      return 1;
-  return 0;
-}
-
-static int make_members_unique(int array[], int len)
-{
-  int i, j;
-  for (i = j = 1; i < len; i++)
-    if (!is_in_array(array[i], array, j))
-      array[j++] = array[i];
-
-  if (array[0] > array[1]) {
-    i = array[0]; array[0] = array[1]; array[1] = i;
-  }
-  return j;
-}
-
-static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
-  int i;
-  for (i = 0; i < count; i++)
-    if (vertex_ptrs[i].u == old)
-      vertex_ptrs[i].u = new;
-}
-
-static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
-  int i;
-  for (i = 0; i < count; i++)
-    if (vertex_ptrs[i].v == old)
-      vertex_ptrs[i].v = new;
-}
-
-// this sometimes does more harm than good, like in PE2
-static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count)
-{
-  int i, u[4], v[4];
-
-  for (i = 0; i < vertex_count; i++) {
-    u[i] = vertex_ptrs[i].u;
-    v[i] = vertex_ptrs[i].v;
-  }
-  if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) {
-    if ((u[0] & 7) == 7) {
-      patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1);
-      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]);
-    }
-    else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) {
-      patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1);
-      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1);
-    }
-  }
-  if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) {
-    if ((v[0] & 7) == 7) {
-      patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1);
-      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]);
-    }
-    else if ((v[1] & 7) == 0) {
-      patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1);
-      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1);
-    }
+  int i, have_right_edge = 0, have_bottom_edge = 0, bad_u = 0, bad_v = 0;
+  u32 hacks = 0;
+
+  for (i = 0; i < 3; i++) {
+    int j = (i + 1) % 3, k = (i + 2) % 3;
+    int du = abs((int)vertex_ptrs[i].u - (int)vertex_ptrs[j].u);
+    int dv = abs((int)vertex_ptrs[i].v - (int)vertex_ptrs[j].v);
+    if (du && (du & 7) != 7)
+      bad_u = 1;
+    if (dv && (dv & 7) != 7)
+      bad_v = 1;
+    if (vertex_ptrs[i].x == vertex_ptrs[j].x && vertex_ptrs[k].x < vertex_ptrs[j].x)
+      have_right_edge = 1;
+    if (vertex_ptrs[i].y == vertex_ptrs[j].y)// && vertex_ptrs[k].y < vertex_ptrs[j].y)
+      have_bottom_edge = 1;
   }
+  if (have_right_edge && bad_u)
+    hacks |= AHACK_TEXTURE_ADJ_U;
+  if (have_bottom_edge && bad_v)
+    hacks |= AHACK_TEXTURE_ADJ_V;
+  return hacks;
 }
 
 static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
@@ -1104,6 +1061,8 @@ static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
   if (!enhancement_enable(psx_gpu))
     return;
 
+  if ((current_command & RENDER_FLAGS_TEXTURE_MAP) && psx_gpu->hack_texture_adj)
+    psx_gpu->hacks_active |= uv_hack(psx_gpu, vertexes);
   shift_vertices3(vertex_ptrs);
   shift_triangle_area();
   render_triangle_p(psx_gpu, vertex_ptrs, current_command);
@@ -1314,8 +1273,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv(2, 10);  
         get_vertex_data_xy_uv(3, 14);
   
-        if (psx_gpu->hack_texture_adj)
-          uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
         break;
@@ -1368,8 +1325,6 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv_rgb(2, 12);
         get_vertex_data_xy_uv_rgb(3, 18);
 
-        if (psx_gpu->hack_texture_adj)
-          uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
         break;
index b527436..174e61d 100644 (file)
@@ -192,6 +192,7 @@ typedef union
 
 #define gvld1_u8(d, s)           d.u8  = vld1_u8(s)
 #define gvld1_u32(d, s)          d.u32 = vld1_u32((const u32 *)(s))
+#define gvld1_u64(d, s)          d.u64 = vld1_u64((const u64 *)(s))
 #define gvld1q_u8(d, s)          d.u8  = vld1q_u8(s)
 #define gvld1q_u16(d, s)         d.u16 = vld1q_u16(s)
 #define gvld1q_u32(d, s)         d.u32 = vld1q_u32((const u32 *)(s))
@@ -206,6 +207,8 @@ typedef union
 
 #define gvst1_u8(v, p) \
   vst1_u8(p, v.u8)
+#define gvst1_u64(v, p) \
+  vst1_u64((u64 *)(p), v.u64)
 #define gvst1q_u16(v, p) \
   vst1q_u16(p, v.u16)
 #define gvst1q_inc_u32(v, p, i) { \
@@ -388,10 +391,14 @@ typedef union
 
 #define gvld1_u8(d, s)           d.m = _mm_loadu_si64(s)
 #define gvld1_u32                gvld1_u8
+#define gvld1_u64                gvld1_u8
 #define gvld1q_u8(d, s)          d.m = _mm_loadu_si128((__m128i *)(s))
 #define gvld1q_u16               gvld1q_u8
 #define gvld1q_u32               gvld1q_u8
 
+#define gvst1_u8(v, p)           _mm_storeu_si64(p, v.m)
+#define gvst1_u64                gvst1_u8
+
 #define gvst4_4_inc_u32(v0, v1, v2, v3, p, i) { \
   __m128i t0 = _mm_unpacklo_epi32(v0.m, v1.m); \
   __m128i t1 = _mm_unpacklo_epi32(v2.m, v3.m); \
@@ -1401,6 +1408,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
       setup_spans_set_x4(alternate, down, alternate_active);                   \
       height -= 4;                                                             \
     } while(height > 0);                                                       \
+    if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))   \
+    {                                                                          \
+      vec_2x32u tmp;                                                           \
+      gvld1_u64(tmp, &span_uvrg_offset[height - 2]);                           \
+      gvst1_u64(tmp, &span_uvrg_offset[height - 1]);                           \
+    }                                                                          \
   }                                                                            \
 
 
@@ -1452,6 +1465,12 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
       setup_spans_set_x4(alternate, up, alternate_active);                     \
       height -= 4;                                                             \
     }                                                                          \
+    if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V)                           \
+    {                                                                          \
+      vec_2x32u tmp;                                                           \
+      gvld1_u64(tmp, &psx_gpu->span_uvrg_offset[1]);                           \
+      gvst1_u64(tmp, &psx_gpu->span_uvrg_offset[0]);                           \
+    }                                                                          \
   }                                                                            \
 
 #define half_left  lo
@@ -1714,6 +1733,12 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
       setup_spans_set_x4(none, down, no);
       height_minor_b -= 4;
     }
+    if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))
+    {
+      vec_2x32u tmp;
+      gvld1_u64(tmp, &span_uvrg_offset[height_minor_b - 2]);
+      gvst1_u64(tmp, &span_uvrg_offset[height_minor_b - 1]);
+    }
   }
 }
 
@@ -2152,6 +2177,15 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
 
 #define setup_blocks_store_draw_mask_untextured_direct(_block, bits)           \
 
+#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset)    \
+
+#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset)      \
+{                                                                              \
+  u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V;                          \
+  if (unlikely(psx_gpu->hacks_active & m_))                                    \
+    setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, (void *)uvrg_offset); \
+}                                                                              \
+
 #define setup_blocks_add_blocks_indirect()                                     \
   num_blocks += span_num_blocks;                                               \
                                                                                \
@@ -2211,6 +2245,8 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
       setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \
       setup_blocks_store_draw_mask_##texturing##_##target(block,               \
        span_edge_data->right_mask);                                            \
+      setup_blocks_uv_adj_hack_##texturing(block, span_edge_data,              \
+          span_uvrg_offset);                                                   \
                                                                                \
       block++;                                                                 \
     }                                                                          \