From: notaz <notasas@gmail.com>
Date: Tue, 16 Aug 2022 21:11:39 +0000 (+0300)
Subject: gpu_neon: adjust some comments and things
X-Git-Tag: r24l~386
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=afcb5133bfb264042e6db9f222d0e32a39e6d7ef;p=pcsx_rearmed.git

gpu_neon: adjust some comments and things
---

diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c
index 1d513d8b..51ad152d 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c
@@ -22,6 +22,13 @@
 #endif
 #include "psx_gpu_simd.h"
 
+#if 0
+void dump_r_d(const char *name, void *dump);
+void dump_r_q(const char *name, void *dump);
+#define dumprd(n) dump_r_d(#n, n.e)
+#define dumprq(n) dump_r_q(#n, n.e)
+#endif
+
 u32 span_pixels = 0;
 u32 span_pixel_blocks = 0;
 u32 spans = 0;
@@ -769,13 +776,13 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
 {                                                                              \
   u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data;          \
   if (_num_spans > MAX_SPANS)                                                  \
-    *(int *)0 = 1;                                                             \
+    *(volatile int *)0 = 1;                                                    \
   if (_num_spans < psx_gpu->num_spans)                                         \
   {                                                                            \
     if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW)                 \
-      *(int *)0 = 1;                                                           \
-    if(span_edge_data_element.y > 2048)                                        \
-      *(int *)0 = 1;                                                           \
+      *(volatile int *)0 = 2;                                                  \
+    if(span_edge_data_element.y >= 2048)                                       \
+      *(volatile int *)0 = 3;                                                  \
   }                                                                            \
 }                                                                              \
 
@@ -788,7 +795,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   vec_2x64s alternate_x;                                                       \
   vec_2x64s alternate_dx_dy;                                                   \
   vec_4x32s alternate_x_32;                                                    \
-  vec_2x32s alternate_x_16;                                                    \
+  vec_4x16u alternate_x_16;                                                    \
                                                                                \
   vec_4x16u alternate_select;                                                  \
   vec_4x16s y_mid_point;                                                       \
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
index da47756e..c62c1baa 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
@@ -369,8 +369,8 @@ function(compute_all_gradients)
   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
 
   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
-  vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
-
+  vdup.u32 r_shift, r14              @ r_shift = { shift, shift*, shift, shift* }
+                                     @ * - vshl.u64: ignored by hw
   vadd.u32 uvrg_base, uvrgb_phase
   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
 
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c
index 5c05b14a..bbeccb71 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c
@@ -313,7 +313,7 @@ typedef union
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-static int ccount;
+static int ccount, dump_enabled;
 void cmpp(const char *name, const void *a_, const void *b_, size_t len)
 {
   const uint32_t *a = a_, *b = b_, masks[] = { 0, 0xff, 0xffff, 0xffffff };
@@ -336,8 +336,9 @@ void cmpp(const char *name, const void *a_, const void *b_, size_t len)
 void dump_r_(const char *name, void *dump, int is_q)
 {
   unsigned long long *u = dump;
+  if (!dump_enabled) return;
   //if (ccount > 1) return;
-  printf("%10s %016llx ", name, u[0]);
+  printf("%20s %016llx ", name, u[0]);
   if (is_q)
     printf("%016llx", u[1]);
   puts("");
@@ -497,7 +498,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
 
   gvreg uvrg_base;
   gvshll_n_u16(uvrg_base, gvlo(uvrg_xxxx0), 16); // uvrg_base = uvrg0 << 16
-  gvdupq_n_u32(r_shift, shift);         // r_shift = { shift, shift, shift, shift }
+  gvdupq_n_s64(r_shift, shift);         // r_shift = { shift, shift }
 
   gvaddq_u32(uvrg_base, uvrg_base, uvrgb_phase);
   gvabsq_s32(ga_uvrg_x, ga_uvrg_x);     // ga_uvrg_x = abs(ga_uvrg_x)
@@ -600,7 +601,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
   vec_2x64s alternate_x;                                                       \
   vec_2x64s alternate_dx_dy;                                                   \
   vec_4x32s alternate_x_32;                                                    \
-  vec_2x32s alternate_x_16;                                                    \
+  vec_4x16u alternate_x_16;                                                    \
                                                                                \
   vec_4x16u alternate_select;                                                  \
   vec_4x16s y_mid_point;                                                       \
diff --git a/plugins/gpu_neon/psx_gpu/vector_ops.h b/plugins/gpu_neon/psx_gpu/vector_ops.h
index 189eb79d..6f2bcbf7 100644
--- a/plugins/gpu_neon/psx_gpu/vector_ops.h
+++ b/plugins/gpu_neon/psx_gpu/vector_ops.h
@@ -103,7 +103,7 @@
   foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift))            \
 
 #define shr_4x16b(dest, source, shift)                                         \
-  foreach_element(4, (dest).e[_i] = (source).e[_i] >> (shift))                 \
+  foreach_element(4, (dest).e[_i] = (u16)(source).e[_i] >> (shift))            \
 
 #define shl_4x16b(dest, source, shift)                                         \
   foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift))            \