From: notaz Date: Tue, 16 Aug 2022 21:11:39 +0000 (+0300) Subject: gpu_neon: adjust some comments and things X-Git-Tag: r24l~386 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=afcb5133bfb264042e6db9f222d0e32a39e6d7ef;p=pcsx_rearmed.git gpu_neon: adjust some comments and things --- diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index 1d513d8b..51ad152d 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -22,6 +22,13 @@ #endif #include "psx_gpu_simd.h" +#if 0 +void dump_r_d(const char *name, void *dump); +void dump_r_q(const char *name, void *dump); +#define dumprd(n) dump_r_d(#n, n.e) +#define dumprq(n) dump_r_q(#n, n.e) +#endif + u32 span_pixels = 0; u32 span_pixel_blocks = 0; u32 spans = 0; @@ -769,13 +776,13 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a, { \ u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data; \ if (_num_spans > MAX_SPANS) \ - *(int *)0 = 1; \ + *(volatile int *)0 = 1; \ if (_num_spans < psx_gpu->num_spans) \ { \ if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW) \ - *(int *)0 = 1; \ - if(span_edge_data_element.y > 2048) \ - *(int *)0 = 1; \ + *(volatile int *)0 = 2; \ + if(span_edge_data_element.y >= 2048) \ + *(volatile int *)0 = 3; \ } \ } \ @@ -788,7 +795,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a, vec_2x64s alternate_x; \ vec_2x64s alternate_dx_dy; \ vec_4x32s alternate_x_32; \ - vec_2x32s alternate_x_16; \ + vec_4x16u alternate_x_16; \ \ vec_4x16u alternate_select; \ vec_4x16s y_mid_point; \ diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index da47756e..c62c1baa 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -369,8 +369,8 @@ function(compute_all_gradients) sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS) vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16 - vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift } - + vdup.u32 r_shift, r14 @ r_shift = { shift, shift*, shift, shift* } + @ * - vshl.u64: ignored by hw vadd.u32 uvrg_base, uvrgb_phase vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x) diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c index 5c05b14a..bbeccb71 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c @@ -313,7 +313,7 @@ typedef union #include #include #include -static int ccount; +static int ccount, dump_enabled; void cmpp(const char *name, const void *a_, const void *b_, size_t len) { const uint32_t *a = a_, *b = b_, masks[] = { 0, 0xff, 0xffff, 0xffffff }; @@ -336,8 +336,9 @@ void cmpp(const char *name, const void *a_, const void *b_, size_t len) void dump_r_(const char *name, void *dump, int is_q) { unsigned long long *u = dump; + if (!dump_enabled) return; //if (ccount > 1) return; - printf("%10s %016llx ", name, u[0]); + printf("%20s %016llx ", name, u[0]); if (is_q) printf("%016llx", u[1]); puts(""); @@ -497,7 +498,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, gvreg uvrg_base; gvshll_n_u16(uvrg_base, gvlo(uvrg_xxxx0), 16); // uvrg_base = uvrg0 << 16 - gvdupq_n_u32(r_shift, shift); // r_shift = { shift, shift, shift, shift } + gvdupq_n_s64(r_shift, shift); // r_shift = { shift, shift } gvaddq_u32(uvrg_base, uvrg_base, uvrgb_phase); gvabsq_s32(ga_uvrg_x, ga_uvrg_x); // ga_uvrg_x = abs(ga_uvrg_x) @@ -600,7 +601,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu, vec_2x64s alternate_x; \ vec_2x64s alternate_dx_dy; \ vec_4x32s alternate_x_32; \ - vec_2x32s alternate_x_16; \ + vec_4x16u alternate_x_16; \ \ vec_4x16u alternate_select; \ vec_4x16s y_mid_point; \ diff --git a/plugins/gpu_neon/psx_gpu/vector_ops.h b/plugins/gpu_neon/psx_gpu/vector_ops.h index 189eb79d..6f2bcbf7 100644 --- a/plugins/gpu_neon/psx_gpu/vector_ops.h +++ b/plugins/gpu_neon/psx_gpu/vector_ops.h @@ -103,7 +103,7 @@ foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift)) \ #define shr_4x16b(dest, source, shift) \ - foreach_element(4, (dest).e[_i] = (source).e[_i] >> (shift)) \ + foreach_element(4, (dest).e[_i] = (u16)(source).e[_i] >> (shift)) \ #define shl_4x16b(dest, source, shift) \ foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift)) \