#endif
#include "psx_gpu_simd.h"
+#if 0 /* debug-only register dump helpers; compiled out while this is 0 */
+void dump_r_d(const char *name, void *dump); /* presumably dumps a 64-bit (d) value -- confirm at the definition */
+void dump_r_q(const char *name, void *dump); /* presumably dumps a 128-bit (q) value -- confirm at the definition */
+#define dumprd(n) dump_r_d(#n, n.e) /* passes the variable's name as a string plus its .e element array */
+#define dumprq(n) dump_r_q(#n, n.e) /* same as dumprd, but routed to dump_r_q */
+#endif
+
u32 span_pixels = 0;
u32 span_pixel_blocks = 0;
u32 spans = 0;
{ \
u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data; \
if (_num_spans > MAX_SPANS) \
- *(int *)0 = 1; \
+ *(volatile int *)0 = 1; \
if (_num_spans < psx_gpu->num_spans) \
{ \
if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW) \
- *(int *)0 = 1; \
- if(span_edge_data_element.y > 2048) \
- *(int *)0 = 1; \
+ *(volatile int *)0 = 2; \
+ if(span_edge_data_element.y >= 2048) \
+ *(volatile int *)0 = 3; \
} \
} \
vec_2x64s alternate_x; \
vec_2x64s alternate_dx_dy; \
vec_4x32s alternate_x_32; \
- vec_2x32s alternate_x_16; \
+ vec_4x16u alternate_x_16; \
\
vec_4x16u alternate_select; \
vec_4x16s y_mid_point; \
sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
- vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
-
+ vdup.u32 r_shift, r14 @ r_shift = { shift, shift*, shift, shift* }
+ @ * - odd 32-bit lanes are ignored by vshl.u64 (hw reads only the low byte of each 64-bit lane)
vadd.u32 uvrg_base, uvrgb_phase
vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
-static int ccount;
+static int ccount, dump_enabled;
void cmpp(const char *name, const void *a_, const void *b_, size_t len)
{
const uint32_t *a = a_, *b = b_, masks[] = { 0, 0xff, 0xffff, 0xffffff };
void dump_r_(const char *name, void *dump, int is_q)
{
unsigned long long *u = dump;
+ if (!dump_enabled) return;
//if (ccount > 1) return;
- printf("%10s %016llx ", name, u[0]);
+ printf("%20s %016llx ", name, u[0]);
if (is_q)
printf("%016llx", u[1]);
puts("");
gvreg uvrg_base;
gvshll_n_u16(uvrg_base, gvlo(uvrg_xxxx0), 16); // uvrg_base = uvrg0 << 16
- gvdupq_n_u32(r_shift, shift); // r_shift = { shift, shift, shift, shift }
+ gvdupq_n_s64(r_shift, shift); // r_shift = { shift, shift }
gvaddq_u32(uvrg_base, uvrg_base, uvrgb_phase);
gvabsq_s32(ga_uvrg_x, ga_uvrg_x); // ga_uvrg_x = abs(ga_uvrg_x)
vec_2x64s alternate_x; \
vec_2x64s alternate_dx_dy; \
vec_4x32s alternate_x_32; \
- vec_2x32s alternate_x_16; \
+ vec_4x16u alternate_x_16; \
\
vec_4x16u alternate_select; \
vec_4x16s y_mid_point; \
foreach_element(2, (dest).e[_i] = (u32)(source).e[_i] >> (shift)) \
#define shr_4x16b(dest, source, shift) \
- foreach_element(4, (dest).e[_i] = (source).e[_i] >> (shift)) \
+ foreach_element(4, (dest).e[_i] = (u16)(source).e[_i] >> (shift)) /* u16 cast: value is non-negative before >>, so zeros are shifted in */ \
#define shl_4x16b(dest, source, shift) \
foreach_element(4, (dest).e[_i] = (u32)(source).e[_i] << (shift)) \