#include <stdlib.h>
#include <stdint.h>
#include <string.h>
+#include <assert.h>
#include "common.h"
#ifndef NEON_BUILD
printf("mismatch on %s %s: %x vs %x\n", #_a, #_b, _a, _b) \
-#ifndef NDEBUG
-#define setup_spans_debug_check(span_edge_data_element) \
-{ \
- u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data; \
- if (_num_spans > MAX_SPANS) \
- *(volatile int *)0 = 1; \
- if (_num_spans < psx_gpu->num_spans) \
- { \
- if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW) \
- *(volatile int *)0 = 2; \
- if(span_edge_data_element.y >= 2048) \
- *(volatile int *)0 = 3; \
- } \
-} \
-
+#if !defined(NEON_BUILD) && !defined(NDEBUG)
+static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, // debug-only sanity check of one span entry; returns nothing
+ edge_data_struct *span_edge_data_element) // entry to check; must point into psx_gpu->span_edge_data
+{
+ u32 _num_spans = span_edge_data_element - psx_gpu->span_edge_data; // index of this entry within the span array
+ if (_num_spans > MAX_SPANS) // index lies past the span limit
+ *(volatile int *)0 = 1; // deliberate write to address 0 (presumably to trap at once); the stored value 1/2/3 tells the failed checks apart
+ if (_num_spans < psx_gpu->num_spans) // the field checks below apply only to entries under the recorded span count
+ {
+ if(span_edge_data_element->num_blocks > MAX_BLOCKS_PER_ROW)
+ *(volatile int *)0 = 2; // block count for this row exceeds MAX_BLOCKS_PER_ROW
+ if(span_edge_data_element->y >= 2048)
+ *(volatile int *)0 = 3; // y is at or above 2048
+ }
+}
#else
-#define setup_spans_debug_check(span_edge_data_element) \
-
+#define setup_spans_debug_check(psx_gpu, span_edge_data_element)
#endif
#define setup_spans_prologue_alternate_yes() \
span_b_offset = psx_gpu->span_b_offset; \
\
vec_8x16u c_0x0001; \
+ vec_4x16u c_max_blocks_per_row; \
\
dup_8x16b(c_0x0001, 0x0001); \
dup_8x16b(left_edge, psx_gpu->viewport_start_x); \
dup_4x16b(c_0x04, 0x04); \
dup_4x16b(c_0x07, 0x07); \
dup_4x16b(c_0xFFFE, 0xFFFE); \
+ dup_4x16b(c_max_blocks_per_row, MAX_BLOCKS_PER_ROW); \
#define compute_edge_delta_x2() \
and_4x16b(span_shift, left_right_x_16.high, c_0x07); \
shl_variable_4x16b(span_shift, c_0xFFFE, span_shift); \
shr_4x16b(left_right_x_16.high, left_right_x_16.high, 3); \
+ min_4x16b(left_right_x_16.high, left_right_x_16.high, c_max_blocks_per_row); \
\
u32 i; \
for(i = 0; i < 4; i++) \
span_edge_data[i].num_blocks = left_right_x_16.high.e[i]; \
span_edge_data[i].right_mask = span_shift.e[i]; \
span_edge_data[i].y = y_x4.e[i]; \
- setup_spans_debug_check(span_edge_data[i]); \
+ setup_spans_debug_check(psx_gpu, &span_edge_data[i]); \
} \
\
span_edge_data += 4; \
\
setup_spans_prologue_b(); \
\
- if(height > 0) \
+ if (height > 512) \
+ height = 512; \
+ if (height > 0) \
{ \
y_x4.e[0] = y_a; \
y_x4.e[1] = y_a + 1; \
\
setup_spans_prologue_b(); \
\
- if(height > 0) \
+ if (height > 512) \
+ height = 512; \
+ if (height > 0) \
{ \
y_x4.e[0] = y_a; \
y_x4.e[1] = y_a - 1; \
setup_spans_prologue_b();
- if(height_minor_a > 0)
+ if (height_minor_a > 512)
+ height_minor_a = 512;
+ if (height_minor_a > 0)
{
y_x4.e[0] = y_a;
y_x4.e[1] = y_a - 1;
setup_spans_clip(increment, no);
}
- if(height_minor_b > 0)
+ if (height_minor_b > 512)
+ height_minor_b = 512;
+ if (height_minor_b > 0)
{
y_x4.e[0] = y_a;
y_x4.e[1] = y_a + 1;
}
}
}
+ assert(psx_gpu->span_edge_data[0].y < 1024u);
u32 render_state = flags &
(RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND |
#ifndef NEON_BUILD
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
s32 v, s32 width, s32 height, u32 color)
{
- if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE |
- RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 &&
- (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0)
- {
- setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
- return;
- }
-
u32 right_width = ((width - 1) & 0x7) + 1;
u32 right_mask_bits = (0xFF << right_width);
u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + x;
#endif
-void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
- s32 u, s32 v, s32 width, s32 height, u32 color)
+static void __attribute__((noinline))
+setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+ s32 v, s32 width, s32 height, u32 color)
{
u32 r = color & 0xFF;
u32 g = (color >> 8) & 0xFF;
u32 num_width;
- if(psx_gpu->num_blocks > MAX_BLOCKS)
+ if(psx_gpu->num_blocks)
{
flush_render_block_buffer(psx_gpu);
}
}
}
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+ s32 v, s32 width, s32 height, u32 color);
+
+void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, // hands an untextured sprite to setup_sprite_untextured_simple or, in strips, to setup_sprite_untextured_512
+ s32 v, s32 width, s32 height, u32 color)
+{
+ if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE | // simple path: none of the mask-evaluate / modulate / blend bits set in render_state ...
+ RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 &&
+ (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0) // ... and the interlace bit clear in render_mode
+ {
+ setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color); // all arguments forwarded unchanged, width not split
+ return;
+ }
+
+ while (width > 0) // otherwise split the sprite into strips of at most 512 in width
+ {
+ s32 w1 = width > 512 ? 512 : width; // width of the current strip
+ setup_sprite_untextured_512(psx_gpu, x, y, 0, 0, w1, height, color); // u and v are passed as 0 rather than forwarded
+ x += 512; // next strip starts 512 further along x
+ width -= 512; // becomes <= 0 after the final strip, which ends the loop
+ }
+}
+
#define setup_sprite_blocks_switch_textured(texture_mode) \
setup_sprite_##texture_mode \
#ifndef PSX_GPU_H
#define PSX_GPU_H
+#define MAX_SPANS 512
+#define MAX_BLOCKS 64
+#define MAX_BLOCKS_PER_ROW 128
+
+#define SPAN_DATA_BLOCKS_SIZE 32
+
+#ifndef __ASSEMBLER__
+
#include "vector_types.h"
typedef enum
vec_8x16u dither_offsets;
} block_struct;
-#define MAX_SPANS 512
-#define MAX_BLOCKS 64
-#define MAX_BLOCKS_PER_ROW 128
-
-#define SPAN_DATA_BLOCKS_SIZE 32
-
typedef struct render_block_handler_struct render_block_handler_struct;
typedef struct
const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b,
const vertex_struct * __restrict__ c);
-#endif
-
+#endif // __ASSEMBLER__
+#endif // PSX_GPU_H
static void setup_sprite_untextured_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y,\r
s32 u, s32 v, s32 width, s32 height, u32 color)\r
{\r
- setup_sprite_untextured(psx_gpu, x, y, u, v, width * 2, height * 2, color);\r
+ width *= 2; /* width and height are doubled before forwarding; x, y, u, v and color pass through unchanged */\r
+ height *= 2;\r
+ if (width > 1024) /* cap the doubled width at 1024; height is not capped here */\r
+ width = 1024;\r
+ setup_sprite_untextured(psx_gpu, x, y, u, v, width, height, color);\r
}\r
\r
#define setup_sprite_blocks_switch_textured_4x(texture_mode) \\r
* General Public License for more details.
*/
-#define MAX_SPANS 512
-#define MAX_BLOCKS 64
-#define MAX_BLOCKS_PER_ROW 128
-
-#define RENDER_STATE_MASK_EVALUATE 0x20
-#define RENDER_FLAGS_MODULATE_TEXELS 0x1
-#define RENDER_FLAGS_BLEND 0x2
#define RENDER_INTERLACE_ENABLED 0x1
+#include "psx_gpu.h"
#include "psx_gpu_offsets.h"
#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
#ifdef __MACH__
#define flush_render_block_buffer _flush_render_block_buffer
-#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
#define update_texture_8bpp_cache _update_texture_8bpp_cache
#endif
#define left_x_32_low d22
#define left_x_32_high d23
+#define tmp_max_blocks d20
+
#define edges_xy q0
#define edges_dx_dy d2
#define edge_shifts d3
str b, [span_b_offset], #4; \
setup_spans_adjust_interpolants_##direction(); \
\
+ vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \
vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
+ vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \
\
vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
\
str b, [span_b_offset], #4; \
setup_spans_adjust_interpolants_##direction(); \
\
- vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
+ vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \
vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
+ vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
+ vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \
\
vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
\
ble 1f; \
\
orr temp, y_a, y_a, lsl #16; \
+ cmp height, #512; \
add temp, temp, #(1 << 16); \
+ movgt height, #512; \
add y_a, temp, #2; \
add y_a, y_a, #(2 << 16); \
vmov y_x4, temp, y_a; \
ble 1f; \
\
orr temp, y_a, y_a, lsl #16; \
+ cmp height, #512; \
sub temp, temp, #(1 << 16); \
+ movgt height, #512; \
sub y_a, temp, #2; \
sub y_a, y_a, #(2 << 16); \
vmov y_x4, temp, y_a; \
.align 3
-function(setup_sprite_untextured)
- ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
- tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
- | RENDER_FLAGS_BLEND)
- ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
- tsteq r12, #RENDER_INTERLACE_ENABLED
- beq setup_sprite_untextured_simple
-
+function(setup_sprite_untextured_512)
stmdb sp!, { r4 - r11, r14 }
ldr width, [sp, #40]
#define gvhaddq_u16(d, a, b) d.u16 = vhaddq_u16(a.u16, b.u16)
#define gvmax_s16(d, a, b) d.s16 = vmax_s16(a.s16, b.s16)
#define gvmin_s16(d, a, b) d.s16 = vmin_s16(a.s16, b.s16)
+#define gvmin_u16(d, a, b) d.u16 = vmin_u16(a.u16, b.u16)
#define gvminq_u8(d, a, b) d.u8 = vminq_u8(a.u8, b.u8)
#define gvminq_u16(d, a, b) d.u16 = vminq_u16(a.u16, b.u16)
#define gvmla_s32(d, a, b) d.s32 = vmla_s32(d.s32, a.s32, b.s32)
}
#endif // !__SSSE3__
#ifdef __SSE4_1__
-#define gvminq_u16(d, a, b) d.m = _mm_min_epu16(a.m, b.m)
+#define gvmin_u16(d, a, b) d.m = _mm_min_epu16(a.m, b.m)
+#define gvminq_u16 gvmin_u16
#define gvmovl_u8(d, s) d.m = _mm_cvtepu8_epi16(s.m)
#define gvmovl_s8(d, s) d.m = _mm_cvtepi8_epi16(s.m)
#define gvmovl_s32(d, s) d.m = _mm_cvtepi32_epi64(s.m)
// can do this because the caller needs the msb clear
#define gvhaddq_u16(d, a, b) d.u16 = (a.u16 + b.u16) >> 1
#endif
-#ifndef gvminq_u16
-#define gvminq_u16(d, a, b) { \
+#ifndef gvmin_u16
+#define gvmin_u16(d, a, b) { /* fallback used only when no gvmin_u16 has been defined before this point */ \
gvu16 t_ = a.u16 < b.u16; /* t_ serves as a per-lane select mask; presumably all-ones where a < b - confirm for the vector type in use */ \
d.u16 = (a.u16 & t_) | (b.u16 & ~t_); /* keep a's bits where the mask is set and b's bits elsewhere */ \
}
+#define gvminq_u16 gvmin_u16
#endif
#ifndef gvmlsq_s32
#define gvmlsq_s32(d, a, b) d.s32 -= a.s32 * b.s32
span_b_offset = psx_gpu->span_b_offset; \
\
vec_8x16u c_0x0001; \
+ vec_4x16u c_max_blocks_per_row; \
\
gvdupq_n_u16(c_0x0001, 0x0001); \
gvdupq_n_u16(left_edge, psx_gpu->viewport_start_x); \
gvdup_n_u16(c_0x04, 0x04); \
gvdup_n_u16(c_0x07, 0x07); \
gvdup_n_u16(c_0xFFFE, 0xFFFE); \
+ gvdup_n_u16(c_max_blocks_per_row, MAX_BLOCKS_PER_ROW); \
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
// better encoding, remaining bits are unused anyway
gvand(span_shift, left_right_x_16_hi, c_0x07); \
setup_spans_make_span_shift(span_shift); \
gvshr_n_u16(left_right_x_16_hi, left_right_x_16_hi, 3); \
+ gvmin_u16(left_right_x_16_hi, left_right_x_16_hi, c_max_blocks_per_row); \
\
gvst4_pi_u16(left_right_x_16_lo, left_right_x_16_hi, span_shift, y_x4, \
span_edge_data); \
\
setup_spans_prologue_b(); \
\
- if(height > 0) \
+ if (height > 512) \
+ height = 512; \
+ if (height > 0) \
{ \
u64 y_x4_ = ((u64)(y_a + 3) << 48) | ((u64)(u16)(y_a + 2) << 32) \
| (u32)((y_a + 1) << 16) | (u16)y_a; \
\
setup_spans_prologue_b(); \
\
- if(height > 0) \
+ if (height > 512) \
+ height = 512; \
+ if (height > 0) \
{ \
u64 y_x4_ = ((u64)(y_a - 3) << 48) | ((u64)(u16)(y_a - 2) << 32) \
| (u32)((y_a - 1) << 16) | (u16)y_a; \
setup_spans_prologue_b();
- if(height_minor_a > 0)
+ if (height_minor_a > 512)
+ height_minor_a = 512;
+ if (height_minor_a > 0)
{
u64 y_x4_ = ((u64)(y_a - 3) << 48) | ((u64)(u16)(y_a - 2) << 32)
| (u32)((y_a - 1) << 16) | (u16)y_a;
setup_spans_clip(increment, no);
}
- if(height_minor_b > 0)
+ if (height_minor_b > 512)
+ height_minor_b = 512;
+ if (height_minor_b > 0)
{
u64 y_x4_ = ((u64)(y_a + 3) << 48) | ((u64)(u16)(y_a + 2) << 32)
| (u32)((y_a + 1) << 16) | (u16)y_a;
{
}
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
s32 v, s32 width, s32 height, u32 color)
{
- if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE |
- RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 &&
- (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0)
- {
- setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
- return;
- }
-
#if 0
- setup_sprite_untextured_(psx_gpu, x, y, u, v, width, height, color);
+ setup_sprite_untextured_512_(psx_gpu, x, y, u, v, width, height, color);
return;
#endif
u32 right_width = ((width - 1) & 0x7) + 1;
#define setup_sprite_4bpp_4x setup_sprite_4bpp_4x_
#define setup_sprite_8bpp_4x setup_sprite_8bpp_4x_
#define setup_sprite_16bpp_4x setup_sprite_16bpp_4x_
-#define setup_sprite_untextured setup_sprite_untextured_
-#define setup_sprite_untextured_simple setup_sprite_untextured_simple_
+#define setup_sprite_untextured_512 setup_sprite_untextured_512_
#define scale2x_tiles8 scale2x_tiles8_
#endif
void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
s32 width, s32 height, u32 color);
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
s32 v, s32 width, s32 height, u32 color);
-void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
- s32 u, s32 v, s32 width, s32 height, u32 color);
void scale2x_tiles8(void *dst, const void *src, int w8, int h);
#undef setup_sprite_4bpp_4x
#undef setup_sprite_8bpp_4x
#undef setup_sprite_16bpp_4x
-#undef setup_sprite_untextured
-#undef setup_sprite_untextured_simple
+#undef setup_sprite_untextured_512
#undef scale2x_tiles8
#endif
(dest).e[_i] = result; \
}) \
+#define min_4x16b(dest, source_a, source_b) /* element-wise minimum: dest.e[_i] = the smaller of source_a.e[_i] and source_b.e[_i] */ \
+ foreach_element(4, /* presumably runs the body with _i over 4 elements - confirm against foreach_element */ \
+ { \
+ s32 result = (source_a).e[_i]; /* start from a's element, held in an s32 for the comparison */ \
+ if((source_b).e[_i] < result) \
+ result = (source_b).e[_i]; /* b's element is smaller, so take it */ \
+ (dest).e[_i] = result; \
+ }) \
+
#define min_8x16b(dest, source_a, source_b) \
foreach_element(8, \
{ \