#define MAX_BLOCKS 64
#define MAX_BLOCKS_PER_ROW 128
+#define RENDER_STATE_MASK_EVALUATE 0x20
+#define RENDER_FLAGS_MODULATE_TEXELS 0x1
+#define RENDER_FLAGS_BLEND 0x2
+#define RENDER_INTERLACE_ENABLED 0x1
+
#include "psx_gpu_offsets.h"
#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
#define edge_data_right_mask_offset 4
#define edge_data_y_offset 6
+.syntax unified
+.text
#define psx_gpu r0
#define v_a r1
.align 4
-/* FIXME: users of this should be in psx_gpu instead */
-#ifndef __PIC__
-#define load_pointer(register, pointer) \
- movw register, :lower16:pointer; \
- movt register, :upper16:pointer; \
-
-#else
-#define load_pointer(register, pointer) \
- ldr register, =pointer \
-
-#endif
-
#define function(name) \
.global name; \
name: \
vld1.32 { uvrg }, [ temp ]; \
add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
vld1.32 { uvrg_dy }, [ temp ]; \
- load_pointer(reciprocal_table_ptr, reciprocal_table); \
+ ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
\
vmov.u32 c_0x01, #0x01 \
#define height_b_alt r12
#define compute_edge_delta_x3(start_c, height_a, height_b) \
- vmov.u32 heights, height_a, height_b; \
+ vmov heights, height_a, height_b; \
ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
vmov.u32 edge_shifts[0], temp; \
ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
add temp, temp, #(1 << 16); \
add y_a, temp, #2; \
add y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
right_index); \
sub temp, temp, #(1 << 16); \
sub y_a, temp, #2; \
sub y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
\
sub height, y_a, y_c; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_up(major, minor, minor, yes); \
function(setup_spans_up_right)
setup_spans_up_up(right, left)
-.pool
-
#define setup_spans_down_down(minor, major) \
setup_spans_prologue(); \
sub height_minor_a, y_b, y_a; \
sub height, y_c, y_a; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_down(major, minor, minor, yes); \
function(setup_spans_up_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_up_flat()
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_up_flat()
function(setup_spans_down_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_down_flat()
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_down_flat()
sub height_minor_b, y_c, y_a
sub height_major, y_c, y_b
- vmov.u32 x_starts, x_a, x_c
+ vmov x_starts, x_a, x_c
vdup.u32 x_ends, x_b
compute_edge_delta_x3(x_a, height_minor_a, height_major)
mov temp, #0
- vmov.u32 height_increment, temp, height_minor_b
+ vmov height_increment, temp, height_minor_b
vmlal.s32 edges_xy, edges_dx_dy, height_increment
vmov edges_xy_b_left, edge_alt_low, edge_alt_high
sub temp, temp, #(1 << 16)
sub y_a, temp, #2
sub y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
vaddw.s32 edges_xy, edges_xy, edges_dx_dy
add temp, temp, #(1 << 16)
add y_a, temp, #2
add y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
setup_spans_adjust_edges_alternate_no(left, right)
ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
add temp, temp, height_minor_b
+
+ cmp temp, #MAX_SPANS
+ beq 5f
+
strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
2:
setup_spans_prologue_b()
bal 4b
-.pool
+ 5:
+ // FIXME: overflow corner case
+ sub temp, temp, height_minor_b
+ bics height_minor_b, #3
+ add temp, temp, height_minor_b
+ strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
+ bne 2b
+ bal 1b
#undef span_uvrg_offset
#undef span_edge_data
vdup.u16 colors, color
add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
- orr color, color, lsl #16
+ orr color, color, color, lsl #16
0:
moveq right_mask, right_mask, lsr #2
tst right_mask, #0x1
- streqh color, [ fb_ptr ]
+ strheq color, [ fb_ptr ]
1:
add span_edge_data, span_edge_data, #8
orr pixels_a, pixels_a, pixel_3, lsl #24
orr pixels_b, pixels_b, pixel_7, lsl #24
- vmov.u32 texels, pixels_a, pixels_b
+ vmov texels, pixels_a, pixels_b
vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
[ draw_mask_bits_ptr, :16 ], c_64
vbif.u16 fb_pixels, pixels, draw_mask_combined
- vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
-
sub fb_ptr_cmp, fb_ptr_next, fb_ptr
+ pld [ fb_ptr_next, #64 ]
+
add fb_ptr_cmp, fb_ptr_cmp, #14
+ vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
+
cmp fb_ptr_cmp, #28
bls 4f
vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
vand.u16 pixels_mg, pixels, d128_0x83E0; \
\
- vbit.u16 blend_pixels, fb_pixels, draw_mask; \
- vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
+ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
+ pld [ fb_ptr_next, #64 ]; \
\
sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
+ vbit.u16 blend_pixels, fb_pixels, draw_mask; \
+ \
add fb_ptr_cmp, fb_ptr_cmp, #14; \
+ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
+ \
cmp fb_ptr_cmp, #28; \
bls 2f; \
\
beq 1f
0:
+ vorr.u16 pixels, pixels, msb_mask
vorr.u16 draw_mask, draw_mask, write_mask
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [ fb_ptr ]
bne 0b
1:
+ vorr.u16 pixels, pixels, msb_mask
vorr.u16 draw_mask, draw_mask, write_mask
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [ fb_ptr ]
mov fb_ptr_advance_column, #32; \
vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
\
- sub fb_ptr_advance_column, height, lsl #11; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
#define setup_sprite_setup_right_draw_mask_fb_ptr() \
draw_mask_fb_ptr_left_b); \
\
add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
- add fb_ptr, fb_ptr, #16*2; \
+ pld [ fb_ptr, #2048 ]; \
\
vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
- vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
+ add fb_ptr, fb_ptr, #16*2; \
\
- pld [ fb_ptr ]; \
+ vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
\
vzip.8 texels_low, texels_high; \
do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
draw_mask_fb_ptr_##edge##_b); \
\
+ pld [ fb_ptr, #2048 ]; \
add fb_ptr, fb_ptr, #2048 * 2; \
- subs sub_tile_height, sub_tile_height, #1; \
\
+ subs sub_tile_height, sub_tile_height, #1; \
bne 4b; \
\
ldr column_data, [sp], #8; /* fb_ptr2 */ \
do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
draw_mask_fb_ptr_left_b); \
\
+ pld [ fb_ptr, #2048 ]; \
and texture_block_ptr, texture_block_ptr, texture_mask; \
\
add fb_ptr, fb_ptr, #16*2; \
add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
\
vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
- pld [ fb_ptr ]; \
\
do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
draw_mask_fb_ptr_right_b); \
add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
\
+ pld [ fb_ptr, #2048 ]; \
do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
draw_mask_fb_ptr_##edge##_b); \
\
mov fb_ptr_advance_column, #32 * 2; \
vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
- sub fb_ptr_advance_column, height, lsl #11 + 1; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
mov left_mask_bits, #0xFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
and right_width, width_rounded, #0x7
mvn left_mask_bits, left_mask_bits, lsl left_offset
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
movw left_mask_bits, #0xFFFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
lsl right_width, #1
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
ldmia sp!, { r4 - r11, pc }
+#undef width
+#undef right_width
+#undef right_mask_bits
+#undef color
+#undef height
+#undef blocks_remaining
+#undef colors
+#undef right_mask
+#undef test_mask
+#undef draw_mask
+
+#define psx_gpu r0 /* base register for every psx_gpu_*_offset access below */
+#define x r1 /* consumed once when fb_ptr is formed; r1 is then reused as color_r */
+#define y r2 /* consumed once when fb_ptr is formed; r2 is then reused as color_g */
+#define width r3 /* loaded from the stack by setup_sprite_untextured */
+#define right_width r5 /* shares r5 with block; only live before block is first written */
+#define right_mask_bits r6 /* shares r6 with blocks_remaining; only live until right_mask is built */
+#define fb_ptr r7 /* running framebuffer address stored into each block entry */
+#define color r8 /* shares r8 with color_b */
+#define height r9 /* row counter, counted down to zero */
+#define fb_ptr_pitch r12 /* per-row fb_ptr adjustment; r12 is also the scratch for the entry tests */
+
+// referenced by setup_sprites_16bpp_flush
+#define num_blocks r4 /* running block count, written back to psx_gpu after each row */
+#define block r5 /* write cursor into the block array */
+#define block_width r11 /* number of 8-pixel blocks per row */
+
+#define color_r r1 /* same register as x */
+#define color_g r2 /* same register as y */
+#define color_b r8 /* same register as color: extracting blue overwrites the source color */
+#define blocks_remaining r6 /* same register as right_mask_bits */
+
+#define colors q0 /* packed 16-bit color replicated to all eight lanes */
+#define right_mask q1 /* mask stored for the last block of each row */
+#define test_mask q2 /* same register as draw_mask; only needed until right_mask is computed */
+#define draw_mask q2 /* zeroed and stored for the non-final blocks of a row */
+#define draw_mask_bits_fb_ptr d6 /* d6 is the low half of q3; lane 1 carries fb_ptr, lane 0 is zeroed */
+
+
+.align 3
+
+function(setup_sprite_untextured) /* queue an untextured sprite as 64-byte block entries: one entry per 8-pixel column per row */
+ ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ] /* r12 = 16-bit value at the render_state offset */
+ tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
+ | RENDER_FLAGS_BLEND) /* Z set only if all three bits are clear. NOTE(review): RENDER_FLAGS_* bits are tested against the render_state load - confirm the field layout makes this intended */
+ ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ] /* executed only while Z is set: r12 = render mode byte */
+ tsteq r12, #RENDER_INTERLACE_ENABLED /* executed only while Z is set: test the interlace bit */
+ beq setup_sprite_untextured_simple /* every tested bit clear: branch (no link, nothing pushed yet) to the simple variant */
+
+ stmdb sp!, { r4 - r11, r14 } /* save r4-r11 and lr: 9 words = 36 bytes */
+
+ ldr width, [ sp, #40 ] /* 36 bytes were pushed, so this is entry sp + 4; presumably the 2nd stack argument - confirm against the C prototype */
+ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ] /* fb_ptr = pointer stored at the vram_out_ptr offset */
+
+ ldr height, [ sp, #44 ] /* entry sp + 8 */
+ add fb_ptr, fb_ptr, y, lsl #11 /* fb_ptr += y * 2048 bytes */
+
+ add fb_ptr, fb_ptr, x, lsl #1 /* fb_ptr += x * 2 bytes */
+ sub right_width, width, #1 /* right_width = width - 1 */
+
+ ldr color, [ sp, #48 ] /* entry sp + 12 */
+ and right_width, #7 /* right_width = (width - 1) & 7 */
+
+ add block_width, width, #7 /* block_width = width + 7 */
+ add right_width, #1 /* right_width = ((width - 1) & 7) + 1, i.e. 1..8 */
+
+ lsr block_width, #3 /* block_width = (width + 7) >> 3 */
+ mov right_mask_bits, #0xff /* start from eight set bits */
+
+ sub fb_ptr_pitch, block_width, #1 /* fb_ptr_pitch = block_width - 1 */
+ lsl right_mask_bits, right_width /* right_mask_bits = 0xff << right_width: the low right_width bits are clear */
+
+ lsl fb_ptr_pitch, #3+1 /* fb_ptr_pitch = (block_width - 1) * 16 */
+ ubfx color_r, color, #3, #5 /* color_r = color bits 3..7; r1 is no longer needed as x */
+
+ rsb fb_ptr_pitch, #1024*2 /* fb_ptr_pitch = 2048 - (block_width - 1) * 16 */
+ ubfx color_g, color, #11, #5 /* color_g = color bits 11..15; r2 is no longer needed as y */
+
+ vld1.u32 { test_mask }, [ psx_gpu, :128 ] /* test_mask = first 16 bytes of the psx_gpu structure (offset 0) */
+ ubfx color_b, color, #19, #5 /* color_b = color bits 19..23; color_b and color are both r8, so the source color is replaced */
+
+ vdup.u16 right_mask, right_mask_bits /* replicate right_mask_bits into all eight 16-bit lanes */
+ orr color, color_r, color_b, lsl #10 /* color = r | b << 10 */
+
+ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] /* num_blocks = current 16-bit block count */
+ orr color, color, color_g, lsl #5 /* color = r | g << 5 | b << 10 */
+
+ vtst.u16 right_mask, right_mask, test_mask /* each lane becomes all-ones where (right_mask_bits & test_mask lane) != 0, otherwise zero */
+ add block, psx_gpu, #psx_gpu_blocks_offset /* block = address of the block array; r5 is no longer needed as right_width */
+
+ vdup.u16 colors, color /* replicate the packed color into all eight 16-bit lanes */
+ add block, block, num_blocks, lsl #6 /* skip num_blocks existing entries of 64 bytes each */
+
+
+setup_sprite_untextured_height_loop: /* one iteration per sprite row */
+ add num_blocks, block_width /* count the blocks this row is about to add */
+ sub blocks_remaining, block_width, #1 /* all blocks of the row except the last; r6 is no longer needed as right_mask_bits */
+
+ cmp num_blocks, #MAX_BLOCKS
+ blgt setup_sprites_16bpp_flush /* signed num_blocks > MAX_BLOCKS: call the flush routine (defined outside this view; presumably resets num_blocks and block - confirm) */
+
+ cmp blocks_remaining, #0
+ ble 1f /* single-block row: go straight to the right-edge block */
+
+ vmov.u8 draw_mask, #0 /* zero_mask: overwrites test_mask (same q2), which is no longer needed */
+ vmov.u8 draw_mask_bits_fb_ptr, #0 /* NOTE(review): skipped when block_width is 1, so lane 0 is never zeroed by this function in that case - confirm consumers ignore it */
+
+ 0: /* emit one non-final block of the row */
+ vst1.u32 { draw_mask }, [ block, :128 ]! /* entry + 0: all-zero mask; block advances by 16 */
+ subs blocks_remaining, #1 /* sets the flags consumed by bgt 0b; nothing in between alters them */
+
+ vst1.u32 { colors }, [ block, :128 ] /* entry + 16: replicated color */
+ add block, block, #24 /* block = entry + 40 */
+
+ vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr /* lane 1 = framebuffer address of this block */
+ vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ] /* entry + 40: lane 0, entry + 44: fb_ptr */
+
+ add block, block, #24 /* block = entry + 64, the next entry */
+ add fb_ptr, #8*2 /* advance 8 pixels of 2 bytes */
+ bgt 0b /* repeat while blocks_remaining > 0 */
+
+ 1: /* emit the last block of the row, using right_mask */
+ vst1.u32 { right_mask }, [ block, :128 ]! /* entry + 0: right-edge mask; block advances by 16 */
+ subs height, #1 /* sets the flags consumed by the bgt at the bottom; nothing in between alters them */
+
+ vst1.u32 { colors }, [ block, :128 ] /* entry + 16: replicated color */
+ add block, block, #24 /* block = entry + 40 */
+
+ vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr /* lane 1 = framebuffer address of this block */
+ vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ] /* entry + 40: lane 0, entry + 44: fb_ptr */
+
+ add block, block, #24 /* block = entry + 64, the next entry */
+ add fb_ptr, fb_ptr_pitch /* fb_ptr already moved (block_width - 1) * 16 in this row, so the net advance is 2048 bytes, the same stride applied to y above */
+
+ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] /* publish the updated block count */
+ bgt setup_sprite_untextured_height_loop /* repeat while height > 0 */
+
+ ldmia sp!, { r4 - r11, pc } /* restore r4-r11 and return by loading the saved lr into pc */
+
+
+
#undef texture_page_ptr
#undef vram_ptr
#undef dirty_textures_mask
mov r14, r2
add r0, #1024*2*2
add r4, #1024*2
- sub r0, r2, lsl #4+1
+ sub r0, r0, r2, lsl #4+1
mov r1, r4
add r12, r0, #1024*2
bgt 0b