#define edge_data_right_mask_offset 4
#define edge_data_y_offset 6
+.syntax unified
+.text
#define psx_gpu r0
#define v_a r1
.align 4
-/* FIXME: users of this should be in psx_gpu instead */
-#ifndef __PIC__
-#define load_pointer(register, pointer) \
- movw register, :lower16:pointer; \
- movt register, :upper16:pointer; \
+#ifndef __MACH__ /* this branch: plain symbol names, jump tables hold target addresses */
-#else
-#define load_pointer(register, pointer) \
- ldr register, =pointer \
+#define function(name) /* exports name, tags it as a function symbol, defines its label */ \
+ .global name; \
+ .type name, %function; \
+ name: \
-#endif
+#define JT_OP_REL(table_label, index_reg, temp) /* expands to nothing in this branch */
+#define JT_OP(x...) x /* passes the wrapped dispatch instruction through unchanged */
+#define JTE(start, target) target /* table entry is the target itself; start is unused */
+
+#define EXTRA_UNSAVED_REGS /* empty: nothing extra is spliced into the save lists */
+
+#else /* __MACH__ is defined */
#define function(name) /* exports _name and defines both the name and _name labels */ \
- .global name; \
+ .globl _##name; \
name: \
+ _##name: \
+
+#define JT_OP_REL(table_label, index_reg, temp) /* relative dispatch: loads entry index_reg of the table and adds it to pc; clobbers temp */ \
+ adr temp, table_label; \
+ ldr temp, [ temp, index_reg, lsl #2 ]; \
+ add pc, pc, temp \
+
+#define JT_OP(x...) /* drops the wrapped instruction; JT_OP_REL does the dispatch here */
+#define JTE(start, target) (target - start) /* entry is an offset from start. NOTE(review): JT_OP_REL adds it to pc, so start has to be the address pc reads as at that add (instruction + 8 in ARM state); call sites place one nop between the add and the label - confirm for any new table */
+
+// r9 is saved around calls in this branch. r7 is preserved by the callee, but
+// it is added as well so that two registers are pushed instead of one (EABI
+// stack alignment).
+#define EXTRA_UNSAVED_REGS r7, r9, /* trailing comma: spliced mid-list, e.g. { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } */
+
+#define flush_render_block_buffer _flush_render_block_buffer /* these three callees are referenced by underscore-prefixed names here; presumably the Mach-O C symbol naming - confirm */
+#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
+#define update_texture_8bpp_cache _update_texture_8bpp_cache
+
+#endif /* __MACH__ */
@ r0: psx_gpu
@ r1: v_a
vld1.32 { uvrg }, [ temp ]; \
add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
vld1.32 { uvrg_dy }, [ temp ]; \
- load_pointer(reciprocal_table_ptr, reciprocal_table); \
+ ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
\
vmov.u32 c_0x01, #0x01 \
#define height_b_alt r12
#define compute_edge_delta_x3(start_c, height_a, height_b) \
- vmov.u32 heights, height_a, height_b; \
+ vmov heights, height_a, height_b; \
ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
vmov.u32 edge_shifts[0], temp; \
ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
add temp, temp, #(1 << 16); \
add y_a, temp, #2; \
add y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
right_index); \
sub temp, temp, #(1 << 16); \
sub y_a, temp, #2; \
sub y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
\
sub height, y_a, y_c; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_up(major, minor, minor, yes); \
function(setup_spans_up_right)
setup_spans_up_up(right, left)
-.pool
-
#define setup_spans_down_down(minor, major) \
setup_spans_prologue(); \
sub height_minor_a, y_b, y_a; \
sub height, y_c, y_a; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_down(major, minor, minor, yes); \
function(setup_spans_up_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_up_flat()
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_up_flat()
function(setup_spans_down_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_down_flat()
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_down_flat()
sub height_minor_b, y_c, y_a
sub height_major, y_c, y_b
- vmov.u32 x_starts, x_a, x_c
+ vmov x_starts, x_a, x_c
vdup.u32 x_ends, x_b
compute_edge_delta_x3(x_a, height_minor_a, height_major)
mov temp, #0
- vmov.u32 height_increment, temp, height_minor_b
+ vmov height_increment, temp, height_minor_b
vmlal.s32 edges_xy, edges_dx_dy, height_increment
vmov edges_xy_b_left, edge_alt_low, edge_alt_high
sub temp, temp, #(1 << 16)
sub y_a, temp, #2
sub y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
vaddw.s32 edges_xy, edges_xy, edges_dx_dy
add temp, temp, #(1 << 16)
add y_a, temp, #2
add y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
setup_spans_adjust_edges_alternate_no(left, right)
bne 2b
bal 1b
-.pool
-
#undef span_uvrg_offset
#undef span_edge_data
#undef span_b_offset
vpush { texture_mask }; \
vpush { uvrg_dx4 }; \
\
- stmdb sp!, { r0 - r3, r12, r14 }; \
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
bl flush_render_block_buffer; \
- ldmia sp!, { r0 - r3, r12, r14 }; \
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
\
vpop { uvrg_dx4 }; \
vpop { texture_mask }; \
vpush { texture_mask }; \
vpush { uvrg_dx4 }; \
\
- stmdb sp!, { r0 - r3, r12, r14 }; \
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
bl flush_render_block_buffer; \
- ldmia sp!, { r0 - r3, r12, r14 }; \
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
\
vpop { uvrg_dx4 }; \
vpop { texture_mask }; \
2:
vpush { colors }
- stmdb sp!, { r0 - r3, r12, r14 }
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
bl flush_render_block_buffer
- ldmia sp!, { r0 - r3, r12, r14 }
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
vpop { colors }
vdup.u16 colors, color
add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
- orr color, color, lsl #16
+ orr color, color, color, lsl #16
0:
moveq right_mask, right_mask, lsr #2
tst right_mask, #0x1
- streqh color, [ fb_ptr ]
+ strheq color, [ fb_ptr ]
1:
add span_edge_data, span_edge_data, #8
/* TODO: Load from psx_gpu instead of saving/restoring these */\
vpush { rg_dx4 }; \
\
- stmdb sp!, { r0 - r3, r12, r14 }; \
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
bl flush_render_block_buffer; \
- ldmia sp!, { r0 - r3, r12, r14 }; \
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
\
vpop { rg_dx4 }; \
\
vmlal.u8 pixels, g_whole_8, d64_4; \
vmlal.u8 pixels, b_whole_8, d64_128; \
\
- ldr pc, [ pc, right_mask, lsl #2 ]; \
+ JT_OP_REL(100f, right_mask, temp); \
+ JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]); \
nop; \
+ 100: \
nop; \
- .word 4f; \
- .word 5f; \
- .word 6f; \
- .word 7f; \
- .word 8f; \
- .word 9f; \
- .word 10f; \
- .word 11f; \
+ .word JTE(100b, 4f); \
+ .word JTE(100b, 5f); \
+ .word JTE(100b, 6f); \
+ .word JTE(100b, 7f); \
+ .word JTE(100b, 8f); \
+ .word JTE(100b, 9f); \
+ .word JTE(100b, 10f); \
+ .word JTE(100b, 11f); \
\
4: \
vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \
orr pixels_a, pixels_a, pixel_3, lsl #24
orr pixels_b, pixels_b, pixel_7, lsl #24
- vmov.u32 texels, pixels_a, pixels_b
+ vmov texels, pixels_a, pixels_b
vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
ldmia sp!, { r3 - r11, pc }
1:
- stmdb sp!, { r1 - r2, r12 }
+ stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
bl update_texture_8bpp_cache
- ldmia sp!, { r1 - r2, r12 }
+ ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
bal 0b
beq 1f
0:
+ vorr.u16 pixels, pixels, msb_mask
vorr.u16 draw_mask, draw_mask, write_mask
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [ fb_ptr ]
bne 0b
1:
+ vorr.u16 pixels, pixels, msb_mask
vorr.u16 draw_mask, draw_mask, write_mask
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [ fb_ptr ]
#define fb_ptr_advance_column r12
#define texture_block_ptr r14
+#define temp r14
+
#define texture_page_ptr r3
#define left_block_mask r4
#define right_block_mask r5
setup_sprite_flush_blocks:
vpush { q1 - q5 }
- stmdb sp!, { r0 - r3, r12, r14 }
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
bl flush_render_block_buffer
- ldmia sp!, { r0 - r3, r12, r14 }
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
vpop { q1 - q5 }
setup_sprite_update_texture_8bpp_cache: /* wrapper: calls update_texture_8bpp_cache and returns with r0 - r3 restored */
- stmdb sp!, { r0 - r3, r14 }
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 } /* save r0 - r3, any platform extras, and the return address */
bl update_texture_8bpp_cache /* NOTE(review): the push above stores an odd number of words whether EXTRA_UNSAVED_REGS is empty or r7, r9; confirm the sp alignment the callee expects */
- ldmia sp!, { r0 - r3, pc }
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc } /* restore the saved registers and return by popping the saved r14 into pc */
#define setup_sprite_tiled_initialize_4bpp() \
mov fb_ptr_advance_column, #32; \
vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
\
- sub fb_ptr_advance_column, height, lsl #11; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
#define setup_sprite_setup_right_draw_mask_fb_ptr() \
mov fb_ptr_advance_column, #32 * 2; \
vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
- sub fb_ptr_advance_column, height, lsl #11 + 1; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
add block, block, num_blocks, lsl #6; \
\
orreq control_mask, control_mask, #0x2; \
- ldr pc, [ pc, control_mask, lsl #2 ]; \
+ JT_OP_REL(9f, control_mask, temp); \
+ JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]); \
nop; \
\
- .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \
+ 9: \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \
.word 0x00000000; \
- .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \
setup_sprite_tiled_builder(4bpp,);
#undef texels_wide_high
#undef texels_wide
#undef fb_ptr2
+#undef temp
#define psx_gpu r0
#define x r1
setup_sprites_16bpp_flush:
vpush { d0 - d3 }
- stmdb sp!, { r0 - r3, r12, r14 }
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
bl flush_render_block_buffer
- ldmia sp!, { r0 - r3, r12, r14 }
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
vpop { d0 - d3 }
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
mov left_mask_bits, #0xFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
and right_width, width_rounded, #0x7
mvn left_mask_bits, left_mask_bits, lsl left_offset
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
movw left_mask_bits, #0xFFFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
lsl right_width, #1
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
| RENDER_FLAGS_BLEND)
- ldreqb r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
+ ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
tsteq r12, #RENDER_INTERLACE_ENABLED
beq setup_sprite_untextured_simple
mov r14, r2
add r0, #1024*2*2
add r4, #1024*2
- sub r0, r2, lsl #4+1
+ sub r0, r0, r2, lsl #4+1
mov r1, r4
add r12, r0, #1024*2
bgt 0b