... which the asm isn't doing properly
+#if 0 /* disabled variant: push/pop q4-q7 inside the asm routines themselves */
+#define save_abi_regs() \
+ vpush {q4-q7}
+#define restore_abi_regs() \
+ vpop {q4-q7}
+#else /* active variant: both macros expand to nothing */
+#define save_abi_regs()
+#define restore_abi_regs()
+#endif /* NOTE(review): with the empty variant q4-q7 are not preserved by the asm; presumably the C caller must account for that - confirm */
+
#define psx_gpu r0
#define v_a r1
#define v_b r2
#define psx_gpu r0
#define v_a r1
#define v_b r2
@ r12 = psx_gpu->triangle_area
ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
stmdb sp!, { r4 - r11, lr }
@ r12 = psx_gpu->triangle_area
ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
stmdb sp!, { r4 - r11, lr }
@ load exponent of 62 into upper half of double
movw r4, #0
@ load exponent of 62 into upper half of double
movw r4, #0
stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
ldmia sp!, { r4 - r11, pc }
ldmia sp!, { r4 - r11, pc }
#define setup_spans_prologue() \
stmdb sp!, { r4 - r11, lr }; \
#define setup_spans_prologue() \
stmdb sp!, { r4 - r11, lr }; \
\
ldrsh x_a, [v_a, #8]; \
ldrsh x_b, [v_b, #8]; \
\
ldrsh x_a, [v_a, #8]; \
ldrsh x_b, [v_b, #8]; \
#define setup_spans_epilogue() \
#define setup_spans_epilogue() \
ldmia sp!, { r4 - r11, pc } \
ldmia sp!, { r4 - r11, pc } \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
vshl.u32 uvrg_dx4, uvrg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
vshl.u32 uvrg_dx4, uvrg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
ldmia sp!, { r4 - r11, pc }; \
\
2: \
ldmia sp!, { r4 - r11, pc }; \
\
2: \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
vshl.u32 uvrg_dx4, uvrg_dx, #2; \
\
vshl.u32 uvrg_dx8, uvrg_dx, #3; \
vshl.u32 uvrg_dx4, uvrg_dx, #2; \
\
vshl.u32 uvrg_dx8, uvrg_dx, #3; \
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
ldmia sp!, { r4 - r11, pc }; \
\
2: \
ldmia sp!, { r4 - r11, pc }; \
\
2: \
bxeq lr
stmdb sp!, { r4 - r11, r14 }
bxeq lr
stmdb sp!, { r4 - r11, r14 }
vld1.u32 { test_mask }, [psx_gpu, :128]
ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
vld1.u32 { test_mask }, [psx_gpu, :128]
ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
bne 0b
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
bne 0b
ldmia sp!, { r4 - r11, pc }
2:
ldmia sp!, { r4 - r11, pc }
2:
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
vshl.u32 rg_dx4, rg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
vshl.u32 rg_dx4, rg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
bne 0b; \
\
ldmia sp!, { r4 - r11, pc }; \
\
2: \
ldmia sp!, { r4 - r11, pc }; \
\
2: \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
bxeq lr; \
\
stmdb sp!, { r4 - r11, r14 }; \
vshl.u32 rg_dx4, rg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
vshl.u32 rg_dx4, rg_dx, #2; \
\
ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
ldmia sp!, { r4 - r11, pc } \
setup_blocks_shaded_untextured_direct_builder(undithered)
ldmia sp!, { r4 - r11, pc } \
setup_blocks_shaded_untextured_direct_builder(undithered)
function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
stmdb sp!, { r4 - r5, lr }; \
function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
stmdb sp!, { r4 - r5, lr }; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vld1.u32 { test_mask }, [psx_gpu, :128]; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vld1.u32 { test_mask }, [psx_gpu, :128]; \
shade_blocks_textured_modulated_store_draw_mask_##target(28); \
shade_blocks_textured_modulated_store_pixels_##target(); \
\
shade_blocks_textured_modulated_store_draw_mask_##target(28); \
shade_blocks_textured_modulated_store_pixels_##target(); \
\
ldmia sp!, { r4 - r5, pc } \
ldmia sp!, { r4 - r5, pc } \
.align 3
function(shade_blocks_textured_unmodulated_indirect)
.align 3
function(shade_blocks_textured_unmodulated_indirect)
+ stmdb sp!, { r4, r14 }
+ save_abi_regs()
add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
vorr.u16 draw_mask_combined, draw_mask, zero_mask
vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
vorr.u16 draw_mask_combined, draw_mask, zero_mask
vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
+ restore_abi_regs()
+ ldmia sp!, { r4, pc }
.align 3
function(shade_blocks_textured_unmodulated_direct)
stmdb sp!, { r4, r14 }
.align 3
function(shade_blocks_textured_unmodulated_direct)
stmdb sp!, { r4, r14 }
add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
vst1.u16 { fb_pixels_next }, [fb_ptr_next]
vst1.u16 { fb_pixels_next }, [fb_ptr_next]
function(shade_blocks_unshaded_untextured_direct)
stmdb sp!, { r4, r14 }
function(shade_blocks_unshaded_untextured_direct)
stmdb sp!, { r4, r14 }
add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
vbif.u16 fb_pixels_next, pixels, draw_mask
vst1.u16 { fb_pixels_next }, [fb_ptr_next]
vbif.u16 fb_pixels_next, pixels, draw_mask
vst1.u16 { fb_pixels_next }, [fb_ptr_next]
\
function(blend_blocks_##texturing##_average_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_##texturing##_average_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
vst1.u16 { fb_pixels_next }, [fb_ptr_next]; \
\
vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
vst1.u16 { fb_pixels_next }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
\
function(blend_blocks_textured_add_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_textured_add_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
\
function(blend_blocks_untextured_add_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_untextured_add_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
\
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
\
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
\
function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
\
function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
stmdb sp!, { r4, r14 }; \
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
vbit.u16 blend_pixels, fb_pixels, draw_mask; \
vst1.u16 { blend_pixels }, [fb_ptr_next]; \
\
ldmia sp!, { r4, pc }; \
\
2: \
ldmia sp!, { r4, pc }; \
\
2: \
function(blend_blocks_textured_unblended_on)
stmdb sp!, { r4, r14 }
function(blend_blocks_textured_unblended_on)
stmdb sp!, { r4, r14 }
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [fb_ptr]
vbif.u16 fb_pixels, pixels, draw_mask
vst1.u16 { fb_pixels }, [fb_ptr]
setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
\
setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
\
setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
ldmia sp!, { r4 - r11, pc } \
#define setup_sprite_tiled_advance_column() \
ldmia sp!, { r4 - r11, pc } \
#define setup_sprite_tiled_advance_column() \
\
setup_sprite_tiled_advance_column(); \
setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
\
setup_sprite_tiled_advance_column(); \
setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
ldmia sp!, { r4 - r11, pc } \
ldmia sp!, { r4 - r11, pc } \
ldr height, [sp, #44]; \
add fb_ptr, fb_ptr, y, lsl #11; \
\
ldr height, [sp, #44]; \
add fb_ptr, fb_ptr, y, lsl #11; \
\
add fb_ptr, fb_ptr, x, lsl #1; \
and offset_v, v, #0xF; \
\
add fb_ptr, fb_ptr, x, lsl #1; \
and offset_v, v, #0xF; \
\
+#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD)
+ // the asm doesn't save the callee-saved vector regs (q4-q7), so list them as clobbers here to make the compiler preserve them
+ __asm__ __volatile__("":::"q4","q5","q6","q7");
+#endif
+
if (gpu.state.enhancement_active)
ret = gpu_parse_enhanced(&egpu, list, count * 4, (u32 *)last_cmd);
else
ret = gpu_parse(&egpu, list, count * 4, (u32 *)last_cmd);
if (gpu.state.enhancement_active)
ret = gpu_parse_enhanced(&egpu, list, count * 4, (u32 *)last_cmd);
else
ret = gpu_parse(&egpu, list, count * 4, (u32 *)last_cmd);
+#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD)
+ __asm__ __volatile__("":::"q4","q5","q6","q7");
+#endif
+
ex_regs[1] &= ~0x1ff;
ex_regs[1] |= egpu.texture_settings & 0x1ff;
return ret;
ex_regs[1] &= ~0x1ff;
ex_regs[1] |= egpu.texture_settings & 0x1ff;
return ret;