X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=plugins%2Fgpu_neon%2Fpsx_gpu%2Fpsx_gpu_arm_neon.S;h=efb065d7e44959c50a759d3d39f8610d78ebfb67;hb=4d6467383217647e3fbc58ab9213a31c0f3bd8c9;hp=103483a8a45fbb3c5f0730727d5b73dd439b2a54;hpb=59d15d23d97d4347d8046057013f8979db0914f0;p=pcsx_rearmed.git diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index 103483a8..efb065d7 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -17,6 +17,11 @@ #define MAX_BLOCKS 64 #define MAX_BLOCKS_PER_ROW 128 +#define RENDER_STATE_MASK_EVALUATE 0x20 +#define RENDER_FLAGS_MODULATE_TEXELS 0x1 +#define RENDER_FLAGS_BLEND 0x2 +#define RENDER_INTERLACE_ENABLED 0x1 + #include "psx_gpu_offsets.h" #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4) @@ -26,6 +31,8 @@ #define edge_data_right_mask_offset 4 #define edge_data_y_offset 6 +.syntax unified +.text #define psx_gpu r0 #define v_a r1 @@ -187,21 +194,42 @@ .align 4 -/* FIXME: users of this should be in psx_gpu instead */ -#ifndef __PIC__ -#define load_pointer(register, pointer) \ - movw register, :lower16:pointer; \ - movt register, :upper16:pointer; \ +#ifndef __MACH__ -#else -#define load_pointer(register, pointer) \ - ldr register, =pointer \ +#define function(name) \ + .global name; \ + .type name, %function; \ + name: \ -#endif +#define JT_OP_REL(table_label, index_reg, temp) +#define JT_OP(x...) x +#define JTE(start, target) target + +#define EXTRA_UNSAVED_REGS + +#else #define function(name) \ - .global name; \ + .globl _##name; \ name: \ + _##name: \ + +#define JT_OP_REL(table_label, index_reg, temp) \ + adr temp, table_label; \ + ldr temp, [ temp, index_reg, lsl #2 ]; \ + add pc, pc, temp \ + +#define JT_OP(x...) +#define JTE(start, target) (target - start) + +// r7 is preserved, but add it for EABI alignment.. +#define EXTRA_UNSAVED_REGS r7, r9, + +#define flush_render_block_buffer _flush_render_block_buffer +#define setup_sprite_untextured_simple _setup_sprite_untextured_simple +#define update_texture_8bpp_cache _update_texture_8bpp_cache + +#endif @ r0: psx_gpu @ r1: v_a @@ -571,7 +599,7 @@ function(compute_all_gradients) vld1.32 { uvrg }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \ vld1.32 { uvrg_dy }, [ temp ]; \ - load_pointer(reciprocal_table_ptr, reciprocal_table); \ + ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \ \ vmov.u32 c_0x01, #0x01 \ @@ -619,7 +647,7 @@ function(compute_all_gradients) #define height_b_alt r12 #define compute_edge_delta_x3(start_c, height_a, height_b) \ - vmov.u32 heights, height_a, height_b; \ + vmov heights, height_a, height_b; \ ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \ vmov.u32 edge_shifts[0], temp; \ ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \ @@ -879,7 +907,7 @@ function(compute_all_gradients) add temp, temp, #(1 << 16); \ add y_a, temp, #2; \ add y_a, y_a, #(2 << 16); \ - vmov.u32 y_x4, temp, y_a; \ + vmov y_x4, temp, y_a; \ \ setup_spans_adjust_edges_alternate_##alternate_active(left_index, \ right_index); \ @@ -934,7 +962,7 @@ function(compute_all_gradients) sub temp, temp, #(1 << 16); \ sub y_a, temp, #2; \ sub y_a, y_a, #(2 << 16); \ - vmov.u32 y_x4, temp, y_a; \ + vmov y_x4, temp, y_a; \ \ vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \ \ @@ -965,7 +993,7 @@ function(compute_all_gradients) sub height, y_a, y_c; \ \ vdup.u32 x_starts, x_a; \ - vmov.u32 x_ends, x_c, x_b; \ + vmov x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_up(major, minor, minor, yes); \ @@ -977,8 +1005,6 @@ function(setup_spans_up_left) function(setup_spans_up_right) setup_spans_up_up(right, left) -.pool - #define setup_spans_down_down(minor, major) \ setup_spans_prologue(); \ sub height_minor_a, y_b, y_a; \ @@ -986,7 +1012,7 @@ function(setup_spans_up_right) sub height, y_c, y_a; \ \ vdup.u32 x_starts, x_a; \ - vmov.u32 x_ends, x_c, x_b; \ + vmov x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_down(major, minor, minor, yes); \ @@ -1009,7 +1035,7 @@ function(setup_spans_down_right) function(setup_spans_up_a) setup_spans_prologue() - vmov.u32 x_starts, x_a, x_b + vmov x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_up_flat() @@ -1018,7 +1044,7 @@ function(setup_spans_up_b) setup_spans_prologue() vdup.u32 x_starts, x_a - vmov.u32 x_ends, x_b, x_c + vmov x_ends, x_b, x_c setup_spans_up_flat() @@ -1032,7 +1058,7 @@ function(setup_spans_up_b) function(setup_spans_down_a) setup_spans_prologue() - vmov.u32 x_starts, x_a, x_b + vmov x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_down_flat() @@ -1041,7 +1067,7 @@ function(setup_spans_down_b) setup_spans_prologue() vdup.u32 x_starts, x_a - vmov.u32 x_ends, x_b, x_c + vmov x_ends, x_b, x_c setup_spans_down_flat() @@ -1072,13 +1098,13 @@ function(setup_spans_up_down) sub height_minor_b, y_c, y_a sub height_major, y_c, y_b - vmov.u32 x_starts, x_a, x_c + vmov x_starts, x_a, x_c vdup.u32 x_ends, x_b compute_edge_delta_x3(x_a, height_minor_a, height_major) mov temp, #0 - vmov.u32 height_increment, temp, height_minor_b + vmov height_increment, temp, height_minor_b vmlal.s32 edges_xy, edges_dx_dy, height_increment vmov edges_xy_b_left, edge_alt_low, edge_alt_high @@ -1115,7 +1141,7 @@ function(setup_spans_up_down) sub temp, temp, #(1 << 16) sub y_a, temp, #2 sub y_a, y_a, #(2 << 16) - vmov.u32 y_x4, temp, y_a + vmov y_x4, temp, y_a vaddw.s32 edges_xy, edges_xy, edges_dx_dy @@ -1165,12 +1191,16 @@ function(setup_spans_up_down) add temp, temp, #(1 << 16) add y_a, temp, #2 add y_a, y_a, #(2 << 16) - vmov.u32 y_x4, temp, y_a + vmov y_x4, temp, y_a setup_spans_adjust_edges_alternate_no(left, right) ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ] add temp, temp, height_minor_b + + cmp temp, #MAX_SPANS + beq 5f + strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ] 2: @@ -1186,7 +1216,14 @@ function(setup_spans_up_down) setup_spans_prologue_b() bal 4b -.pool + 5: + // FIXME: overflow corner case + sub temp, temp, height_minor_b + bics height_minor_b, #3 + add temp, temp, height_minor_b + strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ] + bne 2b + bal 1b #undef span_uvrg_offset #undef span_edge_data @@ -1556,9 +1593,9 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ - stmdb sp!, { r0 - r3, r12, r14 }; \ + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, r12, r14 }; \ + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ \ vpop { uvrg_dx4 }; \ vpop { texture_mask }; \ @@ -1753,9 +1790,9 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ - stmdb sp!, { r0 - r3, r12, r14 }; \ + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, r12, r14 }; \ + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ \ vpop { uvrg_dx4 }; \ vpop { texture_mask }; \ @@ -1869,9 +1906,9 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect) 2: vpush { colors } - stmdb sp!, { r0 - r3, r12, r14 } + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } bl flush_render_block_buffer - ldmia sp!, { r0 - r3, r12, r14 } + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } vpop { colors } @@ -1918,7 +1955,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) vdup.u16 colors, color add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset - orr color, color, lsl #16 + orr color, color, color, lsl #16 0: @@ -1960,7 +1997,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) moveq right_mask, right_mask, lsr #2 tst right_mask, #0x1 - streqh color, [ fb_ptr ] + strheq color, [ fb_ptr ] 1: add span_edge_data, span_edge_data, #8 @@ -2284,9 +2321,9 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { rg_dx4 }; \ \ - stmdb sp!, { r0 - r3, r12, r14 }; \ + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, r12, r14 }; \ + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ \ vpop { rg_dx4 }; \ \ @@ -2489,17 +2526,19 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ - ldr pc, [ pc, right_mask, lsl #2 ]; \ + JT_OP_REL(100f, right_mask, temp); \ + JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]); \ nop; \ + 100: \ nop; \ - .word 4f; \ - .word 5f; \ - .word 6f; \ - .word 7f; \ - .word 8f; \ - .word 9f; \ - .word 10f; \ - .word 11f; \ + .word JTE(100b, 4f); \ + .word JTE(100b, 5f); \ + .word JTE(100b, 6f); \ + .word JTE(100b, 7f); \ + .word JTE(100b, 8f); \ + .word JTE(100b, 9f); \ + .word JTE(100b, 10f); \ + .word JTE(100b, 11f); \ \ 4: \ vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \ @@ -2672,7 +2711,7 @@ function(texture_blocks_4bpp) orr pixels_a, pixels_a, pixel_3, lsl #24 orr pixels_b, pixels_b, pixel_7, lsl #24 - vmov.u32 texels, pixels_a, pixels_b + vmov texels, pixels_a, pixels_b vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels @@ -2775,11 +2814,11 @@ function(texture_blocks_8bpp) ldmia sp!, { r3 - r11, pc } 1: - stmdb sp!, { r1 - r2, r12 } + stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } bl update_texture_8bpp_cache - ldmia sp!, { r1 - r2, r12 } + ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } bal 0b @@ -3388,10 +3427,12 @@ function(shade_blocks_textured_unmodulated_direct) [ draw_mask_bits_ptr, :16 ], c_64 vbif.u16 fb_pixels, pixels, draw_mask_combined - vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 - sub fb_ptr_cmp, fb_ptr_next, fb_ptr + pld [ fb_ptr_next, #64 ] + add fb_ptr_cmp, fb_ptr_cmp, #14 + vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 + cmp fb_ptr_cmp, #28 bls 4f @@ -3750,11 +3791,15 @@ function(blend_blocks_textured_add_##mask_evaluate) \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vand.u16 pixels_mg, pixels, d128_0x83E0; \ \ - vbit.u16 blend_pixels, fb_pixels, draw_mask; \ - vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ + sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ + pld [ fb_ptr_next, #64 ]; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ + vbit.u16 blend_pixels, fb_pixels, draw_mask; \ + \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ + vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ + \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ @@ -4256,6 +4301,7 @@ function(blend_blocks_textured_unblended_on) beq 1f 0: + vorr.u16 pixels, pixels, msb_mask vorr.u16 draw_mask, draw_mask, write_mask vbif.u16 fb_pixels, pixels, draw_mask vst1.u16 { fb_pixels }, [ fb_ptr ] @@ -4270,6 +4316,7 @@ function(blend_blocks_textured_unblended_on) bne 0b 1: + vorr.u16 pixels, pixels, msb_mask vorr.u16 draw_mask, draw_mask, write_mask vbif.u16 fb_pixels, pixels, draw_mask vst1.u16 { fb_pixels }, [ fb_ptr ] @@ -4380,6 +4427,8 @@ function(render_block_fill_body) #define fb_ptr_advance_column r12 #define texture_block_ptr r14 +#define temp r14 + #define texture_page_ptr r3 #define left_block_mask r4 #define right_block_mask r5 @@ -4435,9 +4484,9 @@ function(render_block_fill_body) setup_sprite_flush_blocks: vpush { q1 - q5 } - stmdb sp!, { r0 - r3, r12, r14 } + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } bl flush_render_block_buffer - ldmia sp!, { r0 - r3, r12, r14 } + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } vpop { q1 - q5 } @@ -4452,9 +4501,9 @@ setup_sprite_update_texture_4bpp_cache: setup_sprite_update_texture_8bpp_cache: - stmdb sp!, { r0 - r3, r14 } + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 } bl update_texture_8bpp_cache - ldmia sp!, { r0 - r3, pc } + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc } #define setup_sprite_tiled_initialize_4bpp() \ @@ -4725,7 +4774,7 @@ setup_sprite_update_texture_8bpp_cache: mov fb_ptr_advance_column, #32; \ vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \ \ - sub fb_ptr_advance_column, height, lsl #11; \ + sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \ vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \ #define setup_sprite_setup_right_draw_mask_fb_ptr() \ @@ -4913,12 +4962,12 @@ setup_sprite_update_texture_8bpp_cache: draw_mask_fb_ptr_left_b); \ \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ - add fb_ptr, fb_ptr, #16*2; \ + pld [ fb_ptr, #2048 ]; \ \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ - vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \ + add fb_ptr, fb_ptr, #16*2; \ \ - pld [ fb_ptr ]; \ + vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \ vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \ \ vzip.8 texels_low, texels_high; \ @@ -4957,9 +5006,10 @@ setup_sprite_update_texture_8bpp_cache: do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \ draw_mask_fb_ptr_##edge##_b); \ \ + pld [ fb_ptr, #2048 ]; \ add fb_ptr, fb_ptr, #2048 * 2; \ - subs sub_tile_height, sub_tile_height, #1; \ \ + subs sub_tile_height, sub_tile_height, #1; \ bne 4b; \ \ ldr column_data, [sp], #8; /* fb_ptr2 */ \ @@ -4983,13 +5033,13 @@ setup_sprite_update_texture_8bpp_cache: do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \ draw_mask_fb_ptr_left_b); \ \ + pld [ fb_ptr, #2048 ]; \ and texture_block_ptr, texture_block_ptr, texture_mask; \ \ add fb_ptr, fb_ptr, #16*2; \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ - pld [ fb_ptr ]; \ \ do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \ draw_mask_fb_ptr_right_b); \ @@ -5018,6 +5068,7 @@ setup_sprite_update_texture_8bpp_cache: add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ \ + pld [ fb_ptr, #2048 ]; \ do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \ draw_mask_fb_ptr_##edge##_b); \ \ @@ -5067,7 +5118,7 @@ setup_sprite_update_texture_8bpp_cache: mov fb_ptr_advance_column, #32 * 2; \ vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \ vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \ - sub fb_ptr_advance_column, height, lsl #11 + 1; \ + sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \ vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \ vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \ @@ -5198,24 +5249,26 @@ function(setup_sprite_##texture_mode##x4mode) \ add block, block, num_blocks, lsl #6; \ \ orreq control_mask, control_mask, #0x2; \ - ldr pc, [ pc, control_mask, lsl #2 ]; \ + JT_OP_REL(9f, control_mask, temp); \ + JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]); \ nop; \ \ - .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \ + 9: \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \ .word 0x00000000; \ - .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \ setup_sprite_tiled_builder(4bpp,); @@ -5320,6 +5373,7 @@ function(texture_sprite_blocks_8bpp) #undef texels_wide_high #undef texels_wide #undef fb_ptr2 +#undef temp #define psx_gpu r0 #define x r1 @@ -5373,9 +5427,9 @@ function(texture_sprite_blocks_8bpp) setup_sprites_16bpp_flush: vpush { d0 - d3 } - stmdb sp!, { r0 - r3, r12, r14 } + stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } bl flush_render_block_buffer - ldmia sp!, { r0 - r3, r12, r14 } + ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } vpop { d0 - d3 } @@ -5400,7 +5454,7 @@ function(setup_sprite_16bpp) add texture_offset_base, u, u add width_rounded, width, #7 - add texture_offset_base, v, lsl #11 + add texture_offset_base, texture_offset_base, v, lsl #11 mov left_mask_bits, #0xFF ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ] @@ -5415,7 +5469,7 @@ function(setup_sprite_16bpp) and right_width, width_rounded, #0x7 mvn left_mask_bits, left_mask_bits, lsl left_offset - add texture_mask, texture_mask_height, lsl #11 + add texture_mask, texture_mask, texture_mask_height, lsl #11 mov block_width, width_rounded, lsr #3 mov right_mask_bits, right_mask_bits, lsl right_width @@ -5562,7 +5616,7 @@ function(setup_sprite_16bpp_4x) add texture_offset_base, u, u add width_rounded, width, #7 - add texture_offset_base, v, lsl #11 + add texture_offset_base, texture_offset_base, v, lsl #11 movw left_mask_bits, #0xFFFF ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ] @@ -5581,7 +5635,7 @@ function(setup_sprite_16bpp_4x) lsl right_width, #1 - add texture_mask, texture_mask_height, lsl #11 + add texture_mask, texture_mask, texture_mask_height, lsl #11 mov block_width, width_rounded, lsr #3 mov right_mask_bits, right_mask_bits, lsl right_width @@ -5687,6 +5741,147 @@ function(setup_sprite_16bpp_4x) ldmia sp!, { r4 - r11, pc } +#undef width +#undef right_width +#undef right_mask_bits +#undef color +#undef height +#undef blocks_remaining +#undef colors +#undef right_mask +#undef test_mask +#undef draw_mask + +#define psx_gpu r0 +#define x r1 +#define y r2 +#define width r3 +#define right_width r5 +#define right_mask_bits r6 +#define fb_ptr r7 +#define color r8 +#define height r9 +#define fb_ptr_pitch r12 + +// referenced by setup_sprites_16bpp_flush +#define num_blocks r4 +#define block r5 +#define block_width r11 + +#define color_r r1 +#define color_g r2 +#define color_b r8 +#define blocks_remaining r6 + +#define colors q0 +#define right_mask q1 +#define test_mask q2 +#define draw_mask q2 +#define draw_mask_bits_fb_ptr d6 + + +.align 3 + +function(setup_sprite_untextured) + ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ] + tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \ + | RENDER_FLAGS_BLEND) + ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ] + tsteq r12, #RENDER_INTERLACE_ENABLED + beq setup_sprite_untextured_simple + + stmdb sp!, { r4 - r11, r14 } + + ldr width, [ sp, #40 ] + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ] + + ldr height, [ sp, #44 ] + add fb_ptr, fb_ptr, y, lsl #11 + + add fb_ptr, fb_ptr, x, lsl #1 + sub right_width, width, #1 + + ldr color, [ sp, #48 ] + and right_width, #7 + + add block_width, width, #7 + add right_width, #1 + + lsr block_width, #3 + mov right_mask_bits, #0xff + + sub fb_ptr_pitch, block_width, #1 + lsl right_mask_bits, right_width + + lsl fb_ptr_pitch, #3+1 + ubfx color_r, color, #3, #5 + + rsb fb_ptr_pitch, #1024*2 + ubfx color_g, color, #11, #5 + + vld1.u32 { test_mask }, [ psx_gpu, :128 ] + ubfx color_b, color, #19, #5 + + vdup.u16 right_mask, right_mask_bits + orr color, color_r, color_b, lsl #10 + + ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] + orr color, color, color_g, lsl #5 + + vtst.u16 right_mask, right_mask, test_mask + add block, psx_gpu, #psx_gpu_blocks_offset + + vdup.u16 colors, color + add block, block, num_blocks, lsl #6 + + +setup_sprite_untextured_height_loop: + add num_blocks, block_width + sub blocks_remaining, block_width, #1 + + cmp num_blocks, #MAX_BLOCKS + blgt setup_sprites_16bpp_flush + + cmp blocks_remaining, #0 + ble 1f + + vmov.u8 draw_mask, #0 /* zero_mask */ + vmov.u8 draw_mask_bits_fb_ptr, #0 + + 0: + vst1.u32 { draw_mask }, [ block, :128 ]! + subs blocks_remaining, #1 + + vst1.u32 { colors }, [ block, :128 ] + add block, block, #24 + + vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr + vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ] + + add block, block, #24 + add fb_ptr, #8*2 + bgt 0b + + 1: + vst1.u32 { right_mask }, [ block, :128 ]! + subs height, #1 + + vst1.u32 { colors }, [ block, :128 ] + add block, block, #24 + + vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr + vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ] + + add block, block, #24 + add fb_ptr, fb_ptr_pitch + + strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] + bgt setup_sprite_untextured_height_loop + + ldmia sp!, { r4 - r11, pc } + + + #undef texture_page_ptr #undef vram_ptr #undef dirty_textures_mask @@ -5912,7 +6107,7 @@ function(scale2x_tiles8) mov r14, r2 add r0, #1024*2*2 add r4, #1024*2 - sub r0, r2, lsl #4+1 + sub r0, r0, r2, lsl #4+1 mov r1, r4 add r12, r0, #1024*2 bgt 0b