psx_gpu: workaround overflow crash
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index 103483a..4e1e403 100644 (file)
 #define MAX_BLOCKS                                        64
 #define MAX_BLOCKS_PER_ROW                                128
 
+#define RENDER_STATE_MASK_EVALUATE                        0x20
+#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
+#define RENDER_FLAGS_BLEND                                0x2
+
 #include "psx_gpu_offsets.h"
 
 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
@@ -1171,6 +1175,10 @@ function(setup_spans_up_down)
 
   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
   add temp, temp, height_minor_b
+
+  cmp temp, #MAX_SPANS
+  beq 5f
+
   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
 
  2:                                                     
@@ -1186,6 +1194,15 @@ function(setup_spans_up_down)
   setup_spans_prologue_b()
   bal 4b
 
+ 5:
+  // FIXME: overflow corner case
+  sub temp, temp, height_minor_b
+  bics height_minor_b, #3
+  add temp, temp, height_minor_b
+  strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
+  bne 2b
+  bal 1b
+
 .pool
 
 #undef span_uvrg_offset
@@ -3388,10 +3405,12 @@ function(shade_blocks_textured_unmodulated_direct)
    [ draw_mask_bits_ptr, :16 ], c_64
   vbif.u16 fb_pixels, pixels, draw_mask_combined
 
-  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
-
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
+  pld [ fb_ptr_next, #64 ]
+
   add fb_ptr_cmp, fb_ptr_cmp, #14
+  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
+
   cmp fb_ptr_cmp, #28
   bls 4f
 
@@ -3750,11 +3769,15 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
                                                                                \
-  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
-  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+  sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  pld [ fb_ptr_next, #64 ];                                                    \
                                                                                \
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
+                                                                               \
   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
+  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+                                                                               \
   cmp fb_ptr_cmp, #28;                                                         \
   bls 2f;                                                                      \
                                                                                \
@@ -4913,12 +4936,12 @@ setup_sprite_update_texture_8bpp_cache:
    draw_mask_fb_ptr_left_b);                                                   \
                                                                                \
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
-  add fb_ptr, fb_ptr, #16*2;                                                   \
+  pld [ fb_ptr, #2048 ];                                                       \
                                                                                \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
-  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
+  add fb_ptr, fb_ptr, #16*2;                                                   \
                                                                                \
-  pld [ fb_ptr ];                                                              \
+  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
                                                                                \
   vzip.8 texels_low, texels_high;                                              \
@@ -4957,9 +4980,10 @@ setup_sprite_update_texture_8bpp_cache:
   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
    draw_mask_fb_ptr_##edge##_b);                                               \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   add fb_ptr, fb_ptr, #2048 * 2;                                               \
-  subs sub_tile_height, sub_tile_height, #1;                                   \
                                                                                \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
   bne 4b;                                                                      \
                                                                                \
   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
@@ -4983,13 +5007,13 @@ setup_sprite_update_texture_8bpp_cache:
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
    draw_mask_fb_ptr_left_b);                                                   \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
                                                                                \
   add fb_ptr, fb_ptr, #16*2;                                                   \
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
                                                                                \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
-  pld [ fb_ptr ];                                                              \
                                                                                \
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
    draw_mask_fb_ptr_right_b);                                                  \
@@ -5018,6 +5042,7 @@ setup_sprite_update_texture_8bpp_cache:
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
    draw_mask_fb_ptr_##edge##_b);                                               \
                                                                                \
@@ -5687,6 +5712,145 @@ function(setup_sprite_16bpp_4x)
   ldmia sp!, { r4 - r11, pc }
 
 
+#undef width
+#undef right_width
+#undef right_mask_bits
+#undef color
+#undef height
+#undef blocks_remaining
+#undef colors
+#undef right_mask
+#undef test_mask
+#undef draw_mask
+
+#define psx_gpu                                           r0
+#define x                                                 r1
+#define y                                                 r2
+#define width                                             r3
+#define right_width                                       r5
+#define right_mask_bits                                   r6
+#define fb_ptr                                            r7
+#define color                                             r8
+#define height                                            r9
+#define fb_ptr_pitch                                      r12
+
+// referenced by setup_sprites_16bpp_flush
+#define num_blocks                                        r4
+#define block                                             r5
+#define block_width                                       r11
+
+#define color_r                                           r1
+#define color_g                                           r2
+#define color_b                                           r8
+#define blocks_remaining                                  r6
+
+#define colors                                            q0
+#define right_mask                                        q1
+#define test_mask                                         q2
+#define draw_mask                                         q2
+#define draw_mask_bits_fb_ptr                             d6
+
+
+.align 3
+
+function(setup_sprite_untextured)
+  ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
+  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
+    | RENDER_FLAGS_BLEND)
+  beq setup_sprite_untextured_simple
+
+  stmdb sp!, { r4 - r11, r14 }
+
+  ldr width, [ sp, #40 ]
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
+
+  ldr height, [ sp, #44 ]
+  add fb_ptr, fb_ptr, y, lsl #11
+
+  add fb_ptr, fb_ptr, x, lsl #1
+  sub right_width, width, #1
+
+  ldr color, [ sp, #48 ]
+  and right_width, #7
+
+  add block_width, width, #7
+  add right_width, #1
+
+  lsr block_width, #3
+  mov right_mask_bits, #0xff
+
+  sub fb_ptr_pitch, block_width, #1
+  lsl right_mask_bits, right_width
+
+  lsl fb_ptr_pitch, #3+1
+  ubfx color_r, color, #3, #5
+
+  rsb fb_ptr_pitch, #1024*2
+  ubfx color_g, color, #11, #5
+
+  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
+  ubfx color_b, color, #19, #5
+
+  vdup.u16 right_mask, right_mask_bits
+  orr color, color_r, color_b, lsl #10
+
+  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  orr color, color, color_g, lsl #5
+
+  vtst.u16 right_mask, right_mask, test_mask
+  add block, psx_gpu, #psx_gpu_blocks_offset
+
+  vdup.u16 colors, color
+  add block, block, num_blocks, lsl #6
+
+
+setup_sprite_untextured_height_loop:
+  add num_blocks, block_width
+  sub blocks_remaining, block_width, #1
+
+  cmp num_blocks, #MAX_BLOCKS
+  blgt setup_sprites_16bpp_flush
+
+  cmp blocks_remaining, #0
+  ble 1f
+
+  vmov.u8 draw_mask, #0 /* zero_mask */
+  vmov.u8 draw_mask_bits_fb_ptr, #0
+
+ 0:
+  vst1.u32 { draw_mask }, [ block, :128 ]!
+  subs blocks_remaining, #1
+
+  vst1.u32 { colors }, [ block, :128 ]
+  add block, block, #24
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
+  
+  add block, block, #24
+  add fb_ptr, #8*2
+  bgt 0b
+
+ 1:
+  vst1.u32 { right_mask }, [ block, :128 ]!
+  subs height, #1
+
+  vst1.u32 { colors }, [ block, :128 ]
+  add block, block, #24
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
+  
+  add block, block, #24
+  add fb_ptr, fb_ptr_pitch
+
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  bgt setup_sprite_untextured_height_loop
+
+  ldmia sp!, { r4 - r11, pc }
+
+
+
 #undef texture_page_ptr
 #undef vram_ptr
 #undef dirty_textures_mask