psx_gpu: convert to UAL, load everything from context
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index 103483a..8df7aca 100644 (file)
 #define MAX_BLOCKS                                        64
 #define MAX_BLOCKS_PER_ROW                                128
 
+#define RENDER_STATE_MASK_EVALUATE                        0x20
+#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
+#define RENDER_FLAGS_BLEND                                0x2
+#define RENDER_INTERLACE_ENABLED                          0x1
+
 #include "psx_gpu_offsets.h"
 
 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
@@ -26,6 +31,8 @@
 #define edge_data_right_mask_offset                       4
 #define edge_data_y_offset                                6
 
+.syntax unified
+.text
 
 #define psx_gpu                                           r0
 #define v_a                                               r1
 
 .align 4
 
-/* FIXME: users of this should be in psx_gpu instead */
-#ifndef __PIC__
-#define load_pointer(register, pointer)                                        \
-  movw register, :lower16:pointer;                                             \
-  movt register, :upper16:pointer;                                             \
-
-#else
-#define load_pointer(register, pointer)                                        \
-  ldr  register, =pointer                                                      \
-
-#endif
-
 #define function(name)                                                         \
   .global name;                                                                \
   name:                                                                        \
@@ -571,7 +566,7 @@ function(compute_all_gradients)
   vld1.32 { uvrg }, [ temp ];                                                  \
   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
   vld1.32 { uvrg_dy }, [ temp ];                                               \
-  load_pointer(reciprocal_table_ptr, reciprocal_table);                        \
+  ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
                                                                                \
   vmov.u32 c_0x01, #0x01                                                       \
 
@@ -619,7 +614,7 @@ function(compute_all_gradients)
 #define height_b_alt              r12
 
 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
-  vmov.u32 heights, height_a, height_b;                                        \
+  vmov heights, height_a, height_b;                                            \
   ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ];                        \
   vmov.u32 edge_shifts[0], temp;                                               \
   ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ];                        \
@@ -879,7 +874,7 @@ function(compute_all_gradients)
   add temp, temp, #(1 << 16);                                                  \
   add y_a, temp, #2;                                                           \
   add y_a, y_a, #(2 << 16);                                                    \
-  vmov.u32 y_x4, temp, y_a;                                                    \
+  vmov y_x4, temp, y_a;                                                        \
                                                                                \
   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
    right_index);                                                               \
@@ -934,7 +929,7 @@ function(compute_all_gradients)
   sub temp, temp, #(1 << 16);                                                  \
   sub y_a, temp, #2;                                                           \
   sub y_a, y_a, #(2 << 16);                                                    \
-  vmov.u32 y_x4, temp, y_a;                                                    \
+  vmov y_x4, temp, y_a;                                                        \
                                                                                \
   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
                                                                                \
@@ -965,7 +960,7 @@ function(compute_all_gradients)
   sub height, y_a, y_c;                                                        \
                                                                                \
   vdup.u32 x_starts, x_a;                                                      \
-  vmov.u32 x_ends, x_c, x_b;                                                   \
+  vmov x_ends, x_c, x_b;                                                       \
                                                                                \
   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
   setup_spans_up(major, minor, minor, yes);                                    \
@@ -977,8 +972,6 @@ function(setup_spans_up_left)
 function(setup_spans_up_right)
   setup_spans_up_up(right, left)
 
-.pool
-
 #define setup_spans_down_down(minor, major)                                    \
   setup_spans_prologue();                                                      \
   sub height_minor_a, y_b, y_a;                                                \
@@ -986,7 +979,7 @@ function(setup_spans_up_right)
   sub height, y_c, y_a;                                                        \
                                                                                \
   vdup.u32 x_starts, x_a;                                                      \
-  vmov.u32 x_ends, x_c, x_b;                                                   \
+  vmov x_ends, x_c, x_b;                                                       \
                                                                                \
   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
   setup_spans_down(major, minor, minor, yes);                                  \
@@ -1009,7 +1002,7 @@ function(setup_spans_down_right)
 function(setup_spans_up_a)
   setup_spans_prologue()
 
-  vmov.u32 x_starts, x_a, x_b
+  vmov x_starts, x_a, x_b
   vdup.u32 x_ends, x_c
 
   setup_spans_up_flat()
@@ -1018,7 +1011,7 @@ function(setup_spans_up_b)
   setup_spans_prologue()
 
   vdup.u32 x_starts, x_a
-  vmov.u32 x_ends, x_b, x_c
+  vmov x_ends, x_b, x_c
 
   setup_spans_up_flat()
 
@@ -1032,7 +1025,7 @@ function(setup_spans_up_b)
 function(setup_spans_down_a)
   setup_spans_prologue()
 
-  vmov.u32 x_starts, x_a, x_b
+  vmov x_starts, x_a, x_b
   vdup.u32 x_ends, x_c
 
   setup_spans_down_flat()
@@ -1041,7 +1034,7 @@ function(setup_spans_down_b)
   setup_spans_prologue()
 
   vdup.u32 x_starts, x_a
-  vmov.u32 x_ends, x_b, x_c
+  vmov x_ends, x_b, x_c
 
   setup_spans_down_flat()
 
@@ -1072,13 +1065,13 @@ function(setup_spans_up_down)
   sub height_minor_b, y_c, y_a
   sub height_major, y_c, y_b
 
-  vmov.u32 x_starts, x_a, x_c
+  vmov x_starts, x_a, x_c
   vdup.u32 x_ends, x_b
 
   compute_edge_delta_x3(x_a, height_minor_a, height_major)
 
   mov temp, #0
-  vmov.u32 height_increment, temp, height_minor_b
+  vmov height_increment, temp, height_minor_b
   vmlal.s32 edges_xy, edges_dx_dy, height_increment
 
   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
@@ -1115,7 +1108,7 @@ function(setup_spans_up_down)
   sub temp, temp, #(1 << 16)
   sub y_a, temp, #2
   sub y_a, y_a, #(2 << 16)
-  vmov.u32 y_x4, temp, y_a
+  vmov y_x4, temp, y_a
 
   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
 
@@ -1165,12 +1158,16 @@ function(setup_spans_up_down)
   add temp, temp, #(1 << 16) 
   add y_a, temp, #2
   add y_a, y_a, #(2 << 16)
-  vmov.u32 y_x4, temp, y_a
+  vmov y_x4, temp, y_a
 
   setup_spans_adjust_edges_alternate_no(left, right)
 
   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
   add temp, temp, height_minor_b
+
+  cmp temp, #MAX_SPANS
+  beq 5f
+
   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
 
  2:                                                     
@@ -1186,7 +1183,14 @@ function(setup_spans_up_down)
   setup_spans_prologue_b()
   bal 4b
 
-.pool
+ 5:
+  // FIXME: overflow corner case
+  sub temp, temp, height_minor_b
+  bics height_minor_b, #3
+  add temp, temp, height_minor_b
+  strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
+  bne 2b
+  bal 1b
 
 #undef span_uvrg_offset
 #undef span_edge_data
@@ -1918,7 +1922,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
   vdup.u16 colors, color
 
   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
-  orr color, color, lsl #16
+  orr color, color, color, lsl #16
 
 
  0:
@@ -1960,7 +1964,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
   moveq right_mask, right_mask, lsr #2
 
   tst right_mask, #0x1
-  streqh color, [ fb_ptr ]
+  strheq color, [ fb_ptr ]
 
  1:
   add span_edge_data, span_edge_data, #8
@@ -2672,7 +2676,7 @@ function(texture_blocks_4bpp)
   orr pixels_a, pixels_a, pixel_3, lsl #24
 
   orr pixels_b, pixels_b, pixel_7, lsl #24
-  vmov.u32 texels, pixels_a, pixels_b
+  vmov texels, pixels_a, pixels_b
 
   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
@@ -3388,10 +3392,12 @@ function(shade_blocks_textured_unmodulated_direct)
    [ draw_mask_bits_ptr, :16 ], c_64
   vbif.u16 fb_pixels, pixels, draw_mask_combined
 
-  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
-
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
+  pld [ fb_ptr_next, #64 ]
+
   add fb_ptr_cmp, fb_ptr_cmp, #14
+  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
+
   cmp fb_ptr_cmp, #28
   bls 4f
 
@@ -3750,11 +3756,15 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
                                                                                \
-  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
-  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+  sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  pld [ fb_ptr_next, #64 ];                                                    \
                                                                                \
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
+                                                                               \
   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
+  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+                                                                               \
   cmp fb_ptr_cmp, #28;                                                         \
   bls 2f;                                                                      \
                                                                                \
@@ -4256,6 +4266,7 @@ function(blend_blocks_textured_unblended_on)
   beq 1f
 
  0:
+  vorr.u16 pixels, pixels, msb_mask
   vorr.u16 draw_mask, draw_mask, write_mask
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [ fb_ptr ]
@@ -4270,6 +4281,7 @@ function(blend_blocks_textured_unblended_on)
   bne 0b
  
  1:
+  vorr.u16 pixels, pixels, msb_mask
   vorr.u16 draw_mask, draw_mask, write_mask
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [ fb_ptr ]
@@ -4725,7 +4737,7 @@ setup_sprite_update_texture_8bpp_cache:
   mov fb_ptr_advance_column, #32;                                              \
   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
                                                                                \
-  sub fb_ptr_advance_column, height, lsl #11;                                  \
+  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
 
 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
@@ -4913,12 +4925,12 @@ setup_sprite_update_texture_8bpp_cache:
    draw_mask_fb_ptr_left_b);                                                   \
                                                                                \
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
-  add fb_ptr, fb_ptr, #16*2;                                                   \
+  pld [ fb_ptr, #2048 ];                                                       \
                                                                                \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
-  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
+  add fb_ptr, fb_ptr, #16*2;                                                   \
                                                                                \
-  pld [ fb_ptr ];                                                              \
+  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
                                                                                \
   vzip.8 texels_low, texels_high;                                              \
@@ -4957,9 +4969,10 @@ setup_sprite_update_texture_8bpp_cache:
   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
    draw_mask_fb_ptr_##edge##_b);                                               \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   add fb_ptr, fb_ptr, #2048 * 2;                                               \
-  subs sub_tile_height, sub_tile_height, #1;                                   \
                                                                                \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
   bne 4b;                                                                      \
                                                                                \
   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
@@ -4983,13 +4996,13 @@ setup_sprite_update_texture_8bpp_cache:
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
    draw_mask_fb_ptr_left_b);                                                   \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
                                                                                \
   add fb_ptr, fb_ptr, #16*2;                                                   \
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
                                                                                \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
-  pld [ fb_ptr ];                                                              \
                                                                                \
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
    draw_mask_fb_ptr_right_b);                                                  \
@@ -5018,6 +5031,7 @@ setup_sprite_update_texture_8bpp_cache:
   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
                                                                                \
+  pld [ fb_ptr, #2048 ];                                                       \
   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
    draw_mask_fb_ptr_##edge##_b);                                               \
                                                                                \
@@ -5067,7 +5081,7 @@ setup_sprite_update_texture_8bpp_cache:
   mov fb_ptr_advance_column, #32 * 2;                                          \
   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
-  sub fb_ptr_advance_column, height, lsl #11 + 1;                              \
+  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
 
@@ -5400,7 +5414,7 @@ function(setup_sprite_16bpp)
   add texture_offset_base, u, u
   add width_rounded, width, #7
 
-  add texture_offset_base, v, lsl #11
+  add texture_offset_base, texture_offset_base, v, lsl #11
   mov left_mask_bits, #0xFF
   
   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5415,7 +5429,7 @@ function(setup_sprite_16bpp)
   and right_width, width_rounded, #0x7
   mvn left_mask_bits, left_mask_bits, lsl left_offset
 
-  add texture_mask, texture_mask_height, lsl #11
+  add texture_mask, texture_mask, texture_mask_height, lsl #11
   mov block_width, width_rounded, lsr #3
 
   mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5562,7 +5576,7 @@ function(setup_sprite_16bpp_4x)
   add texture_offset_base, u, u
   add width_rounded, width, #7
 
-  add texture_offset_base, v, lsl #11
+  add texture_offset_base, texture_offset_base, v, lsl #11
   movw left_mask_bits, #0xFFFF
   
   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5581,7 +5595,7 @@ function(setup_sprite_16bpp_4x)
 
   lsl right_width, #1
 
-  add texture_mask, texture_mask_height, lsl #11
+  add texture_mask, texture_mask, texture_mask_height, lsl #11
   mov block_width, width_rounded, lsr #3
 
   mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5687,6 +5701,147 @@ function(setup_sprite_16bpp_4x)
   ldmia sp!, { r4 - r11, pc }
 
 
+#undef width
+#undef right_width
+#undef right_mask_bits
+#undef color
+#undef height
+#undef blocks_remaining
+#undef colors
+#undef right_mask
+#undef test_mask
+#undef draw_mask
+/* Register aliases for setup_sprite_untextured. Several names share one physical register. */
+#define psx_gpu                                           r0
+#define x                                                 r1   /* same register as color_r */
+#define y                                                 r2   /* same register as color_g */
+#define width                                             r3   /* loaded from the stack in setup_sprite_untextured */
+#define right_width                                       r5   /* same register as block */
+#define right_mask_bits                                   r6   /* same register as blocks_remaining */
+#define fb_ptr                                            r7
+#define color                                             r8   /* same register as color_b */
+#define height                                            r9
+#define fb_ptr_pitch                                      r12  /* r12 is also used by name as scratch at function entry */
+
+// referenced by setup_sprites_16bpp_flush
+#define num_blocks                                        r4
+#define block                                             r5
+#define block_width                                       r11
+
+#define color_r                                           r1
+#define color_g                                           r2
+#define color_b                                           r8
+#define blocks_remaining                                  r6
+
+#define colors                                            q0
+#define right_mask                                        q1
+#define test_mask                                         q2   /* q2 is shared with draw_mask */
+#define draw_mask                                         q2
+#define draw_mask_bits_fb_ptr                             d6   /* lane 1 receives fb_ptr in setup_sprite_untextured */
+
+
+.align 3
+/* Emit one 64-byte block record per 8-pixel group, per row, for a sprite filled with a single colour. */
+function(setup_sprite_untextured)
+  ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
+  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
+    | RENDER_FLAGS_BLEND) /* NOTE(review): RENDER_FLAGS_* bits are tested on the render_state halfword - confirm intended */
+  ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ] /* only when none of the bits above are set */
+  tsteq r12, #RENDER_INTERLACE_ENABLED
+  beq setup_sprite_untextured_simple /* every tested bit clear: plain branch (no link, nothing pushed yet) to a routine defined elsewhere */
+/* General path. Pushing 9 registers (36 bytes) puts the caller's stack arguments at sp+36 and upward. */
+  stmdb sp!, { r4 - r11, r14 }
+
+  ldr width, [ sp, #40 ] /* second stack word; replaces the incoming r3 value */
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
+
+  ldr height, [ sp, #44 ] /* third stack word */
+  add fb_ptr, fb_ptr, y, lsl #11 /* + y * 2048 bytes */
+
+  add fb_ptr, fb_ptr, x, lsl #1 /* + x * 2 bytes */
+  sub right_width, width, #1
+
+  ldr color, [ sp, #48 ] /* fourth stack word */
+  and right_width, #7
+
+  add block_width, width, #7
+  add right_width, #1 /* right_width = ((width - 1) & 7) + 1, i.e. 1..8 */
+
+  lsr block_width, #3 /* block_width = (width + 7) / 8 */
+  mov right_mask_bits, #0xff
+
+  sub fb_ptr_pitch, block_width, #1
+  lsl right_mask_bits, right_width /* 0xff << right_width */
+
+  lsl fb_ptr_pitch, #3+1 /* (block_width - 1) * 16 bytes */
+  ubfx color_r, color, #3, #5 /* bits 3..7 of color; overwrites x (r1) */
+
+  rsb fb_ptr_pitch, #1024*2 /* 2048 - (block_width - 1) * 16: added to fb_ptr at the end of each row */
+  ubfx color_g, color, #11, #5 /* bits 11..15 of color; overwrites y (r2) */
+
+  vld1.u32 { test_mask }, [ psx_gpu, :128 ] /* first 16 bytes of the psx_gpu context; presumably per-lane bit masks - TODO confirm */
+  ubfx color_b, color, #19, #5 /* bits 19..23; color_b is r8, so this replaces the packed input */
+
+  vdup.u16 right_mask, right_mask_bits
+  orr color, color_r, color_b, lsl #10
+
+  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  orr color, color, color_g, lsl #5 /* color = r | (g << 5) | (b << 10) */
+
+  vtst.u16 right_mask, right_mask, test_mask /* lane = 0xffff where (right_mask_bits & test_mask lane) != 0 */
+  add block, psx_gpu, #psx_gpu_blocks_offset
+
+  vdup.u16 colors, color
+  add block, block, num_blocks, lsl #6 /* block = blocks base + num_blocks * 64 */
+
+/* Per-row loop. blocks_remaining shares r6 with right_mask_bits, which was last read by the vdup above. */
+setup_sprite_untextured_height_loop:
+  add num_blocks, block_width
+  sub blocks_remaining, block_width, #1 /* number of blocks before the right-edge block */
+
+  cmp num_blocks, #MAX_BLOCKS
+  blgt setup_sprites_16bpp_flush /* num_blocks > MAX_BLOCKS (signed): call the flush routine defined elsewhere */
+
+  cmp blocks_remaining, #0
+  ble 1f /* row is a single block: go straight to the right-edge block */
+/* NOTE(review): d6 is zeroed only on this fall-through path; when block_width == 1, lane 0 of d6 is never written in this function - confirm consumers ignore it. */
+  vmov.u8 draw_mask, #0 /* zero_mask */
+  vmov.u8 draw_mask_bits_fb_ptr, #0
+
+ 0:
+  vst1.u32 { draw_mask }, [ block, :128 ]! /* record +0: all-zero mask; block advances by 16 */
+  subs blocks_remaining, #1 /* sets the flags consumed by "bgt 0b" below */
+
+  vst1.u32 { colors }, [ block, :128 ] /* record +16 */
+  add block, block, #24 /* now at record +40 */
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ] /* record +40: d6, fb_ptr in the upper word */
+  
+  add block, block, #24 /* start of the next 64-byte record */
+  add fb_ptr, #8*2 /* advance 16 bytes */
+  bgt 0b /* none of the instructions since the subs change the flags */
+
+ 1:
+  vst1.u32 { right_mask }, [ block, :128 ]! /* right-edge block: same record layout, right_mask instead of zero */
+  subs height, #1 /* sets the flags consumed by the final bgt */
+
+  vst1.u32 { colors }, [ block, :128 ]
+  add block, block, #24
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
+  
+  add block, block, #24
+  add fb_ptr, fb_ptr_pitch /* advance by the pitch computed above (2048 - (block_width - 1) * 16) */
+
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] /* write the running count back every row */
+  bgt setup_sprite_untextured_height_loop /* loop while height > 0 after the decrement */
+
+  ldmia sp!, { r4 - r11, pc } /* restore r4-r11 and return by loading the saved r14 into pc */
+
+
+
 #undef texture_page_ptr
 #undef vram_ptr
 #undef dirty_textures_mask
@@ -5912,7 +6067,7 @@ function(scale2x_tiles8)
   mov r14, r2
   add r0, #1024*2*2
   add r4, #1024*2
-  sub r0, r2, lsl #4+1
+  sub r0, r0, r2, lsl #4+1
   mov r1, r4
   add r12, r0, #1024*2
   bgt 0b