gpu_neon: adjust some comments and things
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index 110c868..c62c1ba 100644 (file)
 .syntax unified
 .text
 
+#if 0 /* set to 1 to have the routines below preserve q4-q7 (d8-d15, callee-saved under AAPCS) */
+#define save_abi_regs() \
+  vpush {q4-q7}
+#define restore_abi_regs() \
+  vpop  {q4-q7}
+#else /* default: both macros expand to nothing, so q4-q7 are neither pushed nor popped */
+#define save_abi_regs()
+#define restore_abi_regs()
+#endif
+
 #define psx_gpu                                           r0
 #define v_a                                               r1
 #define v_b                                               r2
 
 .align 4
 
-#ifndef __MACH__
+#include "arm_features.h"
 
-#define function(name)                                                         \
-  .global name;                                                                \
-  .type name, %function;                                                       \
-  name:                                                                        \
+#define function(name) FUNCTION(name):
+
+#ifndef TEXRELS_FORBIDDEN
 
 #define JT_OP_REL(table_label, index_reg, temp)
 #define JT_OP(x...) x
 #define JTE(start, target) target
 
-#define EXTRA_UNSAVED_REGS
-
 #else
 
-#define function(name)                                                         \
-  .globl _##name;                                                              \
-  name:                                                                        \
-  _##name:                                                                     \
-
 #define JT_OP_REL(table_label, index_reg, temp)                                \
   adr temp, table_label;                                                       \
   ldr temp, [temp, index_reg, lsl #2];                                         \
 #define JT_OP(x...)
 #define JTE(start, target) (target - start)
 
-// r7 is preserved, but add it for EABI alignment..
-#define EXTRA_UNSAVED_REGS r7, r9,
+#endif
 
+#ifdef __MACH__
 #define flush_render_block_buffer _flush_render_block_buffer
 #define setup_sprite_untextured_simple _setup_sprite_untextured_simple
 #define update_texture_8bpp_cache _update_texture_8bpp_cache
-
 #endif
 
 @ r0: psx_gpu
@@ -242,6 +243,7 @@ function(compute_all_gradients)
   @ r12 = psx_gpu->triangle_area
   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
   stmdb sp!, { r4 - r11, lr }
+  save_abi_regs()
 
   @ load exponent of 62 into upper half of double
   movw r4, #0
@@ -367,8 +369,8 @@ function(compute_all_gradients)
   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
 
   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
-  vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
-
+  vdup.u32 r_shift, r14              @ r_shift = { shift, shift*, shift, shift* }
+                                     @ * - odd lanes: vshl.u64 takes its count from the low byte of each 64-bit lane, so hw ignores these
   vadd.u32 uvrg_base, uvrgb_phase
   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
 
@@ -457,6 +459,7 @@ function(compute_all_gradients)
 
   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
 
+  restore_abi_regs()
   ldmia sp!, { r4 - r11, pc }
 
 
@@ -587,6 +590,7 @@ function(compute_all_gradients)
 
 #define setup_spans_prologue()                                                 \
   stmdb sp!, { r4 - r11, lr };                                                 \
+  save_abi_regs();                                                             \
                                                                                \
   ldrsh x_a, [v_a, #8];                                                        \
   ldrsh x_b, [v_b, #8];                                                        \
@@ -983,6 +987,7 @@ function(compute_all_gradients)
 
 
 #define setup_spans_epilogue()                                                 \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 
@@ -1357,6 +1362,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -1586,6 +1592,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -1593,9 +1600,9 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1626,6 +1633,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
                                                                                \
   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
@@ -1783,6 +1791,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -1790,9 +1799,9 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1819,6 +1828,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   bxeq lr
 
   stmdb sp!, { r4 - r11, r14 }
+  save_abi_regs()
   vld1.u32 { test_mask }, [psx_gpu, :128]
 
   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
@@ -1901,6 +1911,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
   bne 0b
 
+  restore_abi_regs()
   ldmia sp!, { r4 - r11, pc }
                                                                            
  2:
@@ -2123,6 +2134,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -2315,6 +2327,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -2366,6 +2379,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -2586,6 +2600,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
                                                                                \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 setup_blocks_shaded_untextured_direct_builder(undithered)
@@ -3159,6 +3174,7 @@ function(texture_blocks_16bpp)
 .align 3;                                                                      \
                                                                                \
 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
+  save_abi_regs();                                                             \
   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
   stmdb sp!, { r4 - r5, lr };                                                  \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
@@ -3276,7 +3292,9 @@ function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
   shade_blocks_textured_modulated_store_pixels_##target();                     \
                                                                                \
-  ldmia sp!, { r4 - r5, pc }                                                   \
+  ldmia sp!, { r4 - r5, lr };                                                  \
+  restore_abi_regs();                                                          \
+  bx lr                                                                        \
 
 
 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
@@ -3341,7 +3359,8 @@ shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
 
 .align 3
 function(shade_blocks_textured_unmodulated_indirect)
-  str r14, [sp, #-4]
+  stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3384,13 +3403,15 @@ function(shade_blocks_textured_unmodulated_indirect)
   vorr.u16 draw_mask_combined, draw_mask, zero_mask
   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
 
-  ldr pc, [sp, #-4]
+  restore_abi_regs()
+  ldmia sp!, { r4, pc }
 
 
 .align 3
 
 function(shade_blocks_textured_unmodulated_direct)
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3452,6 +3473,7 @@ function(shade_blocks_textured_unmodulated_direct)
 
   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
  4:
@@ -3471,6 +3493,7 @@ function(shade_blocks_unshaded_untextured_indirect)
 
 function(shade_blocks_unshaded_untextured_direct)
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3517,6 +3540,7 @@ function(shade_blocks_unshaded_untextured_direct)
   vbif.u16 fb_pixels_next, pixels, draw_mask
   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
  4:
@@ -3622,6 +3646,7 @@ function(shade_blocks_unshaded_untextured_direct)
                                                                                \
 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3703,6 +3728,7 @@ function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
   vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3741,6 +3767,7 @@ blend_blocks_average_builder(untextured, on)
                                                                                \
 function(blend_blocks_textured_add_##mask_evaluate)                            \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3826,6 +3853,7 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3845,6 +3873,7 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
                                                                                \
 function(blend_blocks_untextured_add_##mask_evaluate)                          \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3920,6 +3949,7 @@ function(blend_blocks_untextured_add_##mask_evaluate)                          \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3944,7 +3974,7 @@ blend_blocks_add_untextured_builder(on)
 #define blend_blocks_subtract_combine_textured()                               \
   vbif.u16 blend_pixels, pixels, blend_mask                                    \
 
-#define blend_blocks_subtract_set_stb_textured()                               \
+#define blend_blocks_subtract_set_stp_textured()                               \
   vorr.u16 blend_pixels, #0x8000                                               \
 
 #define blend_blocks_subtract_msb_mask_textured()                              \
@@ -3954,7 +3984,7 @@ blend_blocks_add_untextured_builder(on)
 
 #define blend_blocks_subtract_combine_untextured()                             \
 
-#define blend_blocks_subtract_set_stb_untextured()                             \
+#define blend_blocks_subtract_set_stp_untextured()                             \
   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
 
 #define blend_blocks_subtract_msb_mask_untextured()                            \
@@ -3977,6 +4007,7 @@ blend_blocks_add_untextured_builder(on)
                                                                                \
 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4019,7 +4050,7 @@ function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
-  blend_blocks_subtract_set_stb_##texturing();                                 \
+  blend_blocks_subtract_set_stp_##texturing();                                 \
   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
   blend_blocks_subtract_combine_##texturing();                                 \
   blend_blocks_subtract_set_blend_mask_##texturing();                          \
@@ -4047,11 +4078,12 @@ function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
                                                                                \
   blend_blocks_subtract_msb_mask_##texturing();                                \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
-  blend_blocks_subtract_set_stb_##texturing();                                 \
+  blend_blocks_subtract_set_stp_##texturing();                                 \
   blend_blocks_subtract_combine_##texturing();                                 \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4076,6 +4108,7 @@ blend_blocks_subtract_builder(untextured, on)
                                                                                \
 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4119,6 +4152,7 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
                                                                                \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
+  vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
                                                                                \
   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
@@ -4154,11 +4188,13 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
                                                                                \
  1:                                                                            \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
-  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
+  vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
+  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4178,6 +4214,7 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
                                                                                \
 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4257,6 +4294,7 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4282,6 +4320,7 @@ blend_blocks_add_fourth_untextured_builder(on)
 
 function(blend_blocks_textured_unblended_on)         
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
 
@@ -4321,6 +4360,7 @@ function(blend_blocks_textured_unblended_on)
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [fb_ptr]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
 
@@ -4790,6 +4830,7 @@ setup_sprite_update_texture_8bpp_cache:
   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 #define setup_sprite_tiled_advance_column()                                    \
@@ -4826,6 +4867,7 @@ setup_sprite_update_texture_8bpp_cache:
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 
@@ -5184,6 +5226,8 @@ function(setup_sprite_##texture_mode##x4mode)                                  \
   ldr height, [sp, #44];                                                       \
   add fb_ptr, fb_ptr, y, lsl #11;                                              \
                                                                                \
+  save_abi_regs();                                                             \
+                                                                               \
   add fb_ptr, fb_ptr, x, lsl #1;                                               \
   and offset_v, v, #0xF;                                                       \
                                                                                \
@@ -5907,7 +5951,7 @@ setup_sprite_untextured_height_loop:
 #define texel_block_expanded_b                            q2
 #define texel_block_expanded_ab                           q2
 #define texel_block_expanded_c                            q3
-#define texel_block_expanded_d                            q4
+#define texel_block_expanded_d                            q0
 #define texel_block_expanded_cd                           q3
 
 function(update_texture_4bpp_cache)