cdrom: change pause timing again
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index 110c868..ffbea04 100644 (file)
  * General Public License for more details.
  */
 
-#define MAX_SPANS                                         512
-#define MAX_BLOCKS                                        64
-#define MAX_BLOCKS_PER_ROW                                128
-
-#define RENDER_STATE_MASK_EVALUATE                        0x20
-#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
-#define RENDER_FLAGS_BLEND                                0x2
 #define RENDER_INTERLACE_ENABLED                          0x1
 
+#include "psx_gpu.h"
 #include "psx_gpu_offsets.h"
 
 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
 .syntax unified
 .text
 
+#if 0 /* set to 1 to preserve NEON q4-q7 (d8-d15, callee-saved under AAPCS) */
+#define save_abi_regs() \
+  vpush {q4-q7}
+#define restore_abi_regs() \
+  vpop  {q4-q7}
+#else /* default: both hooks expand to nothing, so they save/restore no registers */
+#define save_abi_regs()
+#define restore_abi_regs()
+#endif /* save_abi_regs / restore_abi_regs */
+
 #define psx_gpu                                           r0
 #define v_a                                               r1
 #define v_b                                               r2
 
 .align 4
 
-#ifndef __MACH__
+#include "arm_features.h"
+
+#define function(name) FUNCTION(name): /* label via FUNCTION(), presumably from arm_features.h - confirm */
 
-#define function(name)                                                         \
-  .global name;                                                                \
-  .type name, %function;                                                       \
-  name:                                                                        \
+#ifndef TEXRELS_FORBIDDEN /* jump tables may hold absolute label addresses */
 
 #define JT_OP_REL(table_label, index_reg, temp) /* expands to nothing in this variant */
 #define JT_OP(x...) x /* emits its argument unchanged */
 #define JTE(start, target) target /* table entry is the target label itself */
 
-#define EXTRA_UNSAVED_REGS
-
 #else
 
-#define function(name)                                                         \
-  .globl _##name;                                                              \
-  name:                                                                        \
-  _##name:                                                                     \
-
 #define JT_OP_REL(table_label, index_reg, temp)                                \
   adr temp, table_label;                                                       \
   ldr temp, [temp, index_reg, lsl #2];                                         \
 #define JT_OP(x...)
 #define JTE(start, target) (target - start)
 
-// r7 is preserved, but add it for EABI alignment..
-#define EXTRA_UNSAVED_REGS r7, r9,
+#endif
 
+#ifdef __MACH__ /* map the symbols referenced below to their underscore-prefixed names */
 #define flush_render_block_buffer _flush_render_block_buffer
-#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
 #define update_texture_8bpp_cache _update_texture_8bpp_cache
-
 #endif /* __MACH__ */
 
 @ r0: psx_gpu
@@ -242,6 +236,7 @@ function(compute_all_gradients)
   @ r12 = psx_gpu->triangle_area
   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
   stmdb sp!, { r4 - r11, lr }
+  save_abi_regs()
 
   @ load exponent of 62 into upper half of double
   movw r4, #0
@@ -367,8 +362,8 @@ function(compute_all_gradients)
   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
 
   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
-  vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
-
+  vdup.u32 r_shift, r14              @ r_shift = { shift, shift*, shift, shift* }
+                                     @ * - vshl.u64: ignored by hw
   vadd.u32 uvrg_base, uvrgb_phase
   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
 
@@ -457,6 +452,7 @@ function(compute_all_gradients)
 
   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
 
+  restore_abi_regs()
   ldmia sp!, { r4 - r11, pc }
 
 
@@ -562,6 +558,8 @@ function(compute_all_gradients)
 #define left_x_32_low                            d22
 #define left_x_32_high                           d23
 
+#define tmp_max_blocks                           d20 /* MAX_BLOCKS_PER_ROW per u16 lane; vmin clamp for left_right_x_16_high */
+
 #define edges_xy                                 q0
 #define edges_dx_dy                              d2
 #define edge_shifts                              d3
@@ -587,6 +585,7 @@ function(compute_all_gradients)
 
 #define setup_spans_prologue()                                                 \
   stmdb sp!, { r4 - r11, lr };                                                 \
+  save_abi_regs();                                                             \
                                                                                \
   ldrsh x_a, [v_a, #8];                                                        \
   ldrsh x_b, [v_b, #8];                                                        \
@@ -815,8 +814,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -863,8 +864,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
-  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
+  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -904,7 +907,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   add temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   add y_a, temp, #2;                                                           \
   add y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -959,7 +964,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   sub temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   sub y_a, temp, #2;                                                           \
   sub y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -983,6 +990,7 @@ function(compute_all_gradients)
 
 
 #define setup_spans_epilogue()    /* exit path; mirrors setup_spans_prologue */ \
+  restore_abi_regs();  /* pairs with save_abi_regs() in the prologue */        \
   ldmia sp!, { r4 - r11, pc }  /* pop r4-r11 and return (pc <- saved lr) */    \
 
 
@@ -1357,6 +1365,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -1586,6 +1595,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -1593,9 +1603,9 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1626,6 +1636,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
                                                                                \
   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
@@ -1783,6 +1794,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -1790,9 +1802,9 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1819,6 +1831,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   bxeq lr
 
   stmdb sp!, { r4 - r11, r14 }
+  save_abi_regs()
   vld1.u32 { test_mask }, [psx_gpu, :128]
 
   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
@@ -1901,6 +1914,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
   bne 0b
 
+  restore_abi_regs()
   ldmia sp!, { r4 - r11, pc }
                                                                            
  2:
@@ -2123,6 +2137,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -2315,6 +2330,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
@@ -2366,6 +2382,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
   bxeq lr;                                                                     \
                                                                                \
   stmdb sp!, { r4 - r11, r14 };                                                \
+  save_abi_regs();                                                             \
   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
                                                                                \
   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
@@ -2586,6 +2603,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
                                                                                \
   bne 0b;                                                                      \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 setup_blocks_shaded_untextured_direct_builder(undithered)
@@ -3159,6 +3177,7 @@ function(texture_blocks_16bpp)
 .align 3;                                                                      \
                                                                                \
 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
+  save_abi_regs();                                                             \
   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
   stmdb sp!, { r4 - r5, lr };                                                  \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
@@ -3276,7 +3295,9 @@ function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
   shade_blocks_textured_modulated_store_pixels_##target();                     \
                                                                                \
-  ldmia sp!, { r4 - r5, pc }                                                   \
+  ldmia sp!, { r4 - r5, lr };                                                  \
+  restore_abi_regs();                                                          \
+  bx lr                                                                        \
 
 
 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
@@ -3341,7 +3362,8 @@ shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
 
 .align 3
 function(shade_blocks_textured_unmodulated_indirect)
-  str r14, [sp, #-4]
+  stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3384,13 +3406,15 @@ function(shade_blocks_textured_unmodulated_indirect)
   vorr.u16 draw_mask_combined, draw_mask, zero_mask
   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
 
-  ldr pc, [sp, #-4]
+  restore_abi_regs()
+  ldmia sp!, { r4, pc }
 
 
 .align 3
 
 function(shade_blocks_textured_unmodulated_direct)
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3452,6 +3476,7 @@ function(shade_blocks_textured_unmodulated_direct)
 
   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
  4:
@@ -3471,6 +3496,7 @@ function(shade_blocks_unshaded_untextured_indirect)
 
 function(shade_blocks_unshaded_untextured_direct)
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -3517,6 +3543,7 @@ function(shade_blocks_unshaded_untextured_direct)
   vbif.u16 fb_pixels_next, pixels, draw_mask
   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
  4:
@@ -3622,6 +3649,7 @@ function(shade_blocks_unshaded_untextured_direct)
                                                                                \
 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3703,6 +3731,7 @@ function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
   vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3741,6 +3770,7 @@ blend_blocks_average_builder(untextured, on)
                                                                                \
 function(blend_blocks_textured_add_##mask_evaluate)                            \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3826,6 +3856,7 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3845,6 +3876,7 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
                                                                                \
 function(blend_blocks_untextured_add_##mask_evaluate)                          \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -3920,6 +3952,7 @@ function(blend_blocks_untextured_add_##mask_evaluate)                          \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -3944,7 +3977,7 @@ blend_blocks_add_untextured_builder(on)
 #define blend_blocks_subtract_combine_textured()                               \
   vbif.u16 blend_pixels, pixels, blend_mask                                    \
 
-#define blend_blocks_subtract_set_stb_textured()                               \
+#define blend_blocks_subtract_set_stp_textured()                               \
   vorr.u16 blend_pixels, #0x8000  /* set bit 15 in every 16-bit lane */        \
 
 #define blend_blocks_subtract_msb_mask_textured()                              \
@@ -3954,7 +3987,7 @@ blend_blocks_add_untextured_builder(on)
 
 #define blend_blocks_subtract_combine_untextured()                             \
 
-#define blend_blocks_subtract_set_stb_untextured()                             \
+#define blend_blocks_subtract_set_stp_untextured()                             \
   vorr.u16 blend_pixels, blend_pixels, msb_mask  /* OR msb_mask into result */ \
 
 #define blend_blocks_subtract_msb_mask_untextured()                            \
@@ -3977,6 +4010,7 @@ blend_blocks_add_untextured_builder(on)
                                                                                \
 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4019,7 +4053,7 @@ function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
-  blend_blocks_subtract_set_stb_##texturing();                                 \
+  blend_blocks_subtract_set_stp_##texturing();                                 \
   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
   blend_blocks_subtract_combine_##texturing();                                 \
   blend_blocks_subtract_set_blend_mask_##texturing();                          \
@@ -4047,11 +4081,12 @@ function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
                                                                                \
   blend_blocks_subtract_msb_mask_##texturing();                                \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
-  blend_blocks_subtract_set_stb_##texturing();                                 \
+  blend_blocks_subtract_set_stp_##texturing();                                 \
   blend_blocks_subtract_combine_##texturing();                                 \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4076,6 +4111,7 @@ blend_blocks_subtract_builder(untextured, on)
                                                                                \
 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4119,6 +4155,7 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
                                                                                \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
+  vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
                                                                                \
   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
@@ -4154,11 +4191,13 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
                                                                                \
  1:                                                                            \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
-  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
+  vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
+  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4178,6 +4217,7 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
                                                                                \
 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
   stmdb sp!, { r4, r14 };                                                      \
+  save_abi_regs();                                                             \
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                                \
@@ -4257,6 +4297,7 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                                \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4, pc };                                                       \
                                                                                \
  2:                                                                            \
@@ -4282,6 +4323,7 @@ blend_blocks_add_fourth_untextured_builder(on)
 
 function(blend_blocks_textured_unblended_on)         
   stmdb sp!, { r4, r14 }
+  save_abi_regs()
   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
 
@@ -4321,6 +4363,7 @@ function(blend_blocks_textured_unblended_on)
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [fb_ptr]
 
+  restore_abi_regs()
   ldmia sp!, { r4, pc }
 
 
@@ -4790,6 +4833,7 @@ setup_sprite_update_texture_8bpp_cache:
   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 #define setup_sprite_tiled_advance_column()                                    \
@@ -4826,6 +4870,7 @@ setup_sprite_update_texture_8bpp_cache:
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
+  restore_abi_regs();                                                          \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 
@@ -5184,6 +5229,8 @@ function(setup_sprite_##texture_mode##x4mode)                                  \
   ldr height, [sp, #44];                                                       \
   add fb_ptr, fb_ptr, y, lsl #11;                                              \
                                                                                \
+  save_abi_regs();                                                             \
+                                                                               \
   add fb_ptr, fb_ptr, x, lsl #1;                                               \
   and offset_v, v, #0xF;                                                       \
                                                                                \
@@ -5782,14 +5829,7 @@ function(setup_sprite_16bpp_4x)
 
 .align 3
 
-function(setup_sprite_untextured)
-  ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
-  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
-    | RENDER_FLAGS_BLEND)
-  ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
-  tsteq r12, #RENDER_INTERLACE_ENABLED
-  beq setup_sprite_untextured_simple
-
+function(setup_sprite_untextured_512)
   stmdb sp!, { r4 - r11, r14 }
 
   ldr width, [sp, #40]
@@ -5907,7 +5947,7 @@ setup_sprite_untextured_height_loop:
 #define texel_block_expanded_b                            q2
 #define texel_block_expanded_ab                           q2
 #define texel_block_expanded_c                            q3
-#define texel_block_expanded_d                            q4
+#define texel_block_expanded_d                            q0 /* was q4; presumably moved off callee-saved q4-q7 - TODO confirm q0 is not live here */
 #define texel_block_expanded_cd                           q3
 
 function(update_texture_4bpp_cache)
@@ -6089,6 +6129,7 @@ function(scale2x_tiles8)
   mov r14, r2
 
 0:
+  pld [r1, #1024*2]
   vld1.u16 { q0 }, [r1, :128]!
   vld1.u16 { q2 }, [r1, :128]!
   vmov q1, q0