frontend: update libpicofe, fix missed callbacks
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index c62c1ba..d187fce 100644 (file)
  * General Public License for more details.
  */
 
-#define MAX_SPANS                                         512
-#define MAX_BLOCKS                                        64
-#define MAX_BLOCKS_PER_ROW                                128
-
-#define RENDER_STATE_MASK_EVALUATE                        0x20
-#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
-#define RENDER_FLAGS_BLEND                                0x2
 #define RENDER_INTERLACE_ENABLED                          0x1
 
+#include "psx_gpu.h"
 #include "psx_gpu_offsets.h"
 
 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
 
 #ifdef __MACH__
 #define flush_render_block_buffer _flush_render_block_buffer
-#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
 #define update_texture_8bpp_cache _update_texture_8bpp_cache
+#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack
 #endif
 
 @ r0: psx_gpu
@@ -550,6 +544,7 @@ function(compute_all_gradients)
 
 #define uvrg                                     q14
 #define uvrg_dy                                  q15
+#define uv                                       d28
 
 #define alternate_x_16                           d4
 
@@ -565,6 +560,8 @@ function(compute_all_gradients)
 #define left_x_32_low                            d22
 #define left_x_32_high                           d23
 
+#define tmp_max_blocks                           d20
+
 #define edges_xy                                 q0
 #define edges_dx_dy                              d2
 #define edge_shifts                              d3
@@ -819,8 +816,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -867,8 +866,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
-  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
+  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -908,7 +909,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   add temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   add y_a, temp, #2;                                                           \
   add y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -924,6 +927,14 @@ function(compute_all_gradients)
   subs height, height, #4;                                                     \
   bhi 2b;                                                                      \
                                                                                \
+  nop;                                                                         \
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
+  tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);                      \
+  beq 1f;                                                                      \
+  add temp, span_uvrg_offset, height, lsl #4;                                  \
+  vldr uv, [temp, #(-16*2)];                                                   \
+  vstr uv, [temp, #(-16)];                                                     \
+                                                                               \
  1:                                                                            \
 
 
@@ -963,7 +974,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   sub temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   sub y_a, temp, #2;                                                           \
   sub y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -983,6 +996,14 @@ function(compute_all_gradients)
   subs height, height, #4;                                                     \
   bhi 2b;                                                                      \
                                                                                \
+  nop;                                                                         \
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
+  tst temp, #AHACK_TEXTURE_ADJ_V;                                              \
+  beq 1f;                                                                      \
+  add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset;                         \
+  vldr uv, [temp, #16];                                                        \
+  vstr uv, [temp, #0];                                                         \
+                                                                               \
  1:                                                                            \
 
 
@@ -1213,6 +1234,14 @@ function(setup_spans_up_down)
   subs height_minor_b, height_minor_b, #4
   bhi 2b
 
+  nop
+  ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]
+  tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)
+  beq 1f
+  add temp, span_uvrg_offset, height, lsl #4
+  vldr uv, [temp, #(-16*2)]
+  vstr uv, [temp, #(-16)]
+
  1:
   setup_spans_epilogue()
 
@@ -1253,6 +1282,7 @@ function(setup_spans_up_down)
 
 #define uvrg_dx_ptr                              r2
 #define texture_mask_ptr                         r3
+#define hacks_active                             r6
 #define dither_shift                             r8
 #define dither_row                               r10
 
@@ -1270,6 +1300,7 @@ function(setup_spans_up_down)
 #define color_b                                  r5
 
 #undef uvrg
+#undef uv
 
 #define u_block                                  q0
 #define v_block                                  q1
@@ -1347,6 +1378,26 @@ function(setup_spans_up_down)
 
 #define setup_blocks_texture_unswizzled()                                      \
 
+#define setup_blocks_uv_adj_hack_textured(hacks_active)                        \
+  tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);              \
+  beq 91f; /* neither hack bit set: skip the call */                           \
+                                                                               \
+  /* pushing odd num of regs here realigns our unaligned stack */              \
+  vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
+  push { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 };                               \
+  mov r12, span_uvrg_offset; /* staged in r12; becomes arg r3 below */         \
+  sub r1, block_ptr_a, #64; /* arg r1 = block_ptr_a - 64 */                    \
+  mov r2, span_edge_data; /* arg r2 = span_edge_data */                        \
+  mov r3, r12; /* arg r3 = span_uvrg_offset */                                 \
+  bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */                                \
+  pop  { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 };                               \
+  vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
+                                                                               \
+  vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; /* dx8 = 2 * dx4, rebuilt */          \
+91:                                                                            \
+
 
 #define setup_blocks_shaded_textured_builder(swizzling)                        \
 .align 3;                                                                      \
@@ -1572,6 +1623,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
   vdup.u8 draw_mask, right_mask;                                               \
                                                                                \
+  ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
   vzip.u8 u_whole_8, v_whole_8;                                                \
@@ -1582,6 +1634,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
                                                                                \
+  setup_blocks_uv_adj_hack_textured(hacks_active);                             \
+                                                                               \
  1:                                                                            \
   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
   add span_b_offset, span_b_offset, #4;                                        \
@@ -1596,16 +1650,14 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
-  vpush { texture_mask };                                                      \
-  vpush { uvrg_dx4 };                                                          \
-                                                                               \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
+  vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
+  /* pushing odd num of regs here realigns our unaligned stack */              \
+  push  { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */              \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
-                                                                               \
-  vpop { uvrg_dx4 };                                                           \
-  vpop { texture_mask };                                                       \
+  pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
+  vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
                                                                                \
   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
   vmov.u8 fb_mask_ptrs, #0;                                                    \
@@ -1773,6 +1825,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
   vdup.u8 draw_mask, right_mask;                                               \
                                                                                \
+  ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
   vzip.u8 u_whole_8, v_whole_8;                                                \
@@ -1783,6 +1836,8 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
                                                                                \
+  setup_blocks_uv_adj_hack_textured(hacks_active);                             \
+                                                                               \
  1:                                                                            \
   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
   add span_edge_data, span_edge_data, #8;                                      \
@@ -1795,16 +1850,13 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   ldmia sp!, { r4 - r11, pc };                                                 \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
-  vpush { texture_mask };                                                      \
-  vpush { uvrg_dx4 };                                                          \
-                                                                               \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
+  vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
+  push  { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */              \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
-                                                                               \
-  vpop { uvrg_dx4 };                                                           \
-  vpop { texture_mask };                                                       \
+  pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
+  vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
+  vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
                                                                                \
   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
   vmov.u8 fb_mask_ptrs, #0;                                                    \
@@ -1915,13 +1967,13 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   ldmia sp!, { r4 - r11, pc }
                                                                            
  2:
-  vpush { colors }
-
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
+  vstr d4, [r0, #psx_gpu_saved_tmp_offset]       /* colors */
+  vstr d5, [r0, #psx_gpu_saved_tmp_offset + 8]
+  push { r0 - r3, EXTRA_UNSAVED_REGS r12 }
   bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
-
-  vpop { colors }
+  pop  { r0 - r3, EXTRA_UNSAVED_REGS r12 }
+  vldr d4, [r0, #psx_gpu_saved_tmp_offset]
+  vldr d5, [r0, #psx_gpu_saved_tmp_offset + 8]
 
   vld1.u32 { test_mask }, [psx_gpu, :128]
   veor.u32 draw_mask, draw_mask, draw_mask
@@ -2328,17 +2380,14 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   bne 0b;                                                                      \
                                                                                \
   restore_abi_regs();                                                          \
-  ldmia sp!, { r4 - r11, pc };                                                 \
+  pop   { r4 - r11, pc };                                                      \
                                                                                \
  2:                                                                            \
-  /* TODO: Load from psx_gpu instead of saving/restoring these               */\
-  vpush { rg_dx4 };                                                            \
-                                                                               \
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
+  vstr rg_dx4, [r0, #psx_gpu_saved_tmp_offset];                                \
+  push  { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
-                                                                               \
-  vpop { rg_dx4 };                                                             \
+  pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
+  vldr rg_dx4, [r0, #psx_gpu_saved_tmp_offset];                                \
                                                                                \
   vmov.u8 d64_1, #1;                                                           \
   vmov.u8 d128_4, #4;                                                          \
@@ -2748,7 +2797,7 @@ function(texture_blocks_4bpp)
 .align 3
 
 function(texture_blocks_8bpp)
-  stmdb sp!, { r3 - r11, r14 }
+  push { r4 - r11, lr }
   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
 
   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
@@ -2826,15 +2875,14 @@ function(texture_blocks_8bpp)
   add block_ptr, block_ptr, #64
   bne 0b
 
-  ldmia sp!, { r3 - r11, pc }
+  pop { r4 - r11, pc }
 
 1:
-  stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
-
-  bl update_texture_8bpp_cache
-
-  ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
-  bal 0b
+  /* pushing odd num of regs here realigns our unaligned stack */
+  push { r1 - r2, EXTRA_UNSAVED_REGS r12 }
+  bl   update_texture_8bpp_cache
+  pop  { r1 - r2, EXTRA_UNSAVED_REGS r12 }
+  bal  0b
 
 
 #undef uv_0
@@ -4383,51 +4431,6 @@ function(warmup)
 
 #undef vram_ptr
 #undef color
-#undef width
-#undef height
-#undef pitch
-
-#define vram_ptr                                          r0
-#define color                                             r1
-#define width                                             r2
-#define height                                            r3
-
-#define pitch                                             r1
-
-#define num_width                                         r12
-
-#undef colors_a
-#undef colors_b
-
-#define colors_a                                          q0
-#define colors_b                                          q1
-
-.align 3
-
-function(render_block_fill_body)
-  vdup.u16 colors_a, color
-  mov pitch, #2048
-
-  vmov colors_b, colors_a
-  sub pitch, pitch, width, lsl #1
-
-  mov num_width, width
-
- 0:  
-  vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
-
-  subs num_width, num_width, #16
-  bne 0b
-
-  add vram_ptr, vram_ptr, pitch
-  mov num_width, width
-
-  subs height, height, #1
-  bne 0b
-
-  bx lr
-
 #undef x
 #undef y
 #undef width
@@ -4520,30 +4523,30 @@ function(render_block_fill_body)
 #define texels_wide_high                                  d15
 #define texels_wide                                       q7
 
+.align 3
 
 setup_sprite_flush_blocks:
-  vpush { q1 - q5 }
+  push   { r0 - r3, EXTRA_UNSAVED_REGS r12, lr }     /* preserved across the call */
+  add    block, r0, #psx_gpu_saved_tmp_offset        /* r5 */
+  vstmia block, { q1 - q3 }                          /* spill q1-q3 to saved_tmp */
+  bl     flush_render_block_buffer                   /* block (r5) must survive this call */
+  vldmia block, { q1 - q3 }                          /* reload q1-q3 from saved_tmp */
+  pop    { r0 - r3, EXTRA_UNSAVED_REGS r12, lr }
 
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
-  bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
-
-  vpop { q1 - q5 }
-
-  add block, psx_gpu, #psx_gpu_blocks_offset
-  bx lr
+  add    block, psx_gpu, #psx_gpu_blocks_offset      /* block = psx_gpu + blocks offset */
+  bx     lr
 
 
 setup_sprite_update_texture_4bpp_cache:
-  stmdb sp!, { r0 - r3, r14 }
+  push { r0 - r4, lr }  /* NOTE(review): r4 looks like padding for an even push count - confirm */
   bl update_texture_4bpp_cache
-  ldmia sp!, { r0 - r3, pc }
+  pop  { r0 - r4, pc }  /* restore the saved regs and return via pc */
 
 
 setup_sprite_update_texture_8bpp_cache:
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
+  push { r0 - r4, EXTRA_UNSAVED_REGS lr }  /* NOTE(review): r4 newly saved; reason not visible in this hunk - confirm */
   bl update_texture_8bpp_cache
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
+  pop  { r0 - r4, EXTRA_UNSAVED_REGS pc }  /* restore the saved regs and return via pc */
 
 
 #define setup_sprite_tiled_initialize_4bpp()                                   \
@@ -4830,8 +4833,8 @@ setup_sprite_update_texture_8bpp_cache:
   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
-  restore_abi_regs();                                                          \
-  ldmia sp!, { r4 - r11, pc }                                                  \
+  vpop { q4 - q7 };                                                            \
+  pop  { r3 - r11, pc }                                                        \
 
 #define setup_sprite_tiled_advance_column()                                    \
   add texture_offset_base, texture_offset_base, #0x100;                        \
@@ -4867,8 +4870,8 @@ setup_sprite_update_texture_8bpp_cache:
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
-  restore_abi_regs();                                                          \
-  ldmia sp!, { r4 - r11, pc }                                                  \
+  vpop { q4 - q7 };                                                            \
+  pop  { r3 - r11, pc }                                                        \
 
 
 #define setup_sprite_offset_u_adjust()                                         \
@@ -5214,19 +5217,19 @@ setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
 .align 4;                                                                      \
                                                                                \
 function(setup_sprite_##texture_mode##x4mode)                                  \
-  stmdb sp!, { r4 - r11, r14 };                                                \
+  push { r3 - r11, lr };                                                       \
   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
                                                                                \
-  ldr v, [sp, #36];                                                            \
+  ldr v, [sp, #4*(10+0)];                                                      \
   and offset_u, u, #0xF;                                                       \
                                                                                \
-  ldr width, [sp, #40];                                                        \
+  ldr width, [sp, #4*(10+1)];                                                  \
   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
                                                                                \
-  ldr height, [sp, #44];                                                       \
+  ldr height, [sp, #4*(10+2)];                                                 \
   add fb_ptr, fb_ptr, y, lsl #11;                                              \
                                                                                \
-  save_abi_regs();                                                             \
+  vpush { q4 - q7 };                                                           \
                                                                                \
   add fb_ptr, fb_ptr, x, lsl #1;                                               \
   and offset_v, v, #0xF;                                                       \
@@ -5350,7 +5353,7 @@ setup_sprite_tiled_builder(8bpp, _4x);
 #define texels_67                                         r9
 
 function(texture_sprite_blocks_8bpp)
-  stmdb sp!, { r4 - r11, r14 }
+  push { r4 - r11, r14 }
   movw texel_shift_mask, #(0xFF << 1)
 
   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
@@ -5403,8 +5406,9 @@ function(texture_sprite_blocks_8bpp)
   add block_ptr, block_ptr, #64
 
   bne 0b
+  nop
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r4 - r11, pc }
 
 
 #undef width_rounded
@@ -5469,30 +5473,30 @@ function(texture_sprite_blocks_8bpp)
 
 
 setup_sprites_16bpp_flush:
-  vpush { d0 - d3 }
-
-  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
-  bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
-
-  vpop { d0 - d3 }
+  push   { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
+  add    r1, r0, #psx_gpu_saved_tmp_offset   /* r1 is scratch: its value is on the stack */
+  vstmia r1, { d0 - d3 }                     /* spill d0-d3 to saved_tmp */
+  bl     flush_render_block_buffer
+  pop    { r0 - r3, EXTRA_UNSAVED_REGS r12 } /* saved r14 stays on the stack */
+  add    lr, r0, #psx_gpu_saved_tmp_offset   /* lr reused as a scratch pointer */
+  vldmia lr, { d0 - d3 }                     /* reload d0-d3 from saved_tmp */
 
   add block, psx_gpu, #psx_gpu_blocks_offset
   mov num_blocks, block_width
 
-  bx lr
+  pop { pc }                                 /* return through the r14 saved above */
 
 function(setup_sprite_16bpp)
-  stmdb sp!, { r4 - r11, r14 }
+  push { r3 - r11, lr }
   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
 
-  ldr v, [sp, #36]
+  ldr v, [sp, #4*(10+0)]
   add fb_ptr, fb_ptr, y, lsl #11
 
-  ldr width, [sp, #40]
+  ldr width, [sp, #4*(10+1)]
   add fb_ptr, fb_ptr, x, lsl #1
 
-  ldr height, [sp, #44]
+  ldr height, [sp, #4*(10+2)]
   and left_offset, u, #0x7
 
   add texture_offset_base, u, u
@@ -5562,7 +5566,7 @@ function(setup_sprite_16bpp)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
   bne 1b
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r3 - r11, pc }
 
  0:
   add num_blocks, num_blocks, block_width
@@ -5636,8 +5640,9 @@ function(setup_sprite_16bpp)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
 
   bne 0b
+  nop
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r3 - r11, pc }
 
 
 // 4x version
@@ -5645,16 +5650,16 @@ function(setup_sprite_16bpp)
 #undef draw_mask_fb_ptr
 
 function(setup_sprite_16bpp_4x)
-  stmdb sp!, { r4 - r11, r14 }
+  push { r3 - r11, lr }
   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
 
-  ldr v, [sp, #36]
+  ldr v, [sp, #4*(10+0)]
   add fb_ptr, fb_ptr, y, lsl #11
 
-  ldr width, [sp, #40]
+  ldr width, [sp, #4*(10+1)]
   add fb_ptr, fb_ptr, x, lsl #1
 
-  ldr height, [sp, #44]
+  ldr height, [sp, #4*(10+2)]
   and left_offset, u, #0x7
 
   add texture_offset_base, u, u
@@ -5723,7 +5728,7 @@ function(setup_sprite_16bpp_4x)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
   bne 1b
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r3 - r11, pc }
 
  0:
   add num_blocks, num_blocks, block_width
@@ -5781,8 +5786,9 @@ function(setup_sprite_16bpp_4x)
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
 
   bne 0b
+  nop
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r3 - r11, pc }
 
 
 #undef width
@@ -5826,26 +5832,19 @@ function(setup_sprite_16bpp_4x)
 
 .align 3
 
-function(setup_sprite_untextured)
-  ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
-  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
-    | RENDER_FLAGS_BLEND)
-  ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
-  tsteq r12, #RENDER_INTERLACE_ENABLED
-  beq setup_sprite_untextured_simple
-
-  stmdb sp!, { r4 - r11, r14 }
+function(setup_sprite_untextured_512)
+  push { r4 - r11, r14 }
 
-  ldr width, [sp, #40]
+  ldr width, [sp, #4*(9+1)]
   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
 
-  ldr height, [sp, #44]
+  ldr height, [sp, #4*(9+2)]
   add fb_ptr, fb_ptr, y, lsl #11
 
   add fb_ptr, fb_ptr, x, lsl #1
   sub right_width, width, #1
 
-  ldr color, [sp, #48]
+  ldr color, [sp, #4*(9+3)]
   and right_width, #7
 
   add block_width, width, #7
@@ -5922,7 +5921,7 @@ setup_sprite_untextured_height_loop:
   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
   bgt setup_sprite_untextured_height_loop
 
-  ldmia sp!, { r4 - r11, pc }
+  pop { r4 - r11, pc }
 
 
 
@@ -5955,7 +5954,7 @@ setup_sprite_untextured_height_loop:
 #define texel_block_expanded_cd                           q3
 
 function(update_texture_4bpp_cache)
-  stmdb sp!, { r4 - r11, r14 }
+  push  { r3 - r11, r14 }
   vpush { q0 - q3 }
 
   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
@@ -6029,7 +6028,7 @@ function(update_texture_4bpp_cache)
   bne 0b
 
   vpop { q0 - q3 }
-  ldmia sp!, { r4 - r11, pc }
+  pop  { r3 - r11, pc }
 
 
 #undef current_texture_page
@@ -6059,7 +6058,6 @@ function(update_texture_4bpp_cache)
 
 function(update_texture_8bpp_cache_slice)
   stmdb sp!, { r4 - r11, r14 }
-  vpush { q0 - q3 }
 
   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
@@ -6120,7 +6118,6 @@ function(update_texture_8bpp_cache_slice)
 
   bne 0b
 
-  vpop { q0 - q3 }
   ldmia sp!, { r4 - r11, pc }
 
 
@@ -6133,6 +6130,7 @@ function(scale2x_tiles8)
   mov r14, r2
 
 0:
+  pld [r1, #1024*2]
   vld1.u16 { q0 }, [r1, :128]!
   vld1.u16 { q2 }, [r1, :128]!
   vmov q1, q0