X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=plugins%2Fgpu_neon%2Fpsx_gpu%2Fpsx_gpu_arm_neon.S;h=ffbea043c2cb64ddde846414f0ea62390ebf7d19;hp=da47756efcaea45cd3704fd5338e88c3895daa75;hb=refs%2Fheads%2Fmaster;hpb=26e3e2aa7525fd4e63e64192dfbb68950e0e4c5a diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index da47756e..d187fce9 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -13,15 +13,9 @@ * General Public License for more details. */ -#define MAX_SPANS 512 -#define MAX_BLOCKS 64 -#define MAX_BLOCKS_PER_ROW 128 - -#define RENDER_STATE_MASK_EVALUATE 0x20 -#define RENDER_FLAGS_MODULATE_TEXELS 0x1 -#define RENDER_FLAGS_BLEND 0x2 #define RENDER_INTERLACE_ENABLED 0x1 +#include "psx_gpu.h" #include "psx_gpu_offsets.h" #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4) @@ -228,8 +222,8 @@ #ifdef __MACH__ #define flush_render_block_buffer _flush_render_block_buffer -#define setup_sprite_untextured_simple _setup_sprite_untextured_simple #define update_texture_8bpp_cache _update_texture_8bpp_cache +#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack #endif @ r0: psx_gpu @@ -369,8 +363,8 @@ function(compute_all_gradients) sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS) vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16 - vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift } - + vdup.u32 r_shift, r14 @ r_shift = { shift, shift*, shift, shift* } + @ * - vshl.u64: ignored by hw vadd.u32 uvrg_base, uvrgb_phase vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x) @@ -550,6 +544,7 @@ function(compute_all_gradients) #define uvrg q14 #define uvrg_dy q15 +#define uv d28 #define alternate_x_16 d4 @@ -565,6 +560,8 @@ function(compute_all_gradients) #define left_x_32_low d22 #define left_x_32_high d23 +#define tmp_max_blocks d20 + #define edges_xy q0 #define edges_dx_dy d2 #define edge_shifts d3 @@ -819,8 +816,10 @@ function(compute_all_gradients) str b, [span_b_offset], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ + vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \ vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \ vshl.u16 span_shifts, c_0xFFFE, span_shifts; \ + vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \ \ vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \ \ @@ -867,8 +866,10 @@ function(compute_all_gradients) str b, [span_b_offset], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ - vshl.u16 span_shifts, c_0xFFFE, span_shifts; \ + vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \ vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \ + vshl.u16 span_shifts, c_0xFFFE, span_shifts; \ + vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \ \ vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \ \ @@ -908,7 +909,9 @@ function(compute_all_gradients) ble 1f; \ \ orr temp, y_a, y_a, lsl #16; \ + cmp height, #512; \ add temp, temp, #(1 << 16); \ + movgt height, #512; \ add y_a, temp, #2; \ add y_a, y_a, #(2 << 16); \ vmov y_x4, temp, y_a; \ @@ -924,6 +927,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 1f; \ + add temp, span_uvrg_offset, height, lsl #4; \ + vldr uv, [temp, #(-16*2)]; \ + vstr uv, [temp, #(-16)]; \ + \ 1: \ @@ -963,7 +974,9 @@ function(compute_all_gradients) ble 1f; \ \ orr temp, y_a, y_a, lsl #16; \ + cmp height, #512; \ sub temp, temp, #(1 << 16); \ + movgt height, #512; \ sub y_a, temp, #2; \ sub y_a, y_a, #(2 << 16); \ vmov y_x4, temp, y_a; \ @@ -983,6 +996,14 @@ function(compute_all_gradients) subs height, height, #4; \ bhi 2b; \ \ + nop; \ + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \ + tst temp, #AHACK_TEXTURE_ADJ_V; \ + beq 1f; \ + add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ + vldr uv, [temp, #16]; \ + vstr uv, [temp, #0]; \ + \ 1: \ @@ -1213,6 +1234,14 @@ function(setup_spans_up_down) subs height_minor_b, height_minor_b, #4 bhi 2b + nop + ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset] + tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V) + beq 1f + add temp, span_uvrg_offset, height, lsl #4 + vldr uv, [temp, #(-16*2)] + vstr uv, [temp, #(-16)] + 1: setup_spans_epilogue() @@ -1253,6 +1282,7 @@ function(setup_spans_up_down) #define uvrg_dx_ptr r2 #define texture_mask_ptr r3 +#define hacks_active r6 #define dither_shift r8 #define dither_row r10 @@ -1270,6 +1300,7 @@ function(setup_spans_up_down) #define color_b r5 #undef uvrg +#undef uv #define u_block q0 #define v_block q1 @@ -1347,6 +1378,26 @@ function(setup_spans_up_down) #define setup_blocks_texture_unswizzled() \ +#define setup_blocks_uv_adj_hack_textured(hacks_active) \ + tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ + beq 91f; \ + \ + /* pushing odd num of regs here realigns our unaligned stack */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + push { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 }; \ + mov r12, span_uvrg_offset; \ + sub r1, block_ptr_a, #64; \ + mov r2, span_edge_data; \ + mov r3, r12; \ + bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */ \ + pop { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + \ + vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ +91: \ + #define setup_blocks_shaded_textured_builder(swizzling) \ .align 3; \ @@ -1572,6 +1623,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1582,6 +1634,8 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ @@ -1596,16 +1650,14 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ - vpush { texture_mask }; \ - vpush { uvrg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + /* pushing odd num of regs here realigns our unaligned stack */ \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ - \ - vpop { uvrg_dx4 }; \ - vpop { texture_mask }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ @@ -1773,6 +1825,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vld1.u32 { test_mask }, [psx_gpu, :128]; \ vdup.u8 draw_mask, right_mask; \ \ + ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ @@ -1783,6 +1836,8 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \ \ + setup_blocks_uv_adj_hack_textured(hacks_active); \ + \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_edge_data, span_edge_data, #8; \ @@ -1795,16 +1850,13 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ - vpush { texture_mask }; \ - vpush { uvrg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ - \ - vpop { uvrg_dx4 }; \ - vpop { texture_mask }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ @@ -1915,13 +1967,13 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect) ldmia sp!, { r4 - r11, pc } 2: - vpush { colors } - - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } + vstr d4, [r0, #psx_gpu_saved_tmp_offset] /* colors */ + vstr d5, [r0, #psx_gpu_saved_tmp_offset + 8] + push { r0 - r3, EXTRA_UNSAVED_REGS r12 } bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { colors } + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 } + vldr d4, [r0, #psx_gpu_saved_tmp_offset] + vldr d5, [r0, #psx_gpu_saved_tmp_offset + 8] vld1.u32 { test_mask }, [psx_gpu, :128] veor.u32 draw_mask, draw_mask, draw_mask @@ -2328,17 +2380,14 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ bne 0b; \ \ restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc }; \ + pop { r4 - r11, pc }; \ \ 2: \ - /* TODO: Load from psx_gpu instead of saving/restoring these */\ - vpush { rg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + vstr rg_dx4, [r0, #psx_gpu_saved_tmp_offset]; \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ - \ - vpop { rg_dx4 }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr rg_dx4, [r0, #psx_gpu_saved_tmp_offset]; \ \ vmov.u8 d64_1, #1; \ vmov.u8 d128_4, #4; \ @@ -2748,7 +2797,7 @@ function(texture_blocks_4bpp) .align 3 function(texture_blocks_8bpp) - stmdb sp!, { r3 - r11, r14 } + push { r4 - r11, lr } add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset] @@ -2826,15 +2875,14 @@ function(texture_blocks_8bpp) add block_ptr, block_ptr, #64 bne 0b - ldmia sp!, { r3 - r11, pc } + pop { r4 - r11, pc } 1: - stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } - - bl update_texture_8bpp_cache - - ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } - bal 0b + /* pushing odd num of regs here realigns our unaligned stack */ + push { r1 - r2, EXTRA_UNSAVED_REGS r12 } + bl update_texture_8bpp_cache + pop { r1 - r2, EXTRA_UNSAVED_REGS r12 } + bal 0b #undef uv_0 @@ -4383,51 +4431,6 @@ function(warmup) #undef vram_ptr #undef color -#undef width -#undef height -#undef pitch - -#define vram_ptr r0 -#define color r1 -#define width r2 -#define height r3 - -#define pitch r1 - -#define num_width r12 - -#undef colors_a -#undef colors_b - -#define colors_a q0 -#define colors_b q1 - -.align 3 - -function(render_block_fill_body) - vdup.u16 colors_a, color - mov pitch, #2048 - - vmov colors_b, colors_a - sub pitch, pitch, width, lsl #1 - - mov num_width, width - - 0: - vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]! - - subs num_width, num_width, #16 - bne 0b - - add vram_ptr, vram_ptr, pitch - mov num_width, width - - subs height, height, #1 - bne 0b - - bx lr - - #undef x #undef y #undef width @@ -4520,30 +4523,30 @@ function(render_block_fill_body) #define texels_wide_high d15 #define texels_wide q7 +.align 3 setup_sprite_flush_blocks: - vpush { q1 - q5 } + push { r0 - r3, EXTRA_UNSAVED_REGS r12, lr } + add block, r0, #psx_gpu_saved_tmp_offset /* r5 */ + vstmia block, { q1 - q3 } + bl flush_render_block_buffer + vldmia block, { q1 - q3 } + pop { r0 - r3, EXTRA_UNSAVED_REGS r12, lr } - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { q1 - q5 } - - add block, psx_gpu, #psx_gpu_blocks_offset - bx lr + add block, psx_gpu, #psx_gpu_blocks_offset + bx lr setup_sprite_update_texture_4bpp_cache: - stmdb sp!, { r0 - r3, r14 } + push { r0 - r4, lr } bl update_texture_4bpp_cache - ldmia sp!, { r0 - r3, pc } + pop { r0 - r4, pc } setup_sprite_update_texture_8bpp_cache: - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 } + push { r0 - r4, EXTRA_UNSAVED_REGS lr } bl update_texture_8bpp_cache - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc } + pop { r0 - r4, EXTRA_UNSAVED_REGS pc } #define setup_sprite_tiled_initialize_4bpp() \ @@ -4830,8 +4833,8 @@ setup_sprite_update_texture_8bpp_cache: setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \ \ setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \ - restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc } \ + vpop { q4 - q7 }; \ + pop { r3 - r11, pc } \ #define setup_sprite_tiled_advance_column() \ add texture_offset_base, texture_offset_base, #0x100; \ @@ -4867,8 +4870,8 @@ setup_sprite_update_texture_8bpp_cache: \ setup_sprite_tiled_advance_column(); \ setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\ - restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc } \ + vpop { q4 - q7 }; \ + pop { r3 - r11, pc } \ #define setup_sprite_offset_u_adjust() \ @@ -5214,19 +5217,19 @@ setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \ .align 4; \ \ function(setup_sprite_##texture_mode##x4mode) \ - stmdb sp!, { r4 - r11, r14 }; \ + push { r3 - r11, lr }; \ setup_sprite_tiled_initialize_##texture_mode##x4mode(); \ \ - ldr v, [sp, #36]; \ + ldr v, [sp, #4*(10+0)]; \ and offset_u, u, #0xF; \ \ - ldr width, [sp, #40]; \ + ldr width, [sp, #4*(10+1)]; \ ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \ \ - ldr height, [sp, #44]; \ + ldr height, [sp, #4*(10+2)]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ - save_abi_regs(); \ + vpush { q4 - q7 }; \ \ add fb_ptr, fb_ptr, x, lsl #1; \ and offset_v, v, #0xF; \ @@ -5350,7 +5353,7 @@ setup_sprite_tiled_builder(8bpp, _4x); #define texels_67 r9 function(texture_sprite_blocks_8bpp) - stmdb sp!, { r4 - r11, r14 } + push { r4 - r11, r14 } movw texel_shift_mask, #(0xFF << 1) ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] @@ -5403,8 +5406,9 @@ function(texture_sprite_blocks_8bpp) add block_ptr, block_ptr, #64 bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r4 - r11, pc } #undef width_rounded @@ -5469,30 +5473,30 @@ function(texture_sprite_blocks_8bpp) setup_sprites_16bpp_flush: - vpush { d0 - d3 } - - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { d0 - d3 } + push { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } + add r1, r0, #psx_gpu_saved_tmp_offset + vstmia r1, { d0 - d3 } + bl flush_render_block_buffer + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 } + add lr, r0, #psx_gpu_saved_tmp_offset + vldmia lr, { d0 - d3 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, block_width - bx lr + pop { pc } function(setup_sprite_16bpp) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, lr } ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr v, [sp, #36] + ldr v, [sp, #4*(10+0)] add fb_ptr, fb_ptr, y, lsl #11 - ldr width, [sp, #40] + ldr width, [sp, #4*(10+1)] add fb_ptr, fb_ptr, x, lsl #1 - ldr height, [sp, #44] + ldr height, [sp, #4*(10+2)] and left_offset, u, #0x7 add texture_offset_base, u, u @@ -5562,7 +5566,7 @@ function(setup_sprite_16bpp) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 1b - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } 0: add num_blocks, num_blocks, block_width @@ -5636,8 +5640,9 @@ function(setup_sprite_16bpp) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } // 4x version @@ -5645,16 +5650,16 @@ function(setup_sprite_16bpp) #undef draw_mask_fb_ptr function(setup_sprite_16bpp_4x) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, lr } ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr v, [sp, #36] + ldr v, [sp, #4*(10+0)] add fb_ptr, fb_ptr, y, lsl #11 - ldr width, [sp, #40] + ldr width, [sp, #4*(10+1)] add fb_ptr, fb_ptr, x, lsl #1 - ldr height, [sp, #44] + ldr height, [sp, #4*(10+2)] and left_offset, u, #0x7 add texture_offset_base, u, u @@ -5723,7 +5728,7 @@ function(setup_sprite_16bpp_4x) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 1b - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } 0: add num_blocks, num_blocks, block_width @@ -5781,8 +5786,9 @@ function(setup_sprite_16bpp_4x) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } #undef width @@ -5826,26 +5832,19 @@ function(setup_sprite_16bpp_4x) .align 3 -function(setup_sprite_untextured) - ldrh r12, [psx_gpu, #psx_gpu_render_state_offset] - tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \ - | RENDER_FLAGS_BLEND) - ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset] - tsteq r12, #RENDER_INTERLACE_ENABLED - beq setup_sprite_untextured_simple - - stmdb sp!, { r4 - r11, r14 } +function(setup_sprite_untextured_512) + push { r4 - r11, r14 } - ldr width, [sp, #40] + ldr width, [sp, #4*(9+1)] ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr height, [sp, #44] + ldr height, [sp, #4*(9+2)] add fb_ptr, fb_ptr, y, lsl #11 add fb_ptr, fb_ptr, x, lsl #1 sub right_width, width, #1 - ldr color, [sp, #48] + ldr color, [sp, #4*(9+3)] and right_width, #7 add block_width, width, #7 @@ -5922,7 +5921,7 @@ setup_sprite_untextured_height_loop: strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bgt setup_sprite_untextured_height_loop - ldmia sp!, { r4 - r11, pc } + pop { r4 - r11, pc } @@ -5955,7 +5954,7 @@ setup_sprite_untextured_height_loop: #define texel_block_expanded_cd q3 function(update_texture_4bpp_cache) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, r14 } vpush { q0 - q3 } ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset] @@ -6029,7 +6028,7 @@ function(update_texture_4bpp_cache) bne 0b vpop { q0 - q3 } - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } #undef current_texture_page @@ -6059,7 +6058,6 @@ function(update_texture_4bpp_cache) function(update_texture_8bpp_cache_slice) stmdb sp!, { r4 - r11, r14 } - vpush { q0 - q3 } ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset] ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset] @@ -6120,7 +6118,6 @@ function(update_texture_8bpp_cache_slice) bne 0b - vpop { q0 - q3 } ldmia sp!, { r4 - r11, pc } @@ -6133,6 +6130,7 @@ function(scale2x_tiles8) mov r14, r2 0: + pld [r1, #1024*2] vld1.u16 { q0 }, [r1, :128]! vld1.u16 { q2 }, [r1, :128]! vmov q1, q0