X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=plugins%2Fgpu_neon%2Fpsx_gpu%2Fpsx_gpu_arm_neon.S;h=3239412b6565ff924b6c9b8f8693b9ea4e9e0a8d;hp=fd9979808f81814a44c6e56e58dd846eb48b71fb;hb=c6063f8985c69362a89a12111f393229ab65d05f;hpb=3867c6efed8d1cd6cd40f07cd46876f59da8912f diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index fd997980..3239412b 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -16,65 +16,9 @@ #define MAX_BLOCKS 64 #define MAX_BLOCKS_PER_ROW 128 -#define psx_gpu_test_mask_offset 0 -#define psx_gpu_uvrg_offset 16 -#define psx_gpu_uvrg_dx_offset 32 -#define psx_gpu_uvrg_dy_offset 48 -#define psx_gpu_u_block_span_offset 64 -#define psx_gpu_v_block_span_offset 80 -#define psx_gpu_r_block_span_offset 96 -#define psx_gpu_g_block_span_offset 112 -#define psx_gpu_b_block_span_offset 128 - -#define psx_gpu_b_dx_offset 132 - -#define psx_gpu_b_offset 144 -#define psx_gpu_b_dy_offset 148 -#define psx_gpu_triangle_area_offset 152 -#define psx_gpu_texture_window_settings_offset 156 -#define psx_gpu_current_texture_mask_offset 160 -#define psx_gpu_viewport_mask_offset 164 -#define psx_gpu_dirty_textures_4bpp_mask_offset 168 -#define psx_gpu_dirty_textures_8bpp_mask_offset 172 -#define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176 -#define psx_gpu_triangle_color_offset 180 -#define psx_gpu_dither_table_offset 184 -#define psx_gpu_render_block_handler_offset 200 -#define psx_gpu_texture_page_ptr_offset 204 -#define psx_gpu_texture_page_base_offset 208 -#define psx_gpu_clut_ptr_offset 212 -#define psx_gpu_vram_ptr_offset 216 - -#define psx_gpu_render_state_base_offset 220 -#define psx_gpu_render_state_offset 222 -#define psx_gpu_num_spans_offset 224 -#define psx_gpu_num_blocks_offset 226 -#define psx_gpu_offset_x_offset 228 -#define psx_gpu_offset_y_offset 230 -#define psx_gpu_clut_settings_offset 232 -#define psx_gpu_texture_settings_offset 234 -#define psx_gpu_viewport_start_x_offset 236 -#define psx_gpu_viewport_start_y_offset 238 -#define psx_gpu_viewport_end_x_offset 240 -#define psx_gpu_viewport_end_y_offset 242 -#define psx_gpu_mask_msb_offset 244 - -#define psx_gpu_triangle_winding_offset 246 -#define psx_gpu_display_area_draw_enable_offset 247 -#define psx_gpu_current_texture_page_offset 248 -#define psx_gpu_last_8bpp_texture_page_offset 249 -#define psx_gpu_texture_mask_width_offset 250 -#define psx_gpu_texture_mask_height_offset 251 -#define psx_gpu_texture_window_x_offset 252 -#define psx_gpu_texture_window_y_offset 253 -#define psx_gpu_primitive_type_offset 254 - -#define psx_gpu_reserved_a_offset 255 - -#define psx_gpu_blocks_offset 0x0100 -#define psx_gpu_span_uvrg_offset_offset 0x2100 -#define psx_gpu_span_edge_data_offset 0x4100 -#define psx_gpu_span_b_offset_offset 0x5100 +#include "psx_gpu_offsets.h" + +#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4) #define edge_data_left_x_offset 0 #define edge_data_num_blocks_offset 2 @@ -238,9 +182,22 @@ #define uvrg_dx3l d6 #define uvrg_dx3h d7 +#define uvrgb_phase q13 .align 4 +/* FIXME: users of this should be in psx_gpu instead */ +#ifndef __PIC__ +#define load_pointer(register, pointer) \ + movw register, :lower16:pointer; \ + movt register, :upper16:pointer; \ + +#else +#define load_pointer(register, pointer) \ + ldr register, =pointer \ + +#endif + #define function(name) \ .global name; \ name: \ @@ -357,11 +314,16 @@ function(compute_all_gradients) vmull.s16 ga_uvrg_y, d0_b, d1_b rsbmi ga_bx, ga_bx, #0 + @ r12 = psx_gpu->uvrgb_phase + ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ] + vmlsl.s16 ga_uvrg_y, d2_b, d3_b movs gs_by, ga_by, asr #31 vshr.u64 d0, d30, #22 - mov b_base, b0, lsl #16 + add b_base, r12, b0, lsl #16 + + vdup.u32 uvrgb_phase, r12 rsbmi ga_by, ga_by, #0 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0 @@ -370,7 +332,6 @@ function(compute_all_gradients) ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ] vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0 - add b_base, b_base, #0x8000 rsb r12, r12, #0 @ r12 = -(triangle->winding) vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w } @@ -379,7 +340,7 @@ function(compute_all_gradients) vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift } - vorr.u32 uvrg_base, #0x8000 + vadd.u32 uvrg_base, uvrgb_phase vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x) vmov area_r_s, s0 @ area_r_s = triangle_reciprocal @@ -609,8 +570,7 @@ function(compute_all_gradients) vld1.32 { uvrg }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \ vld1.32 { uvrg_dy }, [ temp ]; \ - movw reciprocal_table_ptr, :lower16:reciprocal_table; \ - movt reciprocal_table_ptr, :upper16:reciprocal_table; \ + load_pointer(reciprocal_table_ptr, reciprocal_table); \ \ vmov.u32 c_0x01, #0x01 \ @@ -646,7 +606,7 @@ function(compute_all_gradients) \ vdup.u32 edge_shifts, temp; \ vsub.u32 heights_b, heights, c_0x01; \ - vshr.u32 height_reciprocals, edge_shifts, #12; \ + vshr.u32 height_reciprocals, edge_shifts, #10; \ \ vmla.s32 heights_b, x_starts, heights; \ vbic.u16 edge_shifts, #0xE0; \ @@ -671,8 +631,8 @@ function(compute_all_gradients) vsub.u32 heights_b, heights, c_0x01; \ sub height_b_alt, height_minor_b, #1; \ \ - vshr.u32 height_reciprocals, edge_shifts, #12; \ - lsr height_reciprocal_alt, edge_shift_alt, #12; \ + vshr.u32 height_reciprocals, edge_shifts, #10; \ + lsr height_reciprocal_alt, edge_shift_alt, #10; \ \ vmla.s32 heights_b, x_starts, heights; \ mla height_b_alt, height_minor_b, start_c, height_b_alt; \ @@ -1016,6 +976,7 @@ function(setup_spans_up_left) function(setup_spans_up_right) setup_spans_up_up(right, left) +.pool #define setup_spans_down_down(minor, major) \ setup_spans_prologue(); \ @@ -1224,6 +1185,7 @@ function(setup_spans_up_down) setup_spans_prologue_b() bal 4b +.pool #undef span_uvrg_offset #undef span_edge_data @@ -1380,7 +1342,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ @@ -1647,7 +1609,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ @@ -1842,7 +1804,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect) ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ] ldrh y, [ span_edge_data, #edge_data_y_offset ] - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ] cmp span_num_blocks, #0 beq 1f @@ -1962,7 +1924,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ] ldrh y, [ span_edge_data, #edge_data_y_offset ] - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ] cmp span_num_blocks, #0 beq 1f @@ -2002,8 +1964,6 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) 1: add span_edge_data, span_edge_data, #8 subs num_spans, num_spans, #1 - - strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] bne 0b ldmia sp!, { r4 - r11, pc } @@ -2151,7 +2111,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ @@ -2391,7 +2351,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ @@ -4080,14 +4040,11 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ - vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ - vmov.u16 d128_0x80E0, #0x8000; \ + vmov.u16 d128_0x00E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ - vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ - vorr.u16 d128_0x80E0, #0x00E0; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ @@ -4096,33 +4053,31 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vshr.s16 pixels_fourth, pixels, #2; \ + vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vorr.u16 pixels, pixels, msb_mask; \ - vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ - vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ + vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ - vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ + vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ - vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ + vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ - \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ + vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vbif.u16 blend_pixels, pixels, blend_mask; \ + \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ - \ vshr.s16 pixels_fourth, pixels, #2; \ - vorr.u16 pixels, pixels, msb_mask; \ - vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ @@ -4135,24 +4090,25 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ - vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ + vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ - vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ + vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ - vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ + vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vorr.u16 blend_pixels, blend_pixels, msb_mask; \ + vbif.u16 blend_pixels, pixels, blend_mask; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ @@ -4160,16 +4116,16 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ bal 3b \ + #define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \ .align 3; \ \ @@ -4186,12 +4142,10 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate) \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ - vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ vmov.u16 d128_0x00E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ - vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ @@ -4216,7 +4170,6 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate) \ \ 0: \ mov fb_ptr, fb_ptr_next; \ - \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ @@ -4339,102 +4292,52 @@ function(warmup) bx lr +#undef vram_ptr #undef color -#undef y +#undef width #undef height - -#define psx_gpu r0 -#define color r1 -#define x r2 -#define y r3 +#undef pitch #define vram_ptr r0 -#define width r3 -#define height r12 - -#define parameter_width_offset 0 -#define parameter_height_offset 4 +#define color r1 +#define width r2 +#define height r3 -#define color_r r14 -#define color_g r4 -#define color_b r5 +#define pitch r1 -#define left_unaligned r14 -#define right_unaligned r4 -#define pitch r5 -#define num_unaligned r2 -#define num_width r6 +#define num_width r12 -#undef colors +#undef colors_a +#undef colors_b -#define colors q0 +#define colors_a q0 +#define colors_b q1 .align 3 function(render_block_fill_body) - ldr vram_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] - ldr height, [ sp, #parameter_height_offset ] - - add vram_ptr, vram_ptr, y, lsl #11 - ldr width, [ sp, #parameter_width_offset ] - - add vram_ptr, vram_ptr, x, lsl #1 - stmdb sp!, { r4 - r6, r14 } - - ubfx color_r, color, #3, #5 - ubfx color_g, color, #11, #5 - - ubfx color_b, color, #19, #5 - orr color, color_r, color_g, lsl #5 - - orr color, color, color_b, lsl #10 - add left_unaligned, x, #0x7 - - bic left_unaligned, left_unaligned, #0x7 - vdup.u16 colors, color - - sub left_unaligned, left_unaligned, x + vdup.u16 colors_a, color mov pitch, #2048 + vmov colors_b, colors_a sub pitch, pitch, width, lsl #1 - sub width, width, left_unaligned - - and right_unaligned, width, #0x7 - bic width, width, #0x7 - 0: - mov num_width, width, lsr #3 + mov num_width, width - movs num_unaligned, left_unaligned - beq 2f + 0: + vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]! - 1: - strh color, [ vram_ptr ], #2 - - subs num_unaligned, num_unaligned, #1 - bne 1b - - 2: - vst1.u32 { colors }, [ vram_ptr, :128 ]! - subs num_width, num_width, #1 - bne 2b - - movs num_unaligned, right_unaligned - beq 4f - - 3: - strh color, [ vram_ptr ], #2 - - subs num_unaligned, num_unaligned, #1 - bne 3b + subs num_width, num_width, #16 + bne 0b - 4: add vram_ptr, vram_ptr, pitch + mov num_width, width + subs height, height, #1 bne 0b - - ldmia sp!, { r4 - r6, pc } + bx lr + #undef x #undef y @@ -4906,7 +4809,7 @@ function(setup_sprite_##texture_mode) \ and offset_u, u, #0xF; \ \ ldr width, [ sp, #40 ]; \ - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \ \ ldr height, [ sp, #44 ]; \ add fb_ptr, fb_ptr, y, lsl #11; \ @@ -5154,7 +5057,7 @@ setup_sprites_16bpp_flush_row: function(setup_sprite_16bpp) stmdb sp!, { r4 - r11, r14 } - ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] + ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ] ldr v, [ sp, #36 ] add fb_ptr, fb_ptr, y, lsl #11