X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?p=pcsx_rearmed.git;a=blobdiff_plain;f=plugins%2Fgpu_neon%2Fpsx_gpu%2Fpsx_gpu_arm_neon.S;h=294685aee8271fcdf4ef402e54d7a07dbfac5963;hp=973a8b36d2d288c21d286eecbdafb269b4fb8992;hb=b8d961effdd3fc2a00dc073cae06b6d937682420;hpb=87c45ad1e2a265cedb7970cc1b7777591d0050b7 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index 973a8b36..294685ae 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -241,6 +241,18 @@ .align 4 +/* FIXME: users of this should be in psx_gpu instead */ +#ifndef __PIC__ +#define load_pointer(register, pointer) \ + movw register, :lower16:pointer; \ + movt register, :upper16:pointer; \ + +#else +#define load_pointer(register, pointer) \ + ldr register, =pointer \ + +#endif + #define function(name) \ .global name; \ name: \ @@ -609,8 +621,7 @@ function(compute_all_gradients) vld1.32 { uvrg }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \ vld1.32 { uvrg_dy }, [ temp ]; \ - movw reciprocal_table_ptr, :lower16:reciprocal_table; \ - movt reciprocal_table_ptr, :upper16:reciprocal_table; \ + load_pointer(reciprocal_table_ptr, reciprocal_table); \ \ vmov.u32 c_0x01, #0x01 \ @@ -1016,6 +1027,7 @@ function(setup_spans_up_left) function(setup_spans_up_right) setup_spans_up_up(right, left) +.pool #define setup_spans_down_down(minor, major) \ setup_spans_prologue(); \ @@ -1224,6 +1236,7 @@ function(setup_spans_up_down) setup_spans_prologue_b() bal 4b +.pool #undef span_uvrg_offset #undef span_edge_data @@ -4078,14 +4091,11 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ - vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ - vmov.u16 d128_0x80E0, #0x8000; \ + vmov.u16 d128_0x00E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ - vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ - vorr.u16 d128_0x80E0, #0x00E0; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ @@ -4094,33 +4104,31 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vshr.s16 pixels_fourth, pixels, #2; \ + vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vorr.u16 pixels, pixels, msb_mask; \ - vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ - vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ + vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ - vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ + vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ - vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ + vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ - \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ + vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vbif.u16 blend_pixels, pixels, blend_mask; \ + \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ - \ vshr.s16 pixels_fourth, pixels, #2; \ - vorr.u16 pixels, pixels, msb_mask; \ - vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ @@ -4133,24 +4141,25 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ - vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ + vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ - vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ + vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ - vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ + vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ + vorr.u16 blend_pixels, blend_pixels, msb_mask; \ + vbif.u16 blend_pixels, pixels, blend_mask; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ @@ -4158,16 +4167,16 @@ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ - vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ + vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ - vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ - vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ + vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ bal 3b \ + #define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \ .align 3; \ \ @@ -4184,12 +4193,10 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate) \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ - vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ vmov.u16 d128_0x00E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ - vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ @@ -4214,7 +4221,6 @@ function(blend_blocks_untextured_add_fourth_##mask_evaluate) \ \ 0: \ mov fb_ptr, fb_ptr_next; \ - \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ @@ -4337,31 +4343,20 @@ function(warmup) bx lr +#undef vram_ptr #undef color -#undef y +#undef width #undef height - -#define psx_gpu r0 -#define color r1 -#define x r2 -#define y r3 +#undef pitch #define vram_ptr r0 -#define width r3 -#define height r12 - -#define parameter_width_offset 0 -#define parameter_height_offset 4 +#define color r1 +#define width r2 +#define height r3 -#define color_r r14 -#define color_g r4 -#define color_b r5 +#define pitch r1 -#define left_unaligned r14 -#define right_unaligned r4 -#define pitch r5 -#define num_unaligned r2 -#define num_width r6 +#define num_width r12 #undef colors_a #undef colors_b @@ -4372,44 +4367,28 @@ function(warmup) .align 3 function(render_block_fill_body) - ldr vram_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] - ldr height, [ sp, #parameter_height_offset ] - - add vram_ptr, vram_ptr, y, lsl #11 - ldr width, [ sp, #parameter_width_offset ] - - add vram_ptr, vram_ptr, x, lsl #1 - stmdb sp!, { r4 - r6, r14 } - - ubfx color_r, color, #3, #5 - ubfx color_g, color, #11, #5 - - ubfx color_b, color, #19, #5 - orr color, color_r, color_g, lsl #5 - - orr color, color, color_b, lsl #10 vdup.u16 colors_a, color + mov pitch, #2048 vmov colors_b, colors_a - mov pitch, #2048 sub pitch, pitch, width, lsl #1 - 0: - mov num_width, width, lsr #4 + mov num_width, width - 1: - vst1.u32 { colors_a, colors_b }, [ vram_ptr, :128 ]! + 0: + vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]! - subs num_width, num_width, #1 - bne 1b + subs num_width, num_width, #16 + bne 0b add vram_ptr, vram_ptr, pitch + mov num_width, width + subs height, height, #1 bne 0b - - 1: - ldmia sp!, { r4 - r6, pc } + bx lr + #undef x #undef y