/* * Copyright (C) 2011 Gilead Kutnick "Exophase" * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #define MAX_SPANS 512 #define MAX_BLOCKS 64 #define MAX_BLOCKS_PER_ROW 128 #define psx_gpu_test_mask_offset 0 #define psx_gpu_uvrg_offset 16 #define psx_gpu_uvrg_dx_offset 32 #define psx_gpu_uvrg_dy_offset 48 #define psx_gpu_u_block_span_offset 64 #define psx_gpu_v_block_span_offset 80 #define psx_gpu_r_block_span_offset 96 #define psx_gpu_g_block_span_offset 112 #define psx_gpu_b_block_span_offset 128 #define psx_gpu_b_dx_offset 132 #define psx_gpu_b_offset 144 #define psx_gpu_b_dy_offset 148 #define psx_gpu_triangle_area_offset 152 #define psx_gpu_texture_window_settings_offset 156 #define psx_gpu_current_texture_mask_offset 160 #define psx_gpu_viewport_mask_offset 164 #define psx_gpu_dirty_textures_4bpp_mask_offset 168 #define psx_gpu_dirty_textures_8bpp_mask_offset 172 #define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176 #define psx_gpu_triangle_color_offset 180 #define psx_gpu_primitive_color_offset 184 #define psx_gpu_dither_table_offset 188 #define psx_gpu_render_block_handler_offset 204 #define psx_gpu_texture_page_ptr_offset 208 #define psx_gpu_clut_ptr_offset 212 #define psx_gpu_vram_ptr_offset 216 #define psx_gpu_render_state_base_offset 220 #define psx_gpu_render_state_offset 222 #define psx_gpu_num_spans_offset 224 #define psx_gpu_num_blocks_offset 226 #define psx_gpu_offset_x_offset 228 #define psx_gpu_offset_y_offset 230 #define psx_gpu_clut_settings_offset 232 #define psx_gpu_texture_settings_offset 234 #define psx_gpu_viewport_start_x_offset 236 #define psx_gpu_viewport_start_y_offset 238 #define psx_gpu_viewport_end_x_offset 240 #define psx_gpu_viewport_end_y_offset 242 #define psx_gpu_mask_msb_offset 244 #define psx_gpu_triangle_winding_offset 246 #define psx_gpu_display_area_draw_enable_offset 247 #define psx_gpu_current_texture_page_offset 248 #define psx_gpu_last_8bpp_texture_page_offset 249 #define psx_gpu_texture_mask_width_offset 250 #define psx_gpu_texture_mask_height_offset 251 #define psx_gpu_texture_window_x_offset 252 #define psx_gpu_texture_window_y_offset 253 #define psx_gpu_primitive_type_offset 254 #define psx_gpu_reserved_a_offset 255 #define psx_gpu_blocks_offset 0x0100 #define psx_gpu_span_uvrg_offset_offset 0x2100 #define psx_gpu_span_edge_data_offset 0x4100 #define psx_gpu_span_b_offset_offset 0x5100 #define edge_data_left_x_offset 0 #define edge_data_num_blocks_offset 2 #define edge_data_right_mask_offset 4 #define edge_data_y_offset 6 #define psx_gpu r0 #define v_a r1 #define v_b r2 #define v_c r3 #define x0 r4 #define x1 r5 #define x2 r6 #define x0_x1 r5 #define x1_x2 r6 #define y0 r7 #define y1 r8 #define y2 r9 #define y0_y1 r7 #define y1_y2 r8 #define b0 r9 #define b1 r10 #define b2 r11 #define b0_b1 r10 #define b1_b2 r11 #define area_r_s r5 #define g_bx0 r2 #define g_bx r3 #define g_bx2 r4 #define g_bx3 r5 #define b_base r6 #define g_by r8 #define gs_bx r7 #define gs_by r10 #define ga_bx g_bx #define ga_by g_by #define gw_bx_h g_bx #define gw_by_h g_by #define gw_bx_l r11 #define gw_by_l gw_bx_l #define store_a r0 #define store_b r1 #define store_inc r5 #define v0 q0 #define uvrgb0 d0 #define x0_y0 d1 #define v1 q1 #define uvrgb1 d2 #define x1_y1 d3 #define v2 q2 #define uvrgb2 d4 #define x2_y2 d5 #define x0_ab q3 #define uvrg_xxxx0 q3 #define uvrg0 d6 #define xxxx0 d7 #define x1_ab q4 #define uvrg_xxxx1 q4 #define uvrg1 d8 #define xxxx1 d9 #define x2_ab q5 #define uvrg_xxxx2 q5 #define uvrg2 d10 #define xxxx2 d11 #define y0_ab q6 #define yyyy_uvrg0 q6 #define yyyy0 d12 #define uvrg0b d13 #define y1_ab q7 #define yyyy_uvrg1 q7 #define yyyy1 d14 #define uvrg1b d15 #define y2_ab q8 #define yyyy_uvrg2 q8 #define yyyy2 d16 #define uvrg2b d17 #define d0_ab q9 #define d0_a d18 #define d0_b d19 #define d1_ab q10 #define d1_a d20 #define d1_b d21 #define d2_ab q11 #define d2_a d22 #define d2_b d23 #define d3_ab q12 #define d3_a d24 #define d3_b d25 #define ga_uvrg_x q1 #define ga_uvrg_y q4 #define dx x0_x1 #define dy y0_y1 #define db b0_b1 #define uvrg_base q11 #define gs_uvrg_x q5 #define gs_uvrg_y q6 #define g_uvrg_x q1 #define ga_uv_x d2 #define g_uv_x d2 #define ga_rg_x d3 #define g_rg_x d3 #define g_uvrg_y q4 #define ga_uv_y d8 #define g_uv_y d8 #define ga_rg_y d9 #define g_rg_y d9 #define gw_uv_x q1 #define gw_rg_x q2 #define gw_uv_y q4 #define gw_rg_y q3 #define w_mask q9 #define w_mask_l d18 #define r_shift q10 #define uvrg_dx0 q0 #define uvrg_dx0l d0 #define uvrg_dx0h d1 #define uvrg_dx1 q1 #define uvrg_dx1l d2 #define uvrg_dx1h d3 #define uvrg_dx2 q2 #define uvrg_dx2l d4 #define uvrg_dx2h d5 #define uvrg_dx3 q3 #define uvrg_dx3l d6 #define uvrg_dx3h d7 .align 4 #define function(name) \ .global name; \ name: \ @ r0: psx_gpu @ r1: v_a @ r2: v_b @ r3: v_c function(compute_all_gradients) // First compute the triangle area reciprocal and shift. The division will // happen concurrently with much of the work which follows. @ r12 = psx_gpu->triangle_area ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ] stmdb sp!, { r4 - r11, lr } @ load exponent of 62 into upper half of double movw r4, #0 clz r14, r12 @ r14 = shift movt r4, #((62 + 1023) << 4) mov r12, r12, lsl r14 @ r12 = triangle_area_normalized @ load area normalized into lower half of double mov r5, r12, lsr #10 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n movt r4, #((1022 + 31) << 4) mov r5, r12, lsl #20 add r4, r4, r12, lsr #11 vmov.f64 d31, r5, r4 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) = // ( d0 * d1 ) - ( d2 * d3 ) = // ( m0 ) - ( m1 ) = gradient // This is split to do 12 elements at a time over three sets: a, b, and c. // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so // two of the slots are unused. // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as // is g. // First type is: uvrg bxxx xxxx // Second type is: yyyy ybyy uvrg // Since x_a and y_c are the same the same variable is used for both. vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 } ldrsh x0, [ v_a, #8 ] @ load x0 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1} ldrh x1, [ v_b, #8 ] @ load x1 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 } ldrh x2, [ v_c, #8 ] @ load x2 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- } ldrh y0, [ v_a, #10 ] @ load y0 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- } ldrh y1, [ v_b, #10 ] @ load y1 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- } ldrh y2, [ v_c, #10 ] @ load y2 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 } vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 } orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 } pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 } vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 } vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 } vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 } vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 } ldrb b2, [ v_c, #4 ] @ load b2 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 } ldrb b1, [ v_b, #4 ] @ load b1 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 } vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 } vsub.s16 d0_ab, x1_ab, x0_ab ldrb b0, [ v_a, #4 ] @ load b0 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 } vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 } vsub.s16 d2_ab, x2_ab, x1_ab vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 } vsub.s16 d1_ab, y2_ab, y1_ab orr b0_b1, b0, b1, lsl #16 @ b1_b2 = { b1, b2 } ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 } ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 } ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 } vsub.s16 d3_ab, y1_ab, y0_ab smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) - @ ((x2 - X1) * (b1 - b0)) vmull.s16 ga_uvrg_x, d0_a, d1_a smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) - @ ((b2 - b1) * (y1 - y0)) vmlsl.s16 ga_uvrg_x, d2_a, d3_a movs gs_bx, ga_bx, asr #31 vmull.s16 ga_uvrg_y, d0_b, d1_b rsbmi ga_bx, ga_bx, #0 vmlsl.s16 ga_uvrg_y, d2_b, d3_b movs gs_by, ga_by, asr #31 vshr.u64 d0, d30, #22 mov b_base, b0, lsl #16 rsbmi ga_by, ga_by, #0 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0 @ r12 = psx_gpu->triangle_winding_offset ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ] vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0 add b_base, b_base, #0x8000 rsb r12, r12, #0 @ r12 = -(triangle->winding) vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w } sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS) vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift } vorr.u32 uvrg_base, #0x8000 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x) vmov area_r_s, s0 @ area_r_s = triangle_reciprocal vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y) vmull.u32 gw_rg_x, ga_rg_x, d0[0] vmull.u32 gw_uv_x, ga_uv_x, d0[0] vmull.u32 gw_rg_y, ga_rg_y, d0[0] vmull.u32 gw_uv_y, ga_uv_y, d0[0] vshl.u64 gw_rg_x, gw_rg_x, r_shift vshl.u64 gw_uv_x, gw_uv_x, r_shift vshl.u64 gw_rg_y, gw_rg_y, r_shift vshl.u64 gw_uv_y, gw_uv_y, r_shift veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask vmovn.u64 g_uv_x, gw_uv_x veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask vmovn.u64 g_rg_x, gw_rg_x veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x vmovn.u64 g_uv_y, gw_uv_y vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x vmovn.u64 g_rg_y, gw_rg_y veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y mov ga_bx, ga_bx, lsl #13 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y mov ga_by, ga_by, lsl #13 vdup.u32 x0_y0, x0 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s vshl.u32 g_uvrg_x, g_uvrg_x, #4 vshl.u32 g_uvrg_y, g_uvrg_y, #4 umull gw_by_l, gw_by_h, ga_by, area_r_s vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0] eor gs_bx, gs_bx, r12 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0 eor gs_by, gs_by, r12 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr add store_a, psx_gpu, #psx_gpu_uvrg_offset sub r11, r11, #(32 - 13) add store_b, store_a, #16 mov store_inc, #32 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc mov g_bx, gw_bx_h, lsr r11 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc mov g_by, gw_by_h, lsr r11 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \ [ store_b, : 128 ], store_inc eor g_bx, g_bx, gs_bx vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \ [ store_b, : 128 ], store_inc sub g_bx, g_bx, gs_bx lsl g_bx, g_bx, #4 eor g_by, g_by, gs_by mls b_base, g_bx, x0, b_base sub g_by, g_by, gs_by lsl g_by, g_by, #4 mov g_bx0, #0 add g_bx2, g_bx, g_bx add g_bx3, g_bx, g_bx2 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by } ldmia sp!, { r4 - r11, pc } #define psx_gpu r0 #define v_a r1 #define v_b r2 #define v_c r3 #define temp r14 #define x_a r4 #define x_b r5 #define x_c r6 #define y_a r1 #define y_b r2 #define y_c r3 #define height_minor_a r7 #define height_minor_b r8 #define height_major r9 #define height r9 #define reciprocal_table_ptr r10 #define edge_alt_low r4 #define edge_alt_high r5 #define edge_dx_dy_alt r6 #define edge_shift_alt r10 #define edge_dx_dy_alt_low r4 #define edge_dx_dy_alt_high r5 #define span_edge_data r4 #define span_uvrg_offset r5 #define span_b_offset r6 #define clip r14 #define b r11 #define b_dy r12 #define alternate_x q0 #define alternate_dx_dy q1 #define alternate_x_32 q2 #define alternate_x_low d0 #define alternate_x_high d1 #define alternate_dx_dy_low d2 #define alternate_dx_dy_high d3 #define alternate_x_32_low d4 #define alternate_x_32_high d5 #define left_x q3 #define right_x q4 #define left_dx_dy q5 #define right_dx_dy q6 #define left_edge q7 #define right_edge q8 #define left_x_low d6 #define left_x_high d7 #define right_x_low d8 #define right_x_high d9 #define left_dx_dy_low d10 #define left_dx_dy_high d11 #define right_dx_dy_low d12 #define right_dx_dy_high d13 #define left_edge_low d14 #define left_edge_high d15 #define right_edge_low d16 #define right_edge_high d17 #define y_mid_point d18 #define c_0x0004 d19 #define left_right_x_16 q11 #define span_shifts_y q12 #define c_0x0001 q13 #define span_shifts d24 #define y_x4 d25 #define c_0xFFFE d26 #define c_0x0007 d27 #define left_right_x_16_low d22 #define left_right_x_16_high d23 #define uvrg q14 #define uvrg_dy q15 #define alternate_x_16 d4 #define v_clip q3 #define v_clip_low d6 #define right_x_32 q10 #define left_x_32 q11 #define alternate_select d24 #define right_x_32_low d20 #define right_x_32_high d21 #define left_x_32_low d22 #define left_x_32_high d23 #define edges_xy q0 #define edges_dx_dy d2 #define edge_shifts d3 #define edge_shifts_64 q2 #define edges_xy_left d0 #define edges_xy_right d1 #define height_reciprocals d6 #define heights d7 #define widths d8 #define c_0x01 d9 #define x_starts d10 #define x_ends d11 #define heights_b d12 #define edges_dx_dy_64 q10 #define edges_dx_dy_64_left d20 #define edges_dx_dy_64_right d21 #define setup_spans_prologue() \ stmdb sp!, { r4 - r11, lr }; \ \ ldrsh x_a, [ v_a, #8 ]; \ ldrsh x_b, [ v_b, #8 ]; \ ldrsh x_c, [ v_c, #8 ]; \ ldrsh y_a, [ v_a, #10 ]; \ ldrsh y_b, [ v_b, #10 ]; \ ldrsh y_c, [ v_c, #10 ]; \ \ add temp, psx_gpu, #psx_gpu_uvrg_offset; \ vld1.32 { uvrg }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \ vld1.32 { uvrg_dy }, [ temp ]; \ movw reciprocal_table_ptr, :lower16:reciprocal_table; \ movt reciprocal_table_ptr, :upper16:reciprocal_table; \ \ vmov.u32 c_0x01, #0x01 \ #define setup_spans_load_b() \ ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \ ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \ #define setup_spans_prologue_b() \ add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \ \ add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \ vmov.u16 c_0x0004, #0x0004; \ \ add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \ vmov.u16 c_0x0001, #0x0001; \ \ vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \ \ vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \ vadd.u16 right_edge, right_edge, c_0x0001; \ \ vmov.u16 c_0x0007, #0x0007; \ vmvn.u16 c_0xFFFE, #0x0001 \ #define compute_edge_delta_x2() \ ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \ \ vdup.u32 heights, height; \ vsub.u32 widths, x_ends, x_starts; \ \ vdup.u32 edge_shifts, temp; \ vsub.u32 heights_b, heights, c_0x01; \ vshr.u32 height_reciprocals, edge_shifts, #12; \ \ vmla.s32 heights_b, x_starts, heights; \ vbic.u16 edge_shifts, #0xE0; \ vmul.s32 edges_dx_dy, widths, height_reciprocals; \ vmull.s32 edges_xy, heights_b, height_reciprocals \ #define width_alt r6 #define height_reciprocal_alt r11 #define height_b_alt r12 #define compute_edge_delta_x3(start_c, height_a, height_b) \ vmov.u32 heights, height_a, height_b; \ ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \ vmov.u32 edge_shifts[0], temp; \ ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \ vmov.u32 edge_shifts[1], temp; \ ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \ \ vsub.u32 widths, x_ends, x_starts; \ sub width_alt, x_c, start_c; \ \ vsub.u32 heights_b, heights, c_0x01; \ sub height_b_alt, height_minor_b, #1; \ \ vshr.u32 height_reciprocals, edge_shifts, #12; \ lsr height_reciprocal_alt, edge_shift_alt, #12; \ \ vmla.s32 heights_b, x_starts, heights; \ mla height_b_alt, height_minor_b, start_c, height_b_alt; \ \ vbic.u16 edge_shifts, #0xE0; \ and edge_shift_alt, edge_shift_alt, #0x1F; \ \ vmul.s32 edges_dx_dy, widths, height_reciprocals; \ mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \ \ vmull.s32 edges_xy, heights_b, height_reciprocals; \ smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \ #define setup_spans_adjust_y_up() \ vsub.u32 y_x4, y_x4, c_0x0004 \ #define setup_spans_adjust_y_down() \ vadd.u32 y_x4, y_x4, c_0x0004 \ #define setup_spans_adjust_interpolants_up() \ vsub.u32 uvrg, uvrg, uvrg_dy; \ sub b, b, b_dy \ #define setup_spans_adjust_interpolants_down() \ vadd.u32 uvrg, uvrg, uvrg_dy; \ add b, b, b_dy \ #define setup_spans_clip_interpolants_increment() \ mla b, b_dy, clip, b; \ vmla.s32 uvrg, uvrg_dy, v_clip \ #define setup_spans_clip_interpolants_decrement() \ mls b, b_dy, clip, b; \ vmls.s32 uvrg, uvrg_dy, v_clip \ #define setup_spans_clip_alternate_yes() \ smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \ #define setup_spans_clip_alternate_no() \ #define setup_spans_clip(direction, alternate_active) \ vdup.u32 v_clip, clip; \ setup_spans_clip_alternate_##alternate_active(); \ setup_spans_clip_interpolants_##direction(); \ vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \ #define setup_spans_adjust_edges_alternate_no(left_index, right_index) \ vmovl.s32 edge_shifts_64, edge_shifts; \ vmovl.s32 edges_dx_dy_64, edges_dx_dy; \ \ vshl.s64 edges_xy, edges_xy, edge_shifts_64; \ vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \ \ vmov left_x_low, edges_xy_##left_index; \ vmov right_x_low, edges_xy_##right_index; \ \ vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \ vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \ vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \ vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \ \ vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \ vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \ \ vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \ vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \ #define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \ setup_spans_adjust_edges_alternate_no(left_index, right_index); \ \ vdup.u16 y_mid_point, y_b; \ rsb temp, edge_shift_alt, #32; \ \ lsl edge_alt_high, edge_alt_high, edge_shift_alt; \ orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \ lsl edge_alt_low, edge_alt_low, edge_shift_alt; \ vmov alternate_x_low, edge_alt_low, edge_alt_high; \ \ asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \ lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \ vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \ vmov alternate_dx_dy_high, alternate_dx_dy_low; \ \ vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \ vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \ #define setup_spans_y_select_up() \ vclt.s16 alternate_select, y_x4, y_mid_point \ #define setup_spans_y_select_down() \ vcgt.s16 alternate_select, y_x4, y_mid_point \ #define setup_spans_alternate_select_left() \ vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \ #define setup_spans_alternate_select_right() \ vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \ #define setup_spans_set_x4_alternate_yes(alternate, direction) \ vshrn.s64 alternate_x_32_low, alternate_x, #32; \ vshrn.s64 left_x_32_low, left_x, #32; \ vshrn.s64 right_x_32_low, right_x, #32; \ \ vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \ vadd.u64 left_x, left_x, left_dx_dy; \ vadd.u64 right_x, right_x, right_dx_dy; \ \ vshrn.s64 alternate_x_32_high, alternate_x, #32; \ vshrn.s64 left_x_32_high, left_x, #32; \ vshrn.s64 right_x_32_high, right_x, #32; \ \ vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \ vadd.u64 left_x, left_x, left_dx_dy; \ vadd.u64 right_x, right_x, right_dx_dy; \ \ vmovn.u32 alternate_x_16, alternate_x_32; \ setup_spans_y_select_##direction(); \ vmovn.u32 left_right_x_16_low, left_x_32; \ \ vmovn.u32 left_right_x_16_high, right_x_32; \ setup_spans_alternate_select_##alternate(); \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vmax.s16 left_right_x_16, left_right_x_16, left_edge; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vmin.s16 left_right_x_16, left_right_x_16, right_edge; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \ vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \ vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \ vshl.u16 span_shifts, c_0xFFFE, span_shifts; \ \ vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \ \ setup_spans_adjust_y_##direction() \ #define setup_spans_set_x4_alternate_no(alternate, direction) \ vshrn.s64 left_x_32_low, left_x, #32; \ vshrn.s64 right_x_32_low, right_x, #32; \ \ vadd.u64 left_x, left_x, left_dx_dy; \ vadd.u64 right_x, right_x, right_dx_dy; \ \ vshrn.s64 left_x_32_high, left_x, #32; \ vshrn.s64 right_x_32_high, right_x, #32; \ \ vadd.u64 left_x, left_x, left_dx_dy; \ vadd.u64 right_x, right_x, right_dx_dy; \ \ vmovn.u32 left_right_x_16_low, left_x_32; \ vmovn.u32 left_right_x_16_high, right_x_32; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vmax.s16 left_right_x_16, left_right_x_16, left_edge; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vmin.s16 left_right_x_16, left_right_x_16, right_edge; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \ vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \ vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \ \ vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \ str b, [ span_b_offset ], #4; \ setup_spans_adjust_interpolants_##direction(); \ \ vshl.u16 span_shifts, c_0xFFFE, span_shifts; \ vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \ \ vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \ \ setup_spans_adjust_y_##direction() \ #define edge_adjust_low r11 #define edge_adjust_high r12 #define setup_spans_alternate_adjust_yes() \ smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \ subs edge_alt_low, edge_alt_low, edge_adjust_low; \ sbc edge_alt_high, edge_alt_high, edge_adjust_high \ #define setup_spans_alternate_adjust_no() \ #define setup_spans_down(left_index, right_index, alternate, alternate_active) \ setup_spans_alternate_adjust_##alternate_active(); \ setup_spans_load_b(); \ \ ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \ subs y_c, y_c, temp; \ subgt height, height, y_c; \ addgt height, height, #1; \ \ ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \ subs clip, temp, y_a; \ ble 0f; \ \ sub height, height, clip; \ add y_a, y_a, clip; \ setup_spans_clip(increment, alternate_active); \ \ 0: \ cmp height, #0; \ ble 1f; \ \ orr temp, y_a, y_a, lsl #16; \ add temp, temp, #(1 << 16); \ add y_a, temp, #2; \ add y_a, y_a, #(2 << 16); \ vmov.u32 y_x4, temp, y_a; \ \ setup_spans_adjust_edges_alternate_##alternate_active(left_index, \ right_index); \ setup_spans_prologue_b(); \ \ strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ \ 2: \ setup_spans_set_x4_alternate_##alternate_active(alternate, down); \ subs height, height, #4; \ bhi 2b; \ \ 1: \ #define setup_spans_alternate_pre_increment_yes() \ adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \ adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \ #define setup_spans_alternate_pre_increment_no() \ #define setup_spans_up_decrement_yes() \ suble height, height, #1 \ #define setup_spans_up_decrement_no() \ #define setup_spans_up(left_index, right_index, alternate, alternate_active) \ setup_spans_alternate_adjust_##alternate_active(); \ setup_spans_load_b(); \ sub y_a, y_a, #1; \ \ ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \ subs temp, temp, y_c; \ subgt height, height, temp; \ setup_spans_up_decrement_##alternate_active(); \ \ ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \ subs clip, y_a, temp; \ ble 0f; \ \ sub height, height, clip; \ sub y_a, y_a, clip; \ setup_spans_clip(decrement, alternate_active); \ \ 0: \ cmp height, #0; \ ble 1f; \ \ orr temp, y_a, y_a, lsl #16; \ sub temp, temp, #(1 << 16); \ sub y_a, temp, #2; \ sub y_a, y_a, #(2 << 16); \ vmov.u32 y_x4, temp, y_a; \ \ vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \ \ setup_spans_alternate_pre_increment_##alternate_active(); \ setup_spans_adjust_edges_alternate_##alternate_active(left_index, \ right_index); \ setup_spans_adjust_interpolants_up(); \ setup_spans_prologue_b(); \ \ strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ \ 2: \ setup_spans_set_x4_alternate_##alternate_active(alternate, up); \ subs height, height, #4; \ bhi 2b; \ \ 1: \ #define setup_spans_epilogue() \ ldmia sp!, { r4 - r11, pc } \ #define setup_spans_up_up(minor, major) \ setup_spans_prologue(); \ sub height_minor_a, y_a, y_b; \ sub height_minor_b, y_b, y_c; \ sub height, y_a, y_c; \ \ vdup.u32 x_starts, x_a; \ vmov.u32 x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_up(major, minor, minor, yes); \ setup_spans_epilogue() \ function(setup_spans_up_left) setup_spans_up_up(left, right) function(setup_spans_up_right) setup_spans_up_up(right, left) #define setup_spans_down_down(minor, major) \ setup_spans_prologue(); \ sub height_minor_a, y_b, y_a; \ sub height_minor_b, y_c, y_b; \ sub height, y_c, y_a; \ \ vdup.u32 x_starts, x_a; \ vmov.u32 x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_down(major, minor, minor, yes); \ setup_spans_epilogue() \ function(setup_spans_down_left) setup_spans_down_down(left, right) function(setup_spans_down_right) setup_spans_down_down(right, left) #define setup_spans_up_flat() \ sub height, y_a, y_c; \ \ compute_edge_delta_x2(); \ setup_spans_up(left, right, none, no); \ setup_spans_epilogue() \ function(setup_spans_up_a) setup_spans_prologue() vmov.u32 x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_up_flat() function(setup_spans_up_b) setup_spans_prologue() vdup.u32 x_starts, x_a vmov.u32 x_ends, x_b, x_c setup_spans_up_flat() #define setup_spans_down_flat() \ sub height, y_c, y_a; \ \ compute_edge_delta_x2(); \ setup_spans_down(left, right, none, no); \ setup_spans_epilogue() \ function(setup_spans_down_a) setup_spans_prologue() vmov.u32 x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_down_flat() function(setup_spans_down_b) setup_spans_prologue() vdup.u32 x_starts, x_a vmov.u32 x_ends, x_b, x_c setup_spans_down_flat() #define middle_y r9 #define edges_xy_b q11 #define edges_dx_dy_b d26 #define edge_shifts_b d27 #define edges_dx_dy_and_shifts_b q13 #define height_increment d20 #define edges_dx_dy_and_shifts q1 #define edges_xy_b_left d22 #define edges_xy_b_right d23 #define setup_spans_up_down_load_edge_set_b() \ vmov edges_xy, edges_xy_b; \ vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \ function(setup_spans_up_down) setup_spans_prologue() // s32 middle_y = y_a; sub height_minor_a, y_a, y_b sub height_minor_b, y_c, y_a sub height_major, y_c, y_b vmov.u32 x_starts, x_a, x_c vdup.u32 x_ends, x_b compute_edge_delta_x3(x_a, height_minor_a, height_major) mov temp, #0 vmov.u32 height_increment, temp, height_minor_b vmlal.s32 edges_xy, edges_dx_dy, height_increment vmov edges_xy_b_left, edge_alt_low, edge_alt_high vmov edges_xy_b_right, edges_xy_right vmov edge_shifts_b, edge_shifts vmov.u32 edge_shifts_b[0], edge_shift_alt vneg.s32 edges_dx_dy_b, edges_dx_dy vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt mov middle_y, y_a setup_spans_load_b() sub y_a, y_a, #1 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ] subs temp, temp, y_b subgt height_minor_a, height_minor_a, temp ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ] subs clip, y_a, temp ble 0f sub height_minor_a, height_minor_a, clip sub y_a, y_a, clip setup_spans_clip(decrement, no) 0: cmp height_minor_a, #0 ble 3f orr temp, y_a, y_a, lsl #16 sub temp, temp, #(1 << 16) sub y_a, temp, #2 sub y_a, y_a, #(2 << 16) vmov.u32 y_x4, temp, y_a vaddw.s32 edges_xy, edges_xy, edges_dx_dy strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ] setup_spans_adjust_edges_alternate_no(left, right); setup_spans_adjust_interpolants_up() setup_spans_up_down_load_edge_set_b() setup_spans_prologue_b() 2: setup_spans_set_x4_alternate_no(none, up) subs height_minor_a, height_minor_a, #4 bhi 2b add span_edge_data, span_edge_data, height_minor_a, lsl #3 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4 add span_b_offset, span_b_offset, height_minor_a, lsl #2 4: add temp, psx_gpu, #psx_gpu_uvrg_offset vld1.32 { uvrg }, [ temp ] mov y_a, middle_y setup_spans_load_b() ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ] subs y_c, y_c, temp subgt height_minor_b, height_minor_b, y_c addgt height_minor_b, height_minor_b, #1 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ] subs clip, temp, y_a ble 0f sub height_minor_b, height_minor_b, clip add y_a, y_a, clip setup_spans_clip(increment, no) 0: cmp height_minor_b, #0 ble 1f orr temp, y_a, y_a, lsl #16 add temp, temp, #(1 << 16) add y_a, temp, #2 add y_a, y_a, #(2 << 16) vmov.u32 y_x4, temp, y_a setup_spans_adjust_edges_alternate_no(left, right) ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ] add temp, temp, height_minor_b strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ] 2: setup_spans_set_x4_alternate_no(none, down) subs height_minor_b, height_minor_b, #4 bhi 2b 1: setup_spans_epilogue() 3: setup_spans_up_down_load_edge_set_b() setup_spans_prologue_b() bal 4b #undef span_uvrg_offset #undef span_edge_data #undef span_b_offset #undef left_x #undef b #define psx_gpu r0 #define num_spans r1 #define span_uvrg_offset r2 #define span_edge_data r3 #define span_b_offset r4 #define b_dx r5 #define span_num_blocks r6 #define y r7 #define left_x r8 #define b r9 #define dither_offset_ptr r10 #define block_ptr_a r11 #define fb_ptr r12 #define num_blocks r14 #define uvrg_dx_ptr r2 #define texture_mask_ptr r3 #define dither_shift r8 #define dither_row r10 #define c_32 r7 #define b_dx4 r8 #define b_dx8 r9 #define block_ptr_b r10 #define block_span_ptr r10 #define right_mask r8 #define color r2 #define color_r r3 #define color_g r4 #define color_b r5 #undef uvrg #define u_block q0 #define v_block q1 #define r_block q2 #define g_block q3 #define b_block q4 #define uv_dx4 d10 #define rg_dx4 d11 #define uv_dx8 d12 #define rg_dx8 d13 #define b_whole_8 d14 #define fb_mask_ptrs d15 #define uvrg_dx4 q5 #define uvrg_dx8 q6 #define uv_dx8 d12 #define rg_dx8 d13 #define u_whole q8 #define v_whole q9 #define r_whole q10 #define g_whole q11 #define b_whole q12 #define u_whole_low d16 #define u_whole_high d17 #define v_whole_low d18 #define v_whole_high d19 #define r_whole_low d20 #define r_whole_high d21 #define g_whole_low d22 #define g_whole_high d23 #define b_whole_low d24 #define b_whole_high d25 #define dx4 q13 #define dx8 q13 #define u_whole_8 d26 #define v_whole_8 d27 #define u_whole_8b d24 #define r_whole_8 d24 #define g_whole_8 d25 #define uv_whole_8 q13 #define uv_whole_8b q14 #define dither_offsets q14 #define texture_mask q15 #define texture_mask_u d30 #define texture_mask_v d31 #define dither_offsets_short d28 #define v_left_x q8 #define uvrg q9 #define block_span q10 #define uv d18 #define rg d19 #define draw_mask q1 #define draw_mask_edge q13 #define test_mask q0 #define uvrg_dx q3 #define colors q2 #define setup_blocks_texture_swizzled() \ vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \ vsli.u8 u_whole_8, v_whole_8, #4; \ vsri.u8 v_whole_8, u_whole_8b, #4 \ #define setup_blocks_texture_unswizzled() \ #define setup_blocks_shaded_textured_builder(swizzling) \ .align 3; \ \ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \ \ vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \ add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \ \ cmp num_spans, #0; \ bxeq lr; \ \ stmdb sp!, { r4 - r11, r14 }; \ vshl.u32 uvrg_dx4, uvrg_dx, #2; \ \ ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \ vshl.u32 uvrg_dx8, uvrg_dx, #3; \ \ vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \ add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \ \ add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ \ add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \ \ 0: \ vmov.u8 fb_mask_ptrs, #0; \ \ ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ \ ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \ add num_blocks, span_num_blocks, num_blocks; \ \ cmp num_blocks, #MAX_BLOCKS; \ bgt 2f; \ \ 3: \ ldr b, [ span_b_offset ]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ vdup.u32 v_left_x, left_x; \ and y, y, #0x3; \ \ ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \ add fb_ptr, fb_ptr, left_x, lsl #1; \ \ mla b, b_dx, left_x, b; \ and dither_shift, left_x, #0x03; \ \ vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \ vshr.u32 uvrg_dx, uvrg_dx4, #2; \ \ mov dither_shift, dither_shift, lsl #3; \ vmla.u32 uvrg, uvrg_dx, v_left_x; \ \ mov c_32, #32; \ subs span_num_blocks, span_num_blocks, #1; \ \ mov dither_row, dither_row, ror dither_shift; \ mov b_dx4, b_dx, lsl #2; \ \ vdup.u32 dither_offsets_short, dither_row; \ add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \ \ vdup.u32 b_block, b; \ vshll.s8 dither_offsets, dither_offsets_short, #4; \ \ vdup.u32 u_block, uv[0]; \ mov b_dx8, b_dx, lsl #3; \ \ vdup.u32 v_block, uv[1]; \ vdup.u32 r_block, rg[0]; \ vdup.u32 g_block, rg[1]; \ \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 u_block, u_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 v_block, v_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 r_block, r_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 g_block, g_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \ \ vadd.u32 b_block, b_block, block_span; \ add block_ptr_b, block_ptr_a, #16; \ \ vshrn.u32 u_whole_low, u_block, #16; \ vshrn.u32 v_whole_low, v_block, #16; \ vshrn.u32 r_whole_low, r_block, #16; \ vshrn.u32 g_whole_low, g_block, #16; \ \ vdup.u32 dx4, uv_dx4[0]; \ vshrn.u32 b_whole_low, b_block, #16; \ \ vaddhn.u32 u_whole_high, u_block, dx4; \ vdup.u32 dx4, uv_dx4[1]; \ \ vaddhn.u32 v_whole_high, v_block, dx4; \ vdup.u32 dx4, rg_dx4[0]; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, uv_dx8[0]; \ \ vadd.u32 u_block, u_block, dx8; \ vdup.u32 dx8, uv_dx8[1]; \ \ vadd.u32 v_block, v_block, dx8; \ vdup.u32 dx8, rg_dx8[0]; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ vmovn.u16 u_whole_8, u_whole; \ \ vmovn.u16 v_whole_8, v_whole; \ \ vmovn.u16 b_whole_8, b_whole; \ pld [ fb_ptr ]; \ vmov.u32 fb_mask_ptrs[1], fb_ptr; \ \ vand.u8 uv_whole_8, uv_whole_8, texture_mask; \ setup_blocks_texture_##swizzling(); \ \ vmovn.u16 r_whole_8, r_whole; \ beq 5f; \ \ 4: \ vmovn.u16 g_whole_8, g_whole; \ vshrn.u32 u_whole_low, u_block, #16; \ \ vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \ vshrn.u32 v_whole_low, v_block, #16; \ \ vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \ vshrn.u32 r_whole_low, r_block, #16; \ \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \ vshrn.u32 g_whole_low, g_block, #16; \ \ vdup.u32 dx4, uv_dx4[0]; \ vshrn.u32 b_whole_low, b_block, #16; \ \ vaddhn.u32 u_whole_high, u_block, dx4; \ vdup.u32 dx4, uv_dx4[1]; \ \ vaddhn.u32 v_whole_high, v_block, dx4; \ vdup.u32 dx4, rg_dx4[0]; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, uv_dx8[0]; \ \ vadd.u32 u_block, u_block, dx8; \ vdup.u32 dx8, uv_dx8[1]; \ \ vadd.u32 v_block, v_block, dx8; \ vdup.u32 dx8, rg_dx8[0]; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ vmovn.u16 u_whole_8, u_whole; \ \ add fb_ptr, fb_ptr, #16; \ vmovn.u16 v_whole_8, v_whole; \ \ vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \ vmovn.u16 b_whole_8, b_whole; \ \ pld [ fb_ptr ]; \ \ vmov.u32 fb_mask_ptrs[1], fb_ptr; \ subs span_num_blocks, span_num_blocks, #1; \ \ vand.u8 uv_whole_8, uv_whole_8, texture_mask; \ setup_blocks_texture_##swizzling(); \ \ vmovn.u16 r_whole_8, r_whole; \ bne 4b; \ \ 5: \ vmovn.u16 g_whole_8, g_whole; \ ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \ \ vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \ vdup.u8 draw_mask, right_mask; \ \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ \ vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \ vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \ vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \ vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \ \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ \ add span_edge_data, span_edge_data, #8; \ subs num_spans, num_spans, #1; \ \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ bne 0b; \ \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ stmdb sp!, { r0 - r3, r12, r14 }; \ bl flush_render_block_buffer; \ ldmia sp!, { r0 - r3, r12, r14 }; \ \ vpop { uvrg_dx4 }; \ vpop { texture_mask }; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ \ mov num_blocks, span_num_blocks; \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ bal 3b \ setup_blocks_shaded_textured_builder(swizzled) setup_blocks_shaded_textured_builder(unswizzled) #define setup_blocks_unshaded_textured_builder(swizzling) \ .align 3; \ \ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \ \ vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \ add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \ \ cmp num_spans, #0; \ bxeq lr; \ \ stmdb sp!, { r4 - r11, r14 }; \ vshl.u32 uvrg_dx4, uvrg_dx, #2; \ \ vshl.u32 uvrg_dx8, uvrg_dx, #3; \ \ vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \ add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \ \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ \ add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \ \ 0: \ vmov.u8 fb_mask_ptrs, #0; \ \ ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ \ ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \ add num_blocks, span_num_blocks, num_blocks; \ \ cmp num_blocks, #MAX_BLOCKS; \ bgt 2f; \ \ 3: \ add fb_ptr, fb_ptr, y, lsl #11; \ \ vdup.u32 v_left_x, left_x; \ and y, y, #0x3; \ \ ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \ add fb_ptr, fb_ptr, left_x, lsl #1; \ \ and dither_shift, left_x, #0x03; \ \ vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \ vshr.u32 uvrg_dx, uvrg_dx4, #2; \ \ mov dither_shift, dither_shift, lsl #3; \ vmla.u32 uvrg, uvrg_dx, v_left_x; \ \ mov c_32, #32; \ subs span_num_blocks, span_num_blocks, #1; \ \ mov dither_row, dither_row, ror dither_shift; \ \ vdup.u32 dither_offsets_short, dither_row; \ add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \ \ vshll.s8 dither_offsets, dither_offsets_short, #4; \ \ vdup.u32 u_block, uv[0]; \ \ vdup.u32 v_block, uv[1]; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 u_block, u_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 v_block, v_block, block_span; \ add block_ptr_b, block_ptr_a, #16; \ \ vshrn.u32 u_whole_low, u_block, #16; \ vshrn.u32 v_whole_low, v_block, #16; \ \ vdup.u32 dx4, uv_dx4[0]; \ \ vaddhn.u32 u_whole_high, u_block, dx4; \ vdup.u32 dx4, uv_dx4[1]; \ \ vaddhn.u32 v_whole_high, v_block, dx4; \ vdup.u32 dx8, uv_dx8[0]; \ \ vadd.u32 u_block, u_block, dx8; \ vdup.u32 dx8, uv_dx8[1]; \ \ vadd.u32 v_block, v_block, dx8; \ vmovn.u16 u_whole_8, u_whole; \ \ vmovn.u16 v_whole_8, v_whole; \ \ pld [ fb_ptr ]; \ vmov.u32 fb_mask_ptrs[1], fb_ptr; \ \ vand.u8 uv_whole_8, uv_whole_8, texture_mask; \ setup_blocks_texture_##swizzling(); \ \ beq 5f; \ \ 4: \ vshrn.u32 u_whole_low, u_block, #16; \ \ vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \ vshrn.u32 v_whole_low, v_block, #16; \ \ add block_ptr_b, block_ptr_b, #32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \ \ vdup.u32 dx4, uv_dx4[0]; \ vaddhn.u32 u_whole_high, u_block, dx4; \ vdup.u32 dx4, uv_dx4[1]; \ \ vaddhn.u32 v_whole_high, v_block, dx4; \ vdup.u32 dx8, uv_dx8[0]; \ \ vadd.u32 u_block, u_block, dx8; \ vdup.u32 dx8, uv_dx8[1]; \ \ vadd.u32 v_block, v_block, dx8; \ vmovn.u16 u_whole_8, u_whole; \ \ add fb_ptr, fb_ptr, #16; \ vmovn.u16 v_whole_8, v_whole; \ \ vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \ pld [ fb_ptr ]; \ \ vmov.u32 fb_mask_ptrs[1], fb_ptr; \ subs span_num_blocks, span_num_blocks, #1; \ \ vand.u8 uv_whole_8, uv_whole_8, texture_mask; \ setup_blocks_texture_##swizzling(); \ \ bne 4b; \ \ 5: \ ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \ \ vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \ vdup.u8 draw_mask, right_mask; \ \ vmov.u32 fb_mask_ptrs[0], right_mask; \ vtst.u16 draw_mask, draw_mask, test_mask; \ vzip.u8 u_whole_8, v_whole_8; \ \ vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \ add block_ptr_b, block_ptr_b, #32; \ vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \ vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \ vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \ \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_edge_data, span_edge_data, #8; \ subs num_spans, num_spans, #1; \ \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ bne 0b; \ \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { texture_mask }; \ vpush { uvrg_dx4 }; \ \ stmdb sp!, { r0 - r3, r12, r14 }; \ bl flush_render_block_buffer; \ ldmia sp!, { r0 - r3, r12, r14 }; \ \ vpop { uvrg_dx4 }; \ vpop { texture_mask }; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ \ mov num_blocks, span_num_blocks; \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ bal 3b \ setup_blocks_unshaded_textured_builder(swizzled) setup_blocks_unshaded_textured_builder(unswizzled) .align 3 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect) ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ] veor.u32 draw_mask, draw_mask, draw_mask cmp num_spans, #0 bxeq lr stmdb sp!, { r4 - r11, r14 } vld1.u32 { test_mask }, [ psx_gpu, :128 ] ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ] ubfx color_r, color, #3, #5 ubfx color_g, color, #11, #5 ubfx color_b, color, #19, #5 orr color, color_r, color_b, lsl #10 orr color, color, color_g, lsl #5 vdup.u16 colors, color ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset add block_ptr_a, block_ptr_a, num_blocks, lsl #6 0: ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ] ldrh y, [ span_edge_data, #edge_data_y_offset ] ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] cmp span_num_blocks, #0 beq 1f ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ] add num_blocks, span_num_blocks, num_blocks cmp num_blocks, #MAX_BLOCKS bgt 2f 3: add fb_ptr, fb_ptr, y, lsl #11 and y, y, #0x3 add fb_ptr, fb_ptr, left_x, lsl #1 mov c_32, #32 subs span_num_blocks, span_num_blocks, #1 add block_ptr_b, block_ptr_a, #16 pld [ fb_ptr ] vmov.u32 fb_mask_ptrs[1], fb_ptr beq 5f 4: vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32 add fb_ptr, fb_ptr, #16 add block_ptr_b, block_ptr_b, #32 pld [ fb_ptr ] vmov.u32 fb_mask_ptrs[1], fb_ptr subs span_num_blocks, span_num_blocks, #1 bne 4b 5: ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ] vdup.u8 draw_mask_edge, right_mask vtst.u16 draw_mask_edge, draw_mask_edge, test_mask vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32 add block_ptr_b, block_ptr_b, #32 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32 1: add span_edge_data, span_edge_data, #8 subs num_spans, num_spans, #1 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] bne 0b ldmia sp!, { r4 - r11, pc } 2: vpush { colors } stmdb sp!, { r0 - r3, r12, r14 } bl flush_render_block_buffer ldmia sp!, { r0 - r3, r12, r14 } vpop { colors } vld1.u32 { test_mask }, [ psx_gpu, :128 ] veor.u32 draw_mask, draw_mask, draw_mask mov num_blocks, span_num_blocks add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset bal 3b #define mask_msb_scalar r14 #define msb_mask q15 #define pixels_low d16 #define msb_mask_low d30 #define msb_mask_high d31 .align 3 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ] cmp num_spans, #0 bxeq lr stmdb sp!, { r4 - r11, r14 } ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ] ubfx color_r, color, #3, #5 ubfx color_g, color, #11, #5 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ] ubfx color_b, color, #19, #5 orr color, color_r, color_b, lsl #10 orr color, color, color_g, lsl #5 orr color, color, mask_msb_scalar vdup.u16 colors, color add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset 0: ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ] ldrh y, [ span_edge_data, #edge_data_y_offset ] ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] cmp span_num_blocks, #0 beq 1f ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ] add fb_ptr, fb_ptr, y, lsl #11 subs span_num_blocks, span_num_blocks, #1 add fb_ptr, fb_ptr, left_x, lsl #1 beq 3f 2: vst1.u32 { colors }, [ fb_ptr ]! subs span_num_blocks, span_num_blocks, #1 bne 2b 3: ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ] eor right_mask, right_mask, #0xFF 4: strh color, [ fb_ptr ], #2 movs right_mask, right_mask, lsr #1 bne 4b 1: add span_edge_data, span_edge_data, #8 subs num_spans, num_spans, #1 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] bne 0b ldmia sp!, { r4 - r11, pc } #undef c_64 #define c_64 r7 #define rg_dx_ptr r2 #undef r_block #undef g_block #undef b_block #undef r_whole #undef g_whole #undef b_whole #undef r_whole_low #undef r_whole_high #undef g_whole_low #undef g_whole_high #undef b_whole_low #undef b_whole_high #undef r_whole_8 #undef g_whole_8 #undef b_whole_8 #undef dither_offsets #undef rg_dx4 #undef rg_dx8 #undef dx4 #undef dx8 #undef v_left_x #undef uvrg #undef block_span #undef rg #undef draw_mask #undef test_mask #define r_block q0 #define g_block q1 #define b_block q2 #define r_whole q3 #define g_whole q4 #define b_whole q5 #define r_whole_low d6 #define r_whole_high d7 #define g_whole_low d8 #define g_whole_high d9 #define b_whole_low d10 #define b_whole_high d11 #define gb_whole_8 q6 #define g_whole_8 d12 #define b_whole_8 d13 #define r_whole_8 d14 #define pixels q8 #define rg_dx4 d18 #define rg_dx8 d19 #define dx4 q10 #define dx8 q10 #define v_left_x d6 #define uvrg q4 #define block_span q5 #define rg d9 #define d64_1 d22 #define d64_128 d23 #define d128_4 q12 #define d128_0x7 q13 #define d64_4 d24 #define dither_offsets q14 #define draw_mask q15 #define dither_offsets_low d28 #define rg_dx d0 #define test_mask q10 #define setup_blocks_shaded_untextured_dither_a_dithered() \ vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \ vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \ #define setup_blocks_shaded_untextured_dither_b_dithered() \ vqsub.u8 r_whole_8, r_whole_8, d64_4; \ vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \ #define setup_blocks_shaded_untextured_dither_a_undithered() \ #define setup_blocks_shaded_untextured_dither_b_undithered() \ #define setup_blocks_shaded_untextured_indirect_builder(dithering) \ .align 3; \ \ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \ \ vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \ \ cmp num_spans, #0; \ bxeq lr; \ \ stmdb sp!, { r4 - r11, r14 }; \ vshl.u32 rg_dx4, rg_dx, #2; \ \ ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \ vshl.u32 rg_dx8, rg_dx, #3; \ \ add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \ \ add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ \ add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \ vmov.u8 d64_1, #1; \ \ vmov.u8 d128_4, #4; \ vmov.u8 d64_128, #128; \ \ vmov.u8 d128_0x7, #0x7; \ \ 0: \ ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ \ ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \ add num_blocks, span_num_blocks, num_blocks; \ \ cmp num_blocks, #MAX_BLOCKS; \ bgt 2f; \ \ 3: \ ldr b, [ span_b_offset ]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ vdup.u32 v_left_x, left_x; \ and y, y, #0x3; \ \ ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \ add fb_ptr, fb_ptr, left_x, lsl #1; \ \ mla b, b_dx, left_x, b; \ and dither_shift, left_x, #0x03; \ \ vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \ vshr.u32 rg_dx, rg_dx4, #2; \ \ mov dither_shift, dither_shift, lsl #3; \ vmla.u32 rg, rg_dx, v_left_x; \ \ mov c_64, #64; \ subs span_num_blocks, span_num_blocks, #1; \ \ mov dither_row, dither_row, ror dither_shift; \ mov b_dx4, b_dx, lsl #2; \ \ vdup.u32 dither_offsets, dither_row; \ add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \ \ vdup.u32 b_block, b; \ vadd.u8 dither_offsets, dither_offsets, d128_4; \ \ mov b_dx8, b_dx, lsl #3; \ vdup.u32 r_block, rg[0]; \ vdup.u32 g_block, rg[1]; \ \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 r_block, r_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 g_block, g_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \ \ vadd.u32 b_block, b_block, block_span; \ add block_ptr_b, block_ptr_a, #16; \ \ vshrn.u32 r_whole_low, r_block, #16; \ vshrn.u32 g_whole_low, g_block, #16; \ vshrn.u32 b_whole_low, b_block, #16; \ vdup.u32 dx4, rg_dx4[0]; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, rg_dx8[0]; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ \ vmovn.u16 r_whole_8, r_whole; \ vmovn.u16 g_whole_8, g_whole; \ vmovn.u16 b_whole_8, b_whole; \ \ beq 5f; \ veor.u32 draw_mask, draw_mask, draw_mask; \ \ 4: \ setup_blocks_shaded_untextured_dither_a_##dithering(); \ vshrn.u32 r_whole_low, r_block, #16; \ \ setup_blocks_shaded_untextured_dither_b_##dithering(); \ vshrn.u32 g_whole_low, g_block, #16; \ \ vshrn.u32 b_whole_low, b_block, #16; \ str fb_ptr, [ block_ptr_a, #44 ]; \ \ vdup.u32 dx4, rg_dx4[0]; \ vshr.u8 r_whole_8, r_whole_8, #3; \ vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, rg_dx8[0]; \ \ vmull.u8 pixels, r_whole_8, d64_1; \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ add fb_ptr, fb_ptr, #16; \ \ vmovn.u16 r_whole_8, r_whole; \ vmovn.u16 g_whole_8, g_whole; \ vmovn.u16 b_whole_8, b_whole; \ \ vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \ vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \ \ pld [ fb_ptr ]; \ \ subs span_num_blocks, span_num_blocks, #1; \ bne 4b; \ \ 5: \ str fb_ptr, [ block_ptr_a, #44 ]; \ setup_blocks_shaded_untextured_dither_a_##dithering(); \ \ ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \ setup_blocks_shaded_untextured_dither_b_##dithering(); \ \ vshr.u8 r_whole_8, r_whole_8, #3; \ vdup.u8 draw_mask, right_mask; \ \ vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \ vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \ \ vtst.u16 draw_mask, draw_mask, test_mask; \ \ vmull.u8 pixels, r_whole_8, d64_1; \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \ vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \ \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ \ add span_edge_data, span_edge_data, #8; \ subs num_spans, num_spans, #1; \ \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ bne 0b; \ \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ /* TODO: Load from psx_gpu instead of saving/restoring these */\ vpush { rg_dx4 }; \ \ stmdb sp!, { r0 - r3, r12, r14 }; \ bl flush_render_block_buffer; \ ldmia sp!, { r0 - r3, r12, r14 }; \ \ vpop { rg_dx4 }; \ \ vmov.u8 d64_1, #1; \ vmov.u8 d128_4, #4; \ vmov.u8 d64_128, #128; \ vmov.u8 d128_0x7, #0x7; \ \ vadd.u32 rg_dx8, rg_dx4, rg_dx4; \ \ mov num_blocks, span_num_blocks; \ add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \ bal 3b \ setup_blocks_shaded_untextured_indirect_builder(undithered) setup_blocks_shaded_untextured_indirect_builder(dithered) #undef draw_mask #define mask_msb_ptr r14 #define draw_mask q0 #define pixels_low d16 #define setup_blocks_shaded_untextured_direct_builder(dithering) \ .align 3; \ \ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \ ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \ add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \ \ vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \ \ cmp num_spans, #0; \ bxeq lr; \ \ stmdb sp!, { r4 - r11, r14 }; \ vshl.u32 rg_dx4, rg_dx, #2; \ \ ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \ vshl.u32 rg_dx8, rg_dx, #3; \ \ add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \ add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \ \ add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \ vmov.u8 d64_1, #1; \ \ vmov.u8 d128_4, #4; \ vmov.u8 d64_128, #128; \ \ vmov.u8 d128_0x7, #0x7; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ 0: \ ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \ add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \ \ ldrh y, [ span_edge_data, #edge_data_y_offset ]; \ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ \ cmp span_num_blocks, #0; \ beq 1f; \ \ ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ ldr b, [ span_b_offset ]; \ vdup.u32 v_left_x, left_x; \ and y, y, #0x3; \ \ ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \ add fb_ptr, fb_ptr, left_x, lsl #1; \ \ mla b, b_dx, left_x, b; \ and dither_shift, left_x, #0x03; \ \ vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \ vshr.u32 rg_dx, rg_dx4, #2; \ \ mov dither_shift, dither_shift, lsl #3; \ vmla.u32 rg, rg_dx, v_left_x; \ \ subs span_num_blocks, span_num_blocks, #1; \ \ mov dither_row, dither_row, ror dither_shift; \ mov b_dx4, b_dx, lsl #2; \ \ vdup.u32 dither_offsets, dither_row; \ add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \ \ vdup.u32 b_block, b; \ vadd.u8 dither_offsets, dither_offsets, d128_4; \ \ mov b_dx8, b_dx, lsl #3; \ vdup.u32 r_block, rg[0]; \ vdup.u32 g_block, rg[1]; \ \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 r_block, r_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \ \ vadd.u32 g_block, g_block, block_span; \ vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \ \ vadd.u32 b_block, b_block, block_span; \ add block_ptr_b, block_ptr_a, #16; \ \ vshrn.u32 r_whole_low, r_block, #16; \ vshrn.u32 g_whole_low, g_block, #16; \ vshrn.u32 b_whole_low, b_block, #16; \ vdup.u32 dx4, rg_dx4[0]; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, rg_dx8[0]; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ \ vmovn.u16 r_whole_8, r_whole; \ vmovn.u16 g_whole_8, g_whole; \ vmovn.u16 b_whole_8, b_whole; \ \ beq 3f; \ \ 2: \ setup_blocks_shaded_untextured_dither_a_##dithering(); \ vshrn.u32 r_whole_low, r_block, #16; \ \ setup_blocks_shaded_untextured_dither_b_##dithering(); \ vshrn.u32 g_whole_low, g_block, #16; \ \ vshrn.u32 b_whole_low, b_block, #16; \ \ vdup.u32 dx4, rg_dx4[0]; \ vshr.u8 r_whole_8, r_whole_8, #3; \ vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \ \ vaddhn.u32 r_whole_high, r_block, dx4; \ vdup.u32 dx4, rg_dx4[1]; \ \ vmov pixels, msb_mask; \ vaddhn.u32 g_whole_high, g_block, dx4; \ vdup.u32 dx4, b_dx4; \ \ vaddhn.u32 b_whole_high, b_block, dx4; \ vdup.u32 dx8, rg_dx8[0]; \ \ vmlal.u8 pixels, r_whole_8, d64_1; \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ vadd.u32 r_block, r_block, dx8; \ vdup.u32 dx8, rg_dx8[1]; \ \ vadd.u32 g_block, g_block, dx8; \ vdup.u32 dx8, b_dx8; \ \ vadd.u32 b_block, b_block, dx8; \ \ vmovn.u16 r_whole_8, r_whole; \ vmovn.u16 g_whole_8, g_whole; \ vmovn.u16 b_whole_8, b_whole; \ \ vst1.u32 { pixels }, [ fb_ptr ]!; \ subs span_num_blocks, span_num_blocks, #1; \ bne 2b; \ \ 3: \ setup_blocks_shaded_untextured_dither_a_##dithering(); \ \ ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \ setup_blocks_shaded_untextured_dither_b_##dithering(); \ \ vshr.u8 r_whole_8, r_whole_8, #3; \ vmov pixels, msb_mask; \ vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \ eor right_mask, right_mask, #0xFF; \ \ vmlal.u8 pixels, r_whole_8, d64_1; \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ 4: \ vst1.u16 { pixels_low[0] }, [ fb_ptr ]!; \ vext.16 pixels, pixels, #1; \ movs right_mask, right_mask, lsr #1; \ bne 4b; \ \ 1: \ add span_uvrg_offset, span_uvrg_offset, #16; \ add span_b_offset, span_b_offset, #4; \ \ add span_edge_data, span_edge_data, #8; \ subs num_spans, num_spans, #1; \ \ bne 0b; \ \ ldmia sp!, { r4 - r11, pc } \ setup_blocks_shaded_untextured_direct_builder(undithered) setup_blocks_shaded_untextured_direct_builder(dithered) #undef psx_gpu #undef num_blocks #undef triangle #undef c_64 #define psx_gpu r0 #define block_ptr r1 #define num_blocks r2 #define uv_01 r3 #define uv_23 r4 #define uv_45 r5 #define uv_67 r6 #define uv_0 r7 #define uv_1 r3 #define uv_2 r8 #define uv_3 r4 #define uv_4 r9 #define uv_5 r5 #define uv_6 r10 #define uv_7 r6 #define texture_ptr r11 #define pixel_0 r7 #define pixel_1 r3 #define pixel_2 r8 #define pixel_3 r4 #define pixel_4 r9 #define pixel_5 r5 #define pixel_6 r10 #define pixel_7 r6 #define pixels_a r7 #define pixels_b r9 #define pixels_c r8 #define pixels_d r10 #define c_64 r0 #define clut_ptr r12 #define current_texture_mask r5 #define dirty_textures_mask r6 #define texels d0 #define clut_low_a d2 #define clut_low_b d3 #define clut_high_a d4 #define clut_high_b d5 #define clut_a q1 #define clut_b q2 #define texels_low d6 #define texels_high d7 .align 3 function(texture_blocks_untextured) bx lr .align 3 function(texture_blocks_4bpp) stmdb sp!, { r3 - r11, r14 } add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ] vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ] ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ] vuzp.u8 clut_a, clut_b ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ] tst dirty_textures_mask, current_texture_mask bne 1f mov c_64, #64 0: ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 } uxtah uv_0, texture_ptr, uv_01 uxtah uv_1, texture_ptr, uv_01, ror #16 uxtah uv_2, texture_ptr, uv_23 uxtah uv_3, texture_ptr, uv_23, ror #16 uxtah uv_4, texture_ptr, uv_45 ldrb pixel_0, [ uv_0 ] uxtah uv_5, texture_ptr, uv_45, ror #16 ldrb pixel_1, [ uv_1 ] uxtah uv_6, texture_ptr, uv_67 ldrb pixel_2, [ uv_2 ] uxtah uv_7, texture_ptr, uv_67, ror #16 ldrb pixel_3, [ uv_3 ] ldrb pixel_4, [ uv_4 ] subs num_blocks, num_blocks, #1 ldrb pixel_5, [ uv_5 ] orr pixels_a, pixel_0, pixel_1, lsl #8 ldrb pixel_6, [ uv_6 ] orr pixels_b, pixel_4, pixel_5, lsl #8 ldrb pixel_7, [ uv_7 ] orr pixels_a, pixels_a, pixel_2, lsl #16 orr pixels_b, pixels_b, pixel_6, lsl #16 orr pixels_a, pixels_a, pixel_3, lsl #24 orr pixels_b, pixels_b, pixel_7, lsl #24 vmov.u32 texels, pixels_a, pixels_b vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64 bne 0b ldmia sp!, { r3 - r11, pc } 1: stmdb sp!, { r1 - r2 } bl update_texture_4bpp_cache mov c_64, #64 ldmia sp!, { r1 - r2 } bal 0b .align 3 function(texture_blocks_8bpp) stmdb sp!, { r3 - r11, r14 } add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ] ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ] ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ] tst dirty_textures_mask, current_texture_mask bne 1f nop 0: ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 } uxtah uv_0, texture_ptr, uv_01 uxtah uv_1, texture_ptr, uv_01, ror #16 uxtah uv_2, texture_ptr, uv_23 uxtah uv_3, texture_ptr, uv_23, ror #16 uxtah uv_4, texture_ptr, uv_45 ldrb pixel_0, [ uv_0 ] uxtah uv_5, texture_ptr, uv_45, ror #16 ldrb pixel_1, [ uv_1 ] uxtah uv_6, texture_ptr, uv_67 ldrb pixel_2, [ uv_2 ] uxtah uv_7, texture_ptr, uv_67, ror #16 ldrb pixel_3, [ uv_3 ] ldrb pixel_4, [ uv_4 ] add pixel_0, pixel_0, pixel_0 ldrb pixel_5, [ uv_5 ] add pixel_1, pixel_1, pixel_1 ldrb pixel_6, [ uv_6 ] add pixel_2, pixel_2, pixel_2 ldrb pixel_7, [ uv_7 ] add pixel_3, pixel_3, pixel_3 ldrh pixel_0, [ clut_ptr, pixel_0 ] add pixel_4, pixel_4, pixel_4 ldrh pixel_1, [ clut_ptr, pixel_1 ] add pixel_5, pixel_5, pixel_5 ldrh pixel_2, [ clut_ptr, pixel_2 ] add pixel_6, pixel_6, pixel_6 ldrh pixel_3, [ clut_ptr, pixel_3 ] add pixel_7, pixel_7, pixel_7 ldrh pixel_4, [ clut_ptr, pixel_4 ] orr pixels_a, pixel_0, pixel_1, lsl #16 ldrh pixel_5, [ clut_ptr, pixel_5 ] orr pixels_c, pixel_2, pixel_3, lsl #16 ldrh pixel_6, [ clut_ptr, pixel_6 ] subs num_blocks, num_blocks, #1 ldrh pixel_7, [ clut_ptr, pixel_7 ] orr pixels_b, pixel_4, pixel_5, lsl #16 orr pixels_d, pixel_6, pixel_7, lsl #16 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } add block_ptr, block_ptr, #64 bne 0b ldmia sp!, { r3 - r11, pc } 1: stmdb sp!, { r1 - r2, r12 } bl update_texture_8bpp_cache ldmia sp!, { r1 - r2, r12 } bal 0b #undef uv_0 #undef uv_1 #undef uv_2 #undef uv_3 #undef uv_4 #undef uv_5 #undef uv_6 #undef uv_7 #undef pixel_0 #undef pixel_1 #undef pixel_2 #undef pixel_3 #undef pixel_4 #undef pixel_5 #undef pixel_6 #undef pixel_7 #undef texture_ptr #undef pixels_a #undef pixels_b #undef pixels_c #undef pixels_d #define psx_gpu r0 #define block_ptr r1 #define num_blocks r2 #define uv_0 r3 #define uv_1 r4 #define u_0 r3 #define u_1 r4 #define v_0 r5 #define v_1 r6 #define uv_2 r5 #define uv_3 r6 #define u_2 r5 #define u_3 r6 #define v_2 r7 #define v_3 r8 #define uv_4 r7 #define uv_5 r8 #define u_4 r7 #define u_5 r8 #define v_4 r9 #define v_5 r10 #define uv_6 r9 #define uv_7 r10 #define u_6 r9 #define u_7 r10 #define v_6 r11 #define v_7 r0 #define pixel_0 r3 #define pixel_1 r4 #define pixel_2 r5 #define pixel_3 r6 #define pixel_4 r7 #define pixel_5 r8 #define pixel_6 r9 #define pixel_7 r10 #define pixels_a r3 #define pixels_b r5 #define pixels_c r7 #define pixels_d r9 #define texture_ptr r12 .align 3 function(texture_blocks_16bpp) stmdb sp!, { r3 - r11, r14 } add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] 0: ldrh uv_0, [ block_ptr ] subs num_blocks, num_blocks, #1 ldrh uv_1, [ block_ptr, #2 ] and v_0, uv_0, #0xFF00 and v_1, uv_1, #0xFF00 and u_0, uv_0, #0xFF and u_1, uv_1, #0xFF add uv_0, u_0, v_0, lsl #2 ldrh uv_2, [ block_ptr, #4 ] add uv_1, u_1, v_1, lsl #2 ldrh uv_3, [ block_ptr, #6 ] add uv_0, uv_0, uv_0 add uv_1, uv_1, uv_1 and v_2, uv_2, #0xFF00 and v_3, uv_3, #0xFF00 and u_2, uv_2, #0xFF and u_3, uv_3, #0xFF add uv_2, u_2, v_2, lsl #2 ldrh uv_4, [ block_ptr, #8 ] add uv_3, u_3, v_3, lsl #2 ldrh uv_5, [ block_ptr, #10 ] add uv_2, uv_2, uv_2 add uv_3, uv_3, uv_3 and v_4, uv_4, #0xFF00 and v_5, uv_5, #0xFF00 and u_4, uv_4, #0xFF and u_5, uv_5, #0xFF add uv_4, u_4, v_4, lsl #2 ldrh uv_6, [ block_ptr, #12 ] add uv_5, u_5, v_5, lsl #2 ldrh uv_7, [ block_ptr, #14 ] add uv_4, uv_4, uv_4 ldrh pixel_0, [ texture_ptr, uv_0 ] add uv_5, uv_5, uv_5 ldrh pixel_1, [ texture_ptr, uv_1 ] and v_6, uv_6, #0xFF00 ldrh pixel_2, [ texture_ptr, uv_2 ] and v_7, uv_7, #0xFF00 ldrh pixel_3, [ texture_ptr, uv_3 ] and u_6, uv_6, #0xFF ldrh pixel_4, [ texture_ptr, uv_4 ] and u_7, uv_7, #0xFF ldrh pixel_5, [ texture_ptr, uv_5 ] add uv_6, u_6, v_6, lsl #2 add uv_7, u_7, v_7, lsl #2 add uv_6, uv_6, uv_6 add uv_7, uv_7, uv_7 orr pixels_a, pixel_0, pixel_1, lsl #16 orr pixels_b, pixel_2, pixel_3, lsl #16 ldrh pixel_6, [ texture_ptr, uv_6 ] orr pixels_c, pixel_4, pixel_5, lsl #16 ldrh pixel_7, [ texture_ptr, uv_7 ] orr pixels_d, pixel_6, pixel_7, lsl #16 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d } add block_ptr, block_ptr, #64 bne 0b ldmia sp!, { r3 - r11, pc } #undef num_blocks #undef test_mask #undef texels #undef pixels_b #undef pixels #undef d64_1 #undef d64_4 #undef d64_128 #undef draw_mask #undef msb_mask #undef msb_mask_low #undef msb_mask_high #undef fb_pixels #undef c_32 #undef fb_ptr #undef mask_msb_ptr #define psx_gpu r0 #define num_blocks r1 #define color_ptr r2 #define mask_msb_ptr r2 #define block_ptr_load_a r0 #define block_ptr_store r3 #define block_ptr_load_b r12 #define c_32 r2 #define c_48 r4 #define fb_ptr r14 #define draw_mask_bits_scalar r5 #define d128_0x07 q0 #define d128_0x1F q1 #define d128_0x8000 q2 #define test_mask q3 #define texels q4 #define colors_rg q5 #define colors_b_dm_bits q6 #define texels_rg q7 #define pixels_r q8 #define pixels_g q9 #define pixels_b q10 #define pixels q11 #define zero_mask q4 #define draw_mask q12 #define msb_mask q13 #define fb_pixels q8 #define pixels_gb_low q9 #define colors_r d10 #define colors_g d11 #define colors_b d12 #define draw_mask_bits d13 #define texels_r d14 #define texels_g d15 #define pixels_r_low d16 #define pixels_g_low d18 #define pixels_b_low d19 #define msb_mask_low d26 #define msb_mask_high d27 #define d64_1 d28 #define d64_4 d29 #define d64_128 d30 #define texels_b d31 #define shade_blocks_textured_modulated_prologue_indirect() \ mov c_48, #48; \ add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \ #define shade_blocks_textured_modulated_prologue_direct() \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \ #define shade_blocks_textured_modulated_prologue_shaded() \ #define shade_blocks_textured_modulated_prologue_unshaded() \ add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \ vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \ vdup.u8 colors_g, colors_r[1]; \ vdup.u8 colors_b, colors_r[2]; \ vdup.u8 colors_r, colors_r[0] \ #define shade_blocks_textured_modulated_load_dithered(target) \ vld1.u32 { target }, [ block_ptr_load_b, :128 ] \ #define shade_blocks_textured_modulated_load_last_dithered(target) \ vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \ #define shade_blocks_textured_modulated_load_undithered(target) \ #define shade_blocks_textured_modulated_load_last_undithered(target) \ add block_ptr_load_b, block_ptr_load_b, #32 \ #define shade_blocks_textured_modulate_dithered(channel) \ vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \ #define shade_blocks_textured_modulate_undithered(channel) \ vmull.u8 pixels_##channel, texels_##channel, colors_##channel \ #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \ vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \ #define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \ ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \ vld1.u32 { fb_pixels }, [ fb_ptr ]; \ vbit.u16 pixels, fb_pixels, draw_mask \ #define shade_blocks_textured_modulated_store_pixels_indirect() \ vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \ #define shade_blocks_textured_modulated_store_pixels_direct() \ vst1.u32 { pixels }, [ fb_ptr ] \ #define shade_blocks_textured_modulated_load_rg_shaded() \ vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \ #define shade_blocks_textured_modulated_load_rg_unshaded() \ add block_ptr_load_b, block_ptr_load_b, #32 \ #define shade_blocks_textured_modulated_load_bdm_shaded() \ vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \ #define shade_blocks_textured_modulated_load_bdm_unshaded() \ ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \ add block_ptr_load_a, block_ptr_load_a, #32 \ #define shade_blocks_textured_modulated_expand_draw_mask_shaded() \ vdup.u16 draw_mask, draw_mask_bits[0] \ #define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \ vdup.u16 draw_mask, draw_mask_bits_scalar \ #define shade_blocks_textured_modulated_apply_msb_mask_indirect() \ #define shade_blocks_textured_modulated_apply_msb_mask_direct() \ vorr.u16 pixels, pixels, msb_mask \ #define shade_blocks_textured_modulated_builder(shading, dithering, target) \ .align 3; \ \ function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \ stmdb sp!, { r4 - r5, lr }; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \ \ shade_blocks_textured_modulated_prologue_##target(); \ shade_blocks_textured_modulated_prologue_##shading(); \ \ add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \ mov c_32, #32; \ \ add block_ptr_load_b, block_ptr_load_a, #16; \ vmov.u8 d64_1, #1; \ vmov.u8 d64_4, #4; \ vmov.u8 d64_128, #128; \ \ vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \ vmov.u8 d128_0x07, #0x07; \ \ shade_blocks_textured_modulated_load_rg_##shading(); \ vmov.u8 d128_0x1F, #0x1F; \ \ shade_blocks_textured_modulated_load_bdm_##shading(); \ vmov.u16 d128_0x8000, #0x8000; \ \ vmovn.u16 texels_r, texels; \ vshrn.u16 texels_g, texels, #5; \ \ vshrn.u16 texels_b, texels, #7; \ shade_blocks_textured_modulated_expand_draw_mask_##shading(); \ \ shade_blocks_textured_modulated_load_##dithering(pixels_r); \ vtst.u16 draw_mask, draw_mask, test_mask; \ \ shade_blocks_textured_modulated_load_##dithering(pixels_g); \ vand.u8 texels_rg, texels_rg, d128_0x1F; \ \ shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \ vshr.u8 texels_b, texels_b, #3; \ \ shade_blocks_textured_modulate_##dithering(r); \ shade_blocks_textured_modulate_##dithering(g); \ shade_blocks_textured_modulate_##dithering(b); \ \ vand.u16 pixels, texels, d128_0x8000; \ vceq.u16 zero_mask, texels, #0; \ \ vqshrun.s16 pixels_r_low, pixels_r, #4; \ vqshrun.s16 pixels_g_low, pixels_g, #4; \ vqshrun.s16 pixels_b_low, pixels_b, #4; \ \ shade_blocks_textured_modulated_apply_msb_mask_##target(); \ vorr.u16 draw_mask, draw_mask, zero_mask; \ vshr.u8 pixels_r_low, pixels_r_low, #3; \ vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ .align 3; \ \ 0: \ vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \ shade_blocks_textured_modulated_load_rg_##shading(); \ vshrn.u16 texels_g, texels, #5; \ \ shade_blocks_textured_modulated_load_bdm_##shading(); \ vshrn.u16 texels_b, texels, #7; \ \ vmovn.u16 texels_r, texels; \ vmlal.u8 pixels, pixels_r_low, d64_1; \ \ vmlal.u8 pixels, pixels_g_low, d64_4; \ vmlal.u8 pixels, pixels_b_low, d64_128; \ shade_blocks_textured_modulated_store_draw_mask_##target(-4); \ \ shade_blocks_textured_modulated_load_##dithering(pixels_r); \ shade_blocks_textured_modulated_expand_draw_mask_##shading(); \ \ shade_blocks_textured_modulated_load_##dithering(pixels_g); \ vand.u8 texels_rg, texels_rg, d128_0x1F; \ \ shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \ vtst.u16 draw_mask, draw_mask, test_mask; \ \ shade_blocks_textured_modulated_store_pixels_##target(); \ vshr.u8 texels_b, texels_b, #3; \ \ shade_blocks_textured_modulate_##dithering(r); \ shade_blocks_textured_modulate_##dithering(g); \ shade_blocks_textured_modulate_##dithering(b); \ \ vand.u16 pixels, texels, d128_0x8000; \ vceq.u16 zero_mask, texels, #0; \ \ subs num_blocks, num_blocks, #1; \ \ vqshrun.s16 pixels_r_low, pixels_r, #4; \ vqshrun.s16 pixels_g_low, pixels_g, #4; \ vqshrun.s16 pixels_b_low, pixels_b, #4; \ \ shade_blocks_textured_modulated_apply_msb_mask_##target(); \ vorr.u16 draw_mask, draw_mask, zero_mask; \ vshr.u8 pixels_r_low, pixels_r_low, #3; \ vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \ \ bne 0b; \ \ 1: \ vmlal.u8 pixels, pixels_r_low, d64_1; \ vmlal.u8 pixels, pixels_g_low, d64_4; \ vmlal.u8 pixels, pixels_b_low, d64_128; \ \ shade_blocks_textured_modulated_store_draw_mask_##target(28); \ shade_blocks_textured_modulated_store_pixels_##target(); \ \ ldmia sp!, { r4 - r5, pc } \ shade_blocks_textured_modulated_builder(shaded, dithered, direct); shade_blocks_textured_modulated_builder(shaded, undithered, direct); shade_blocks_textured_modulated_builder(unshaded, dithered, direct); shade_blocks_textured_modulated_builder(unshaded, undithered, direct); shade_blocks_textured_modulated_builder(shaded, dithered, indirect); shade_blocks_textured_modulated_builder(shaded, undithered, indirect); shade_blocks_textured_modulated_builder(unshaded, dithered, indirect); shade_blocks_textured_modulated_builder(unshaded, undithered, indirect); #undef c_64 #undef fb_ptr #undef color_ptr #undef color_r #undef color_g #undef color_b #undef test_mask #undef pixels #undef draw_mask #undef zero_mask #undef fb_pixels #undef msb_mask #undef msb_mask_low #undef msb_mask_high #define psx_gpu r0 #define num_blocks r1 #define mask_msb_ptr r2 #define color_ptr r3 #define block_ptr_load r0 #define draw_mask_store_ptr r3 #define draw_mask_bits_ptr r12 #define draw_mask_ptr r12 #define pixel_store_ptr r14 #define fb_ptr_cmp r4 #define fb_ptr r3 #define fb_ptr_next r14 #define c_64 r2 #define test_mask q0 #define pixels q1 #define draw_mask q2 #define zero_mask q3 #define draw_mask_combined q4 #define fb_pixels q5 #define fb_pixels_next q6 #define msb_mask q7 #define draw_mask_low d4 #define draw_mask_high d5 #define msb_mask_low d14 #define msb_mask_high d15 .align 3 function(shade_blocks_textured_unmodulated_indirect) str r14, [ sp, #-4 ] add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40) ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16) vld1.u32 { test_mask }, [ psx_gpu, :128 ] add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset mov c_64, #64 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \ [ draw_mask_bits_ptr, :16 ], c_64 vceq.u16 zero_mask, pixels, #0 vtst.u16 draw_mask, draw_mask, test_mask vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64 subs num_blocks, num_blocks, #1 beq 1f 0: vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 vorr.u16 draw_mask_combined, draw_mask, zero_mask vld1.u16 { draw_mask_low[], draw_mask_high[] }, \ [ draw_mask_bits_ptr, :16 ], c_64 vceq.u16 zero_mask, pixels, #0 vtst.u16 draw_mask, draw_mask, test_mask vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64 vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64 subs num_blocks, num_blocks, #1 bne 0b 1: vorr.u16 draw_mask_combined, draw_mask, zero_mask vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64 ldr pc, [ sp, #-4 ] .align 3 function(shade_blocks_textured_unmodulated_direct) stmdb sp!, { r4, r14 } add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40) ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] mov c_64, #64 vld1.u32 { test_mask }, [ psx_gpu, :128 ] add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset vld1.u16 { draw_mask_low[], draw_mask_high[] }, \ [ draw_mask_bits_ptr, :16 ], c_64 ldr fb_ptr_next, [ block_ptr_load, #44 ] vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] vceq.u16 zero_mask, pixels, #0 vtst.u16 draw_mask, draw_mask, test_mask subs num_blocks, num_blocks, #1 beq 1f 0: mov fb_ptr, fb_ptr_next ldr fb_ptr_next, [ block_ptr_load, #44 ] vorr.u16 pixels, pixels, msb_mask vorr.u16 draw_mask_combined, draw_mask, zero_mask vmov fb_pixels, fb_pixels_next vld1.u16 { draw_mask_low[], draw_mask_high[] }, \ [ draw_mask_bits_ptr, :16 ], c_64 vbif.u16 fb_pixels, pixels, draw_mask_combined vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64 sub fb_ptr_cmp, fb_ptr_next, fb_ptr add fb_ptr_cmp, fb_ptr_cmp, #14 cmp fb_ptr_cmp, #28 bls 4f vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] vceq.u16 zero_mask, pixels, #0 vst1.u16 { fb_pixels }, [ fb_ptr ] vtst.u16 draw_mask, draw_mask, test_mask 3: subs num_blocks, num_blocks, #1 bne 0b 1: vorr.u16 draw_mask_combined, draw_mask, zero_mask vbif.u16 fb_pixels_next, pixels, draw_mask_combined vst1.u16 { fb_pixels_next }, [ fb_ptr_next ] ldmia sp!, { r4, pc } 4: vst1.u16 { fb_pixels }, [ fb_ptr ] vceq.u16 zero_mask, pixels, #0 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] vtst.u16 draw_mask, draw_mask, test_mask bal 3b function(shade_blocks_unshaded_untextured_indirect) bx lr .align 3 function(shade_blocks_unshaded_untextured_direct) stmdb sp!, { r4, r14 } add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16) add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44) vld1.u16 { pixels }, [ color_ptr, :128 ] mov c_64, #64 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64 vorr.u16 pixels, pixels, msb_mask subs num_blocks, num_blocks, #1 ldr fb_ptr_next, [ block_ptr_load ], #64 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] beq 1f 0: vmov fb_pixels, fb_pixels_next mov fb_ptr, fb_ptr_next ldr fb_ptr_next, [ block_ptr_load ], #64 vbif.u16 fb_pixels, pixels, draw_mask vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64 sub fb_ptr_cmp, fb_ptr_next, fb_ptr add fb_ptr_cmp, fb_ptr_cmp, #14 cmp fb_ptr_cmp, #28 bls 4f vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] vst1.u16 { fb_pixels }, [ fb_ptr ] 3: subs num_blocks, num_blocks, #1 bne 0b 1: vbif.u16 fb_pixels_next, pixels, draw_mask vst1.u16 { fb_pixels_next }, [ fb_ptr_next ] ldmia sp!, { r4, pc } 4: vst1.u16 { fb_pixels }, [ fb_ptr ] vld1.u16 { fb_pixels_next }, [ fb_ptr_next ] bal 3b #undef draw_mask_ptr #undef c_64 #undef fb_ptr #undef fb_ptr_next #undef fb_ptr_cmp #define psx_gpu r0 #define num_blocks r1 #define msb_mask_ptr r2 #define pixel_ptr r3 #define draw_mask_ptr r0 #define c_64 r2 #define fb_ptr r12 #define fb_ptr_next r14 #define fb_ptr_cmp r4 #undef msb_mask #undef draw_mask #undef pixels #undef fb_pixels #undef d128_0x8000 #undef msb_mask_low #undef msb_mask_high #undef draw_mask_next #undef pixels_g #undef blend_pixels #undef fb_pixels_next #define msb_mask q0 #define draw_mask q1 #define pixels q2 #define fb_pixels q3 #define blend_pixels q4 #define pixels_no_msb q5 #define blend_mask q6 #define fb_pixels_no_msb q7 #define d128_0x8000 q8 #define d128_0x0421 q9 #define fb_pixels_next q10 #define blend_pixels_next q11 #define pixels_next q12 #define draw_mask_next q13 #define write_mask q14 #define pixels_rb q5 #define pixels_mg q7 #define pixels_g q7 #define d128_0x7C1F q8 #define d128_0x03E0 q9 #define fb_pixels_rb q10 #define fb_pixels_g q11 #define fb_pixels_masked q11 #define d128_0x83E0 q15 #define pixels_fourth q7 #define d128_0x1C07 q12 #define d128_0x00E0 q13 #define d128_0x80E0 q13 #define msb_mask_low d0 #define msb_mask_high d1 #define blend_blocks_average_set_blend_mask_textured(source) \ vclt.s16 blend_mask, source, #0 \ #define blend_blocks_average_set_stp_bit_textured() \ vorr.u16 blend_pixels, #0x8000 \ #define blend_blocks_average_combine_textured(source) \ vbif.u16 blend_pixels, source, blend_mask \ #define blend_blocks_average_set_blend_mask_untextured(source) \ #define blend_blocks_average_set_stp_bit_untextured() \ #define blend_blocks_average_combine_untextured(source) \ #define blend_blocks_average_mask_set_on() \ vclt.s16 write_mask, fb_pixels_next, #0 \ #define blend_blocks_average_mask_copy_on() \ vorr.u16 draw_mask, draw_mask_next, write_mask \ #define blend_blocks_average_mask_copy_b_on() \ vorr.u16 draw_mask_next, draw_mask_next, write_mask \ #define blend_blocks_average_mask_set_off() \ #define blend_blocks_average_mask_copy_off() \ vmov draw_mask, draw_mask_next \ #define blend_blocks_average_mask_copy_b_off() \ #define blend_blocks_average_builder(texturing, mask_evaluate) \ .align 3; \ \ function(blend_blocks_##texturing##_average_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x8000, #0x8000; \ vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vmov.u16 d128_0x0421, #0x0400; \ vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \ \ vorr.u16 d128_0x0421, #0x0021; \ vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \ \ veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \ vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \ vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \ vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \ blend_blocks_average_mask_set_##mask_evaluate(); \ vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vmov pixels, pixels_next; \ vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \ \ vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \ \ blend_blocks_average_mask_copy_##mask_evaluate(); \ vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \ \ blend_blocks_average_set_blend_mask_##texturing(pixels); \ blend_blocks_average_set_stp_bit_##texturing(); \ vmov fb_pixels, fb_pixels_next; \ blend_blocks_average_combine_##texturing(pixels); \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \ veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \ \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \ \ vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \ vbif.u16 fb_pixels, blend_pixels, draw_mask; \ \ vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \ vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \ blend_blocks_average_mask_set_##mask_evaluate(); \ vst1.u16 { fb_pixels }, [ fb_ptr ]; \ \ 3: \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ blend_blocks_average_mask_copy_b_##mask_evaluate(); \ vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \ \ blend_blocks_average_set_blend_mask_##texturing(pixels_next); \ blend_blocks_average_set_stp_bit_##texturing(); \ blend_blocks_average_combine_##texturing(pixels_next); \ \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \ vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vbif.u16 fb_pixels, blend_pixels, draw_mask; \ vst1.u16 { fb_pixels }, [ fb_ptr ]; \ \ vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \ veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \ vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \ vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \ vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \ vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \ \ bal 3b \ blend_blocks_average_builder(textured, off) blend_blocks_average_builder(untextured, off) blend_blocks_average_builder(textured, on) blend_blocks_average_builder(untextured, on) #define blend_blocks_add_mask_set_on() \ vclt.s16 write_mask, fb_pixels, #0 \ #define blend_blocks_add_mask_copy_on() \ vorr.u16 draw_mask, draw_mask, write_mask \ #define blend_blocks_add_mask_set_off() \ #define blend_blocks_add_mask_copy_off() \ #define blend_blocks_add_textured_builder(mask_evaluate) \ .align 3; \ \ function(blend_blocks_textured_add_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ vmov.u16 d128_0x83E0, #0x8000; \ vorr.u16 d128_0x03E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vorr.u16 pixels, pixels, msb_mask; \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ vand.u16 pixels_mg, pixels, d128_0x83E0; \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ \ vorr.u16 pixels, pixels, msb_mask; \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vand.u16 pixels_mg, pixels, d128_0x83E0; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ bal 3b \ #define blend_blocks_add_untextured_builder(mask_evaluate) \ .align 3; \ \ function(blend_blocks_untextured_add_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_g, pixels, d128_0x03E0; \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vand.u16 pixels_g, pixels, d128_0x03E0; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vand.u16 pixels_rb, pixels, d128_0x7C1F; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ bal 3b \ blend_blocks_add_textured_builder(off) blend_blocks_add_textured_builder(on) blend_blocks_add_untextured_builder(off) blend_blocks_add_untextured_builder(on) #define blend_blocks_subtract_set_blend_mask_textured() \ vclt.s16 blend_mask, pixels_next, #0 \ #define blend_blocks_subtract_combine_textured() \ vbif.u16 blend_pixels, pixels, blend_mask \ #define blend_blocks_subtract_set_stb_textured() \ vorr.u16 blend_pixels, #0x8000 \ #define blend_blocks_subtract_msb_mask_textured() \ vorr.u16 pixels, pixels_next, msb_mask \ #define blend_blocks_subtract_set_blend_mask_untextured() \ #define blend_blocks_subtract_combine_untextured() \ #define blend_blocks_subtract_set_stb_untextured() \ vorr.u16 blend_pixels, blend_pixels, msb_mask \ #define blend_blocks_subtract_msb_mask_untextured() \ #define blend_blocks_subtract_mask_set_on() \ vclt.s16 write_mask, fb_pixels, #0 \ #define blend_blocks_subtract_mask_copy_on() \ vorr.u16 draw_mask, draw_mask_next, write_mask \ #define blend_blocks_subtract_mask_set_off() \ #define blend_blocks_subtract_mask_copy_off() \ vmov draw_mask, draw_mask_next \ #define blend_blocks_subtract_builder(texturing, mask_evaluate) \ .align 3; \ \ function(blend_blocks_##texturing##_subtract_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ \ vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \ blend_blocks_subtract_set_blend_mask_##texturing(); \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_subtract_mask_set_##mask_evaluate(); \ vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \ \ vand.u16 pixels_g, pixels_next, d128_0x03E0; \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ blend_blocks_subtract_mask_copy_##mask_evaluate(); \ mov fb_ptr, fb_ptr_next; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \ blend_blocks_subtract_msb_mask_##texturing(); \ \ vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \ blend_blocks_subtract_set_stb_##texturing(); \ vand.u16 pixels_g, pixels_next, d128_0x03E0; \ blend_blocks_subtract_combine_##texturing(); \ blend_blocks_subtract_set_blend_mask_##texturing(); \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_subtract_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ \ 3: \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ blend_blocks_subtract_mask_copy_##mask_evaluate(); \ \ blend_blocks_subtract_msb_mask_##texturing(); \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ blend_blocks_subtract_set_stb_##texturing(); \ blend_blocks_subtract_combine_##texturing(); \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_subtract_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ bal 3b \ blend_blocks_subtract_builder(textured, off) blend_blocks_subtract_builder(textured, on) blend_blocks_subtract_builder(untextured, off) blend_blocks_subtract_builder(untextured, on) #define blend_blocks_add_fourth_textured_builder(mask_evaluate) \ .align 3; \ \ function(blend_blocks_textured_add_fourth_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ vmov.u16 d128_0x80E0, #0x8000; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ vorr.u16 d128_0x80E0, #0x00E0; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vshr.s16 pixels_fourth, pixels, #2; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vorr.u16 pixels, pixels, msb_mask; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vclt.s16 blend_mask, pixels, #0; \ \ vshr.s16 pixels_fourth, pixels, #2; \ vorr.u16 pixels, pixels, msb_mask; \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \ bal 3b \ #define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \ .align 3; \ \ function(blend_blocks_untextured_add_fourth_##mask_evaluate) \ stmdb sp!, { r4, r14 }; \ add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ \ add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \ vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \ \ add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \ mov c_64, #64; \ \ vmov.u16 d128_0x7C1F, #0x7C00; \ vmov.u16 d128_0x03E0, #0x0300; \ vmov.u16 d128_0x83E0, #0x8300; \ vmov.u16 d128_0x1C07, #0x1C00; \ vmov.u16 d128_0x00E0, #0x00E0; \ vorr.u16 d128_0x7C1F, #0x001F; \ vorr.u16 d128_0x03E0, #0x00E0; \ vorr.u16 d128_0x83E0, #0x00E0; \ vorr.u16 d128_0x1C07, #0x0007; \ \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ vshr.s16 pixels_fourth, pixels, #2; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ beq 1f; \ \ 0: \ mov fb_ptr, fb_ptr_next; \ \ ldr fb_ptr_next, [ pixel_ptr, #28 ]; \ \ vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \ \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vshr.s16 pixels_fourth, pixels, #2; \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \ \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \ \ sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \ add fb_ptr_cmp, fb_ptr_cmp, #14; \ cmp fb_ptr_cmp, #28; \ bls 2f; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ \ 3: \ vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \ vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \ vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \ vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \ vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \ \ subs num_blocks, num_blocks, #1; \ bne 0b; \ \ 1: \ vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \ vorr.u16 blend_pixels, blend_pixels, msb_mask; \ vbit.u16 blend_pixels, fb_pixels, draw_mask; \ vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \ \ ldmia sp!, { r4, pc }; \ \ 2: \ vst1.u16 { blend_pixels }, [ fb_ptr ]; \ vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \ \ vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \ blend_blocks_add_mask_set_##mask_evaluate(); \ blend_blocks_add_mask_copy_##mask_evaluate(); \ vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \ bal 3b \ blend_blocks_add_fourth_textured_builder(off) blend_blocks_add_fourth_textured_builder(on) blend_blocks_add_fourth_untextured_builder(off) blend_blocks_add_fourth_untextured_builder(on) // TODO: Optimize this more. Need a scene that actually uses it for // confirmation.. .align 3 function(blend_blocks_textured_unblended_on) stmdb sp!, { r4, r14 } add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16) vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset mov c_64, #64 ldr fb_ptr, [ pixel_ptr, #28 ] vld1.u16 { fb_pixels }, [ fb_ptr ] vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64 vclt.s16 write_mask, fb_pixels, #0 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64 subs num_blocks, num_blocks, #1 beq 1f 0: vorr.u16 draw_mask, draw_mask, write_mask vbif.u16 fb_pixels, pixels, draw_mask vst1.u16 { fb_pixels }, [ fb_ptr ] ldr fb_ptr, [ pixel_ptr, #28 ] vld1.u16 { fb_pixels }, [ fb_ptr ] vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64 vclt.s16 write_mask, fb_pixels, #0 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64 subs num_blocks, num_blocks, #1 bne 0b 1: vorr.u16 draw_mask, draw_mask, write_mask vbif.u16 fb_pixels, pixels, draw_mask vst1.u16 { fb_pixels }, [ fb_ptr ] ldmia sp!, { r4, pc } function(blend_blocks_textured_unblended_off) bx lr function(warmup) mov r3, #64 cmp r0, #0 bxeq lr 0: vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3 subs r0, r0, #1 bne 0b bx lr #undef color #undef y #undef height #define psx_gpu r0 #define color r1 #define x r2 #define y r3 #define vram_ptr r0 #define width r3 #define height r12 #define parameter_width_offset 0 #define parameter_height_offset 4 #define color_r r14 #define color_g r4 #define color_b r5 #define left_unaligned r14 #define right_unaligned r4 #define pitch r5 #define num_unaligned r2 #define num_width r6 #undef colors #define colors q0 .align 3 function(render_block_fill_body) ldr vram_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] ldr height, [ sp, #parameter_height_offset ] add vram_ptr, vram_ptr, y, lsl #11 ldr width, [ sp, #parameter_width_offset ] add vram_ptr, vram_ptr, x, lsl #1 stmdb sp!, { r4 - r6, r14 } ubfx color_r, color, #3, #5 ubfx color_g, color, #11, #5 ubfx color_b, color, #19, #5 orr color, color_r, color_g, lsl #5 orr color, color, color_b, lsl #10 add left_unaligned, x, #0x7 bic left_unaligned, left_unaligned, #0x7 vdup.u16 colors, color sub left_unaligned, left_unaligned, x mov pitch, #2048 sub pitch, pitch, width, lsl #1 sub width, width, left_unaligned and right_unaligned, width, #0x7 bic width, width, #0x7 0: mov num_width, width, lsr #3 movs num_unaligned, left_unaligned beq 2f 1: strh color, [ vram_ptr ], #2 subs num_unaligned, num_unaligned, #1 bne 1b 2: vst1.u32 { colors }, [ vram_ptr, :128 ]! subs num_width, num_width, #1 bne 2b movs num_unaligned, right_unaligned beq 4f 3: strh color, [ vram_ptr ], #2 subs num_unaligned, num_unaligned, #1 bne 3b 4: add vram_ptr, vram_ptr, pitch subs height, height, #1 bne 0b ldmia sp!, { r4 - r6, pc } #undef x #undef y #undef width #undef height #undef fb_ptr #undef texture_mask #undef num_blocks #undef temp #undef dirty_textures_mask #undef clut_ptr #undef current_texture_mask #define psx_gpu r0 #define x r1 #define y r2 #define u r3 #define v r4 #define width r5 #define height r6 #define offset_u r8 #define offset_v r9 #define offset_u_right r10 #define width_rounded r11 #define height_rounded r12 #define texture_offset_base r1 #define tile_width r2 #define tile_height r3 #define num_blocks r4 #define block r5 #define sub_tile_height r6 #define fb_ptr r7 #define texture_mask r8 #define column_data r9 #define texture_offset r10 #define tiles_remaining r11 #define fb_ptr_advance_column r12 #define texture_block_ptr r14 #define texture_page_ptr r3 #define left_block_mask r4 #define right_block_mask r5 #define texture_mask_rev r10 #define control_mask r11 #define dirty_textures_mask r4 #define clut_ptr r5 #define current_texture_mask r6 #undef texels #undef clut_low_a #undef clut_low_b #undef clut_high_a #undef clut_high_b #undef clut_a #undef clut_b #undef texels_low #undef texels_high #define texels d0 #define draw_masks_fb_ptrs q1 #define draw_mask_fb_ptr_left d2 #define draw_mask_fb_ptr_right d3 #define clut_low_a d4 #define clut_low_b d5 #define clut_high_a d6 #define clut_high_b d7 #define block_masks d8 #define block_masks_shifted d9 #define clut_a q2 #define clut_b q3 #define texels_low d10 #define texels_high d11 setup_sprite_flush_blocks_single: vpush { q1 - q4 } stmdb sp!, { r0 - r3, r12, r14 } bl flush_render_block_buffer ldmia sp!, { r0 - r3, r12, r14 } vpop { q1 - q4 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, sub_tile_height bx lr setup_sprite_flush_blocks_double: vpush { q1 - q4 } stmdb sp!, { r0 - r3, r12, r14 } bl flush_render_block_buffer ldmia sp!, { r0 - r3, r12, r14 } vpop { q1 - q4 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, sub_tile_height, lsl #1 bx lr setup_sprite_update_texture_4bpp_cache: stmdb sp!, { r0 - r3, r14 } bl update_texture_4bpp_cache ldmia sp!, { r0 - r3, pc } setup_sprite_update_texture_8bpp_cache: stmdb sp!, { r0 - r3, r14 } bl update_texture_8bpp_cache ldmia sp!, { r0 - r3, pc } #define setup_sprite_tiled_initialize_4bpp() \ ldr dirty_textures_mask, \ [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \ ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \ \ ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \ vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \ \ tst current_texture_mask, dirty_textures_mask; \ vuzp.u8 clut_a, clut_b; \ \ blne setup_sprite_update_texture_4bpp_cache \ #define setup_sprite_tiled_initialize_8bpp() \ ldr dirty_textures_mask, \ [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \ ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \ \ tst current_texture_mask, dirty_textures_mask; \ blne setup_sprite_update_texture_8bpp_cache \ #define setup_sprite_tile_setup_block_no(side, offset, texture_mode) \ #define setup_sprite_block_count_single() \ sub_tile_height \ #define setup_sprite_block_count_double() \ sub_tile_height, lsl #1 \ #define setup_sprite_tile_add_blocks(type) \ add num_blocks, num_blocks, setup_sprite_block_count_##type(); \ cmp num_blocks, #MAX_BLOCKS; \ \ blgt setup_sprite_flush_blocks_##type \ #define setup_sprite_tile_full_4bpp(edge) \ setup_sprite_tile_add_blocks(double); \ \ 4: \ and texture_block_ptr, texture_offset, texture_mask; \ vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \ \ pld [ fb_ptr ]; \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ \ vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \ vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \ \ vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \ add texture_block_ptr, texture_offset, #8; \ \ and texture_block_ptr, texture_block_ptr, texture_mask; \ add block, block, #40; \ \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ add fb_ptr, fb_ptr, #16; \ \ vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \ add block, block, #24; \ \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \ \ pld [ fb_ptr ]; \ vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \ vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \ \ vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \ add block, block, #40; \ \ add texture_offset, texture_offset, #0x10; \ add fb_ptr, fb_ptr, #(2048 - 16); \ \ vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \ add block, block, #24; \ \ subs sub_tile_height, sub_tile_height, #1; \ bne 4b; \ \ add texture_offset, texture_offset, #0xF00; \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \ #define setup_sprite_tile_half_4bpp(edge) \ setup_sprite_tile_add_blocks(single); \ \ 4: \ and texture_block_ptr, texture_offset, texture_mask; \ vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \ \ pld [ fb_ptr ]; \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ \ vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \ vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \ \ vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \ add block, block, #40; \ \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \ \ add block, block, #24; \ add texture_offset, texture_offset, #0x10; \ \ add fb_ptr, fb_ptr, #2048; \ subs sub_tile_height, sub_tile_height, #1; \ \ bne 4b; \ \ add texture_offset, texture_offset, #0xF00; \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \ #define setup_sprite_tile_full_8bpp(edge) \ setup_sprite_tile_add_blocks(double); \ add block, block, #16; \ \ 4: \ and texture_block_ptr, texture_offset, texture_mask; \ vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \ \ pld [ fb_ptr ]; \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ \ add texture_block_ptr, texture_offset, #8; \ vst1.u32 { texels }, [ block, :64 ]; \ \ and texture_block_ptr, texture_block_ptr, texture_mask; \ add block, block, #24; \ \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ \ add fb_ptr, fb_ptr, #16; \ vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \ \ add block, block, #40; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ pld [ fb_ptr ]; \ \ vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \ vst1.u32 { texels }, [ block, :64 ]; \ add block, block, #24; \ \ add texture_offset, texture_offset, #0x10; \ add fb_ptr, fb_ptr, #(2048 - 16); \ \ vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \ add block, block, #40; \ \ subs sub_tile_height, sub_tile_height, #1; \ bne 4b; \ \ sub block, block, #16; \ add texture_offset, texture_offset, #0xF00; \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \ #define setup_sprite_tile_half_8bpp(edge) \ setup_sprite_tile_add_blocks(single); \ add block, block, #16; \ \ 4: \ and texture_block_ptr, texture_offset, texture_mask; \ vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \ pld [ fb_ptr ]; \ \ add texture_block_ptr, texture_page_ptr, texture_block_ptr; \ vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \ \ vst1.u32 { texels }, [ block, :64 ]; \ add block, block, #24; \ \ vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \ add block, block, #40; \ \ add texture_offset, texture_offset, #0x10; \ add fb_ptr, fb_ptr, #2048; \ \ subs sub_tile_height, sub_tile_height, #1; \ bne 4b; \ \ sub block, block, #16; \ add texture_offset, texture_offset, #0xF00; \ strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \ #define setup_sprite_tile_column_edge_pre_adjust_half_right() \ add texture_offset, texture_offset_base, #8; \ add fb_ptr, fb_ptr, #16 \ #define setup_sprite_tile_column_edge_pre_adjust_half_left() \ mov texture_offset, texture_offset_base \ #define setup_sprite_tile_column_edge_pre_adjust_half(edge) \ setup_sprite_tile_column_edge_pre_adjust_half_##edge() \ #define setup_sprite_tile_column_edge_pre_adjust_full(edge) \ mov texture_offset, texture_offset_base \ #define setup_sprite_tile_column_edge_post_adjust_half_right() \ sub fb_ptr, fb_ptr, #16 \ #define setup_sprite_tile_column_edge_post_adjust_half_left() \ #define setup_sprite_tile_column_edge_post_adjust_half(edge) \ setup_sprite_tile_column_edge_post_adjust_half_##edge() \ #define setup_sprite_tile_column_edge_post_adjust_full(edge) \ #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode) \ mov sub_tile_height, column_data; \ setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \ setup_sprite_tile_##edge_mode##_##texture_mode(edge); \ setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \ #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode) \ and sub_tile_height, column_data, #0xFF; \ mov tiles_remaining, column_data, lsr #16; \ setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \ setup_sprite_tile_##edge_mode##_##texture_mode(edge); \ \ subs tiles_remaining, tiles_remaining, #1; \ beq 2f; \ \ 3: \ mov sub_tile_height, #16; \ setup_sprite_tile_##edge_mode##_##texture_mode(edge); \ subs tiles_remaining, tiles_remaining, #1; \ bne 3b; \ \ 2: \ uxtb sub_tile_height, column_data, ror #8; \ setup_sprite_tile_##edge_mode##_##texture_mode(edge); \ setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \ #define setup_sprite_column_data_single() \ mov column_data, height; \ ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \ #define setup_sprite_column_data_multi() \ and height_rounded, height_rounded, #0xF; \ rsb column_data, offset_v, #16; \ \ add height_rounded, height_rounded, #1; \ sub tile_height, tile_height, #1; \ \ orr column_data, column_data, tile_height, lsl #16; \ ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \ \ orr column_data, column_data, height_rounded, lsl #8 \ #define setup_sprite_tile_column_width_single(texture_mode, multi_height, \ edge_mode, edge) \ setup_sprite_##texture_mode##_single_##multi_height##_##edge_mode##_##edge: \ setup_sprite_column_data_##multi_height(); \ vext.32 block_masks_shifted, block_masks, block_masks, #1; \ vorr.u32 block_masks, block_masks, block_masks_shifted; \ vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \ vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \ \ setup_sprite_tile_column_height_##multi_height(edge_mode, edge, \ texture_mode); \ ldmia sp!, { r4 - r11, pc } \ #define setup_sprite_tiled_advance_column() \ add texture_offset_base, texture_offset_base, #0x100; \ tst texture_offset_base, #0xF00; \ subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \ #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \ right_mode) \ setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode: \ setup_sprite_column_data_##multi_height(); \ mov fb_ptr_advance_column, #32; \ \ sub fb_ptr_advance_column, height, lsl #11; \ vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \ \ vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \ setup_sprite_tile_column_height_##multi_height(left_mode, right, tm); \ \ subs tile_width, tile_width, #2; \ add fb_ptr, fb_ptr, fb_ptr_advance_column; \ \ vmov.u8 draw_masks_fb_ptrs, #0; \ beq 1f; \ \ 0: \ setup_sprite_tiled_advance_column(); \ setup_sprite_tile_column_height_##multi_height(full, none, tm); \ add fb_ptr, fb_ptr, fb_ptr_advance_column; \ subs tile_width, tile_width, #1; \ bne 0b; \ \ 1: \ vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \ vdup.u8 draw_mask_fb_ptr_right, block_masks[5]; \ \ setup_sprite_tiled_advance_column(); \ setup_sprite_tile_column_height_##multi_height(right_mode, left, tm); \ ldmia sp!, { r4 - r11, pc } \ // r0: psx_gpu // r1: x // r2: y // r3: u // [ sp ]: v // [ sp + 4 ]: width // [ sp + 8 ]: height // [ sp + 12 ]: color (unused) #define setup_sprite_tiled_builder(texture_mode) \ \ setup_sprite_tile_column_width_multi(texture_mode, multi, full, full); \ setup_sprite_tile_column_width_single(texture_mode, multi, full, none); \ setup_sprite_tile_column_width_multi(texture_mode, single, full, full); \ setup_sprite_tile_column_width_single(texture_mode, single, full, none); \ setup_sprite_tile_column_width_multi(texture_mode, multi, half, full); \ setup_sprite_tile_column_width_single(texture_mode, multi, half, right); \ setup_sprite_tile_column_width_multi(texture_mode, single, half, full); \ setup_sprite_tile_column_width_single(texture_mode, single, half, right); \ setup_sprite_tile_column_width_multi(texture_mode, multi, full, half); \ setup_sprite_tile_column_width_single(texture_mode, multi, half, left); \ setup_sprite_tile_column_width_multi(texture_mode, single, full, half); \ setup_sprite_tile_column_width_single(texture_mode, single, half, left); \ setup_sprite_tile_column_width_multi(texture_mode, multi, half, half); \ setup_sprite_tile_column_width_multi(texture_mode, single, half, half); \ \ .align 4; \ \ function(setup_sprite_##texture_mode) \ stmdb sp!, { r4 - r11, r14 }; \ setup_sprite_tiled_initialize_##texture_mode(); \ \ ldr v, [ sp, #36 ]; \ and offset_u, u, #0xF; \ \ ldr width, [ sp, #40 ]; \ ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \ \ ldr height, [ sp, #44 ]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ add fb_ptr, fb_ptr, x, lsl #1; \ and offset_v, v, #0xF; \ \ sub fb_ptr, fb_ptr, offset_u, lsl #1; \ add width_rounded, offset_u, width; \ \ add height_rounded, offset_v, height; \ add width_rounded, width_rounded, #15; \ \ add height_rounded, height_rounded, #15; \ mov tile_width, width_rounded, lsr #4; \ \ /* texture_offset_base = VH-VL-00-00 */\ mov texture_offset_base, v, lsl #8; \ and offset_u_right, width_rounded, #0xF; \ \ /* texture_offset_base = VH-UH-UL-00 */\ bfi texture_offset_base, u, #4, #8; \ movw right_block_mask, #0xFFFE; \ \ /* texture_offset_base = VH-UH-VL-00 */\ bfi texture_offset_base, v, #4, #4; \ movw left_block_mask, #0xFFFF; \ \ mov tile_height, height_rounded, lsr #4; \ mvn left_block_mask, left_block_mask, lsl offset_u; \ \ /* texture_mask = HH-HL-WH-WL */\ ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \ mov right_block_mask, right_block_mask, lsl offset_u_right; \ \ /* texture_mask_rev = WH-WL-HH-HL */\ rev16 texture_mask_rev, texture_mask; \ vmov block_masks, left_block_mask, right_block_mask; \ \ /* texture_mask = HH-HL-HL-WL */\ bfi texture_mask, texture_mask_rev, #4, #4; \ /* texture_mask_rev = 00-00-00-WH */\ mov texture_mask_rev, texture_mask_rev, lsr #12; \ \ /* texture_mask = HH-WH-HL-WL */\ bfi texture_mask, texture_mask_rev, #8, #4; \ and left_block_mask, left_block_mask, #0xFF; \ \ mov control_mask, #0; \ cmp left_block_mask, #0xFF; \ \ uxtb right_block_mask, right_block_mask, ror #8; \ orreq control_mask, control_mask, #0x4; \ \ ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \ cmp right_block_mask, #0xFF; \ \ orreq control_mask, control_mask, #0x8; \ cmp tile_width, #1; \ \ add block, psx_gpu, #psx_gpu_blocks_offset; \ orreq control_mask, control_mask, #0x1; \ \ cmp tile_height, #1; \ add block, block, num_blocks, lsl #6; \ \ orreq control_mask, control_mask, #0x2; \ ldr pc, [ pc, control_mask, lsl #2 ]; \ nop; \ \ .word setup_sprite_##texture_mode##_multi_multi_full_full; \ .word setup_sprite_##texture_mode##_single_multi_full_none; \ .word setup_sprite_##texture_mode##_multi_single_full_full; \ .word setup_sprite_##texture_mode##_single_single_full_none; \ .word setup_sprite_##texture_mode##_multi_multi_half_full; \ .word setup_sprite_##texture_mode##_single_multi_half_right; \ .word setup_sprite_##texture_mode##_multi_single_half_full; \ .word setup_sprite_##texture_mode##_single_single_half_right; \ .word setup_sprite_##texture_mode##_multi_multi_full_half; \ .word setup_sprite_##texture_mode##_single_multi_half_left; \ .word setup_sprite_##texture_mode##_multi_single_full_half; \ .word setup_sprite_##texture_mode##_single_single_half_left; \ .word setup_sprite_##texture_mode##_multi_multi_half_half; \ .word 0x00000000; \ .word setup_sprite_##texture_mode##_multi_single_half_half \ setup_sprite_tiled_builder(4bpp); setup_sprite_tiled_builder(8bpp); #undef block_ptr #undef num_blocks #undef clut_ptr #define psx_gpu r0 #define block_ptr r0 #define num_blocks r1 #define clut_ptr r2 #define texel_shift_mask r3 #define block_pixels_a r4 #define block_pixels_b r5 #define texel_0 r6 #define texel_2 r7 #define texel_4 r8 #define texel_6 r9 #define texel_1 r10 #define texel_3 r11 #define texel_5 r12 #define texel_7 r14 #define texels_01 r6 #define texels_23 r7 #define texels_45 r8 #define texels_67 r9 function(texture_sprite_blocks_8bpp) stmdb sp!, { r4 - r11, r14 } movw texel_shift_mask, #(0xFF << 1) ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ] add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldr block_pixels_a, [ block_ptr, #16 ] 0: and texel_0, texel_shift_mask, block_pixels_a, lsl #1 ldr block_pixels_b, [ block_ptr, #20 ] and texel_1, texel_shift_mask, block_pixels_a, lsr #7 ldrh texel_0, [ clut_ptr, texel_0 ] and texel_2, texel_shift_mask, block_pixels_a, lsr #15 ldrh texel_1, [ clut_ptr, texel_1 ] and texel_3, texel_shift_mask, block_pixels_a, lsr #23 ldr block_pixels_a, [ block_ptr, #(64 + 16) ] ldrh texel_2, [ clut_ptr, texel_2 ] and texel_4, texel_shift_mask, block_pixels_b, lsl #1 ldrh texel_3, [ clut_ptr, texel_3 ] and texel_5, texel_shift_mask, block_pixels_b, lsr #7 ldrh texel_4, [ clut_ptr, texel_4 ] and texel_6, texel_shift_mask, block_pixels_b, lsr #15 ldrh texel_5, [ clut_ptr, texel_5 ] and texel_7, texel_shift_mask, block_pixels_b, lsr #23 ldrh texel_6, [ clut_ptr, texel_6 ] orr texels_01, texel_0, texel_1, lsl #16 ldrh texel_7, [ clut_ptr, texel_7 ] orr texels_23, texel_2, texel_3, lsl #16 orr texels_45, texel_4, texel_5, lsl #16 str texels_01, [ block_ptr, #0 ] orr texels_67, texel_6, texel_7, lsl #16 str texels_23, [ block_ptr, #4 ] subs num_blocks, num_blocks, #1 str texels_45, [ block_ptr, #8 ] str texels_67, [ block_ptr, #12 ] add block_ptr, block_ptr, #64 bne 0b ldmia sp!, { r4 - r11, pc } #undef width_rounded #undef texture_mask #undef num_blocks #undef texture_offset #define psx_gpu r0 #define x r1 #define y r2 #define u r3 #define v r4 #define width r5 #define height r6 #define left_offset r8 #define width_rounded r9 #define right_width r10 #define block_width r11 #define texture_offset_base r1 #define texture_mask r2 #define texture_page_ptr r3 #define num_blocks r4 #define block r5 #define fb_ptr r7 #define texture_offset r8 #define blocks_remaining r9 #define fb_ptr_pitch r12 #define texture_block_ptr r14 #define texture_mask_width r2 #define texture_mask_height r3 #define left_mask_bits r4 #define right_mask_bits r5 #undef block_masks #undef block_masks_shifted #undef texels #define block_masks d0 #define block_masks_shifted d1 #define draw_mask_fb_ptr d2 #define texels q2 setup_sprites_16bpp_flush_single: vpush { d0 - d2 } stmdb sp!, { r0 - r3, r12, r14 } bl flush_render_block_buffer ldmia sp!, { r0 - r3, r12, r14 } vpop { d0 - d2 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, #1 bx lr setup_sprites_16bpp_flush_row: vpush { d0 - d2 } stmdb sp!, { r0 - r3, r12, r14 } bl flush_render_block_buffer ldmia sp!, { r0 - r3, r12, r14 } vpop { d0 - d2 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, block_width bx lr function(setup_sprite_16bpp) stmdb sp!, { r4 - r11, r14 } ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ] ldr v, [ sp, #36 ] add fb_ptr, fb_ptr, y, lsl #11 ldr width, [ sp, #40 ] add fb_ptr, fb_ptr, x, lsl #1 ldr height, [ sp, #44 ] and left_offset, u, #0x7 add texture_offset_base, u, u add width_rounded, width, #7 add texture_offset_base, v, lsl #11 mov left_mask_bits, #0xFF ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ] add width_rounded, width_rounded, left_offset ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ] sub fb_ptr, fb_ptr, left_offset, lsl #1 add texture_mask, texture_mask_width, texture_mask_width mov right_mask_bits, #0xFE and right_width, width_rounded, #0x7 mvn left_mask_bits, left_mask_bits, lsl left_offset add texture_mask, texture_mask_height, lsl #11 mov block_width, width_rounded, lsr #3 mov right_mask_bits, right_mask_bits, lsl right_width movw fb_ptr_pitch, #(2048 + 16) sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4 vmov block_masks, left_mask_bits, right_mask_bits ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] add block, psx_gpu, #psx_gpu_blocks_offset bic texture_offset_base, texture_offset_base, #0x7 cmp block_width, #1 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] add block, block, num_blocks, lsl #6 bne 0f vext.32 block_masks_shifted, block_masks, block_masks, #1 vorr.u32 block_masks, block_masks, block_masks_shifted vdup.u8 draw_mask_fb_ptr, block_masks[0] 1: add num_blocks, num_blocks, #1 cmp num_blocks, #MAX_BLOCKS blgt setup_sprites_16bpp_flush_single and texture_block_ptr, texture_offset_base, texture_mask subs height, height, #1 add texture_block_ptr, texture_page_ptr, texture_block_ptr vld1.u32 { texels }, [ texture_block_ptr, :128 ] vst1.u32 { texels }, [ block, :128 ] add block, block, #40 vmov.u32 draw_mask_fb_ptr[1], fb_ptr pld [ fb_ptr ] vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ] add block, block, #24 add texture_offset_base, texture_offset_base, #2048 add fb_ptr, fb_ptr, #2048 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] bne 1b ldmia sp!, { r4 - r11, pc } 0: add num_blocks, num_blocks, block_width mov texture_offset, texture_offset_base cmp num_blocks, #MAX_BLOCKS blgt setup_sprites_16bpp_flush_row add texture_offset_base, texture_offset_base, #2048 and texture_block_ptr, texture_offset, texture_mask add texture_block_ptr, texture_page_ptr, texture_block_ptr vld1.u32 { texels }, [ texture_block_ptr, :128 ] vst1.u32 { texels }, [ block, :128 ] add block, block, #40 vdup.u8 draw_mask_fb_ptr, block_masks[0] vmov.u32 draw_mask_fb_ptr[1], fb_ptr pld [ fb_ptr ] vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ] subs blocks_remaining, block_width, #2 add texture_offset, texture_offset, #16 add fb_ptr, fb_ptr, #16 vmov.u8 draw_mask_fb_ptr, #0 add block, block, #24 beq 2f 1: and texture_block_ptr, texture_offset, texture_mask subs blocks_remaining, blocks_remaining, #1 add texture_block_ptr, texture_page_ptr, texture_block_ptr vld1.u32 { texels }, [ texture_block_ptr, :128 ] vst1.u32 { texels }, [ block, :128 ] add block, block, #40 vmov.u32 draw_mask_fb_ptr[1], fb_ptr pld [ fb_ptr ] vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ] add texture_offset, texture_offset, #16 add fb_ptr, fb_ptr, #16 add block, block, #24 bne 1b 2: and texture_block_ptr, texture_offset, texture_mask add texture_block_ptr, texture_page_ptr, texture_block_ptr vld1.u32 { texels }, [ texture_block_ptr, :128 ] vdup.u8 draw_mask_fb_ptr, block_masks[4] vst1.u32 { texels }, [ block, :128 ] add block, block, #40 vmov.u32 draw_mask_fb_ptr[1], fb_ptr vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ] add block, block, #24 subs height, height, #1 add fb_ptr, fb_ptr, fb_ptr_pitch strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] bne 0b ldmia sp!, { r4 - r11, pc } #undef texture_page_ptr #undef vram_ptr #undef dirty_textures_mask #undef current_texture_mask #define psx_gpu r0 #define current_texture_page r1 #define texture_page_ptr r2 #define vram_ptr_a r3 #define current_texture_page_x r12 #define current_texture_page_y r4 #define dirty_textures_mask r5 #define tile_y r6 #define tile_x r7 #define sub_y r8 #define current_texture_mask r9 #define c_4096 r10 #define vram_ptr_b r11 #define texel_block_a d0 #define texel_block_b d1 #define texel_block_expanded_a q1 #define texel_block_expanded_b q2 #define texel_block_expanded_ab q2 #define texel_block_expanded_c q3 #define texel_block_expanded_d q4 #define texel_block_expanded_cd q3 function(update_texture_4bpp_cache) stmdb sp!, { r4 - r11, r14 } vpush { q0 - q3 } ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ] ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ] and current_texture_page_x, current_texture_page, #0xF ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ] mov current_texture_page_y, current_texture_page, lsr #4 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ] add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19 mov tile_y, #16 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7 bic dirty_textures_mask, current_texture_mask mov tile_x, #16 str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ] mov sub_y, #8 movw c_4096, #4096 add vram_ptr_b, vram_ptr_a, #2048 0: vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096 vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096 vmovl.u8 texel_block_expanded_a, texel_block_a vshll.u8 texel_block_expanded_b, texel_block_a, #4 vmovl.u8 texel_block_expanded_c, texel_block_b vshll.u8 texel_block_expanded_d, texel_block_b, #4 vbic.u16 texel_block_expanded_a, #0x00F0 vbic.u16 texel_block_expanded_b, #0x00F0 vbic.u16 texel_block_expanded_c, #0x00F0 vbic.u16 texel_block_expanded_d, #0x00F0 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \ texel_block_expanded_b vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \ texel_block_expanded_d vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \ [ texture_page_ptr, :256 ]! subs sub_y, sub_y, #1 bne 0b mov sub_y, #8 add vram_ptr_a, vram_ptr_a, #8 add vram_ptr_b, vram_ptr_b, #8 sub vram_ptr_a, vram_ptr_a, #(16 * 2048) sub vram_ptr_b, vram_ptr_b, #(16 * 2048) subs tile_x, tile_x, #1 bne 0b mov tile_x, #16 add vram_ptr_a, vram_ptr_a, #(16 * 2048) add vram_ptr_b, vram_ptr_b, #(16 * 2048) sub vram_ptr_a, vram_ptr_a, #(8 * 16) sub vram_ptr_b, vram_ptr_b, #(8 * 16) subs tile_y, tile_y, #1 bne 0b vpop { q0 - q3 } ldmia sp!, { r4 - r11, pc } #undef current_texture_page #define psx_gpu r0 #define texture_page r1 #define texture_page_ptr r2 #define vram_ptr_a r3 #define texture_page_x r12 #define texture_page_y r4 #define current_texture_page r5 #define tile_y r6 #define tile_x r7 #define sub_y r8 #define c_4096 r10 #define vram_ptr_b r11 #undef texels_a #undef texels_b #define texels_a q0 #define texels_b q1 #define texels_c q2 #define texels_d q3 function(update_texture_8bpp_cache_slice) stmdb sp!, { r4 - r11, r14 } vpush { q0 - q3 } ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ] ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ] ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] mov tile_y, #16 and texture_page_x, texture_page, #0xF mov texture_page_y, texture_page, lsr #4 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7 mov tile_x, #8 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19 eor current_texture_page, current_texture_page, texture_page ands current_texture_page, current_texture_page, #0x1 mov sub_y, #4 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16) movw c_4096, #4096 add vram_ptr_b, vram_ptr_a, #2048 0: vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096 vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096 vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096 vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096 vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]! vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]! subs sub_y, sub_y, #1 bne 0b mov sub_y, #4 add vram_ptr_a, vram_ptr_a, #16 add vram_ptr_b, vram_ptr_b, #16 sub vram_ptr_a, vram_ptr_a, #(16 * 2048) sub vram_ptr_b, vram_ptr_b, #(16 * 2048) subs tile_x, tile_x, #1 bne 0b mov tile_x, #8 add vram_ptr_a, vram_ptr_a, #(16 * 2048) add vram_ptr_b, vram_ptr_b, #(16 * 2048) sub vram_ptr_a, vram_ptr_a, #(8 * 16) sub vram_ptr_b, vram_ptr_b, #(8 * 16) subs tile_y, tile_y, #1 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16) bne 0b vpop { q0 - q3 } ldmia sp!, { r4 - r11, pc }