2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of
7 * the License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
// Per-row block limit. Note that the block setup code further down compares
// against MAX_BLOCKS, which is a different constant defined outside this
// excerpt.
17 #define MAX_BLOCKS_PER_ROW 128
// Byte offsets of fields inside the psx_gpu state block. They are used as
// immediates relative to the psx_gpu base register, for example
// [ psx_gpu, #psx_gpu_triangle_area_offset ], or added to psx_gpu to form a
// pointer. They presumably mirror a C-side structure layout and have to be
// kept in sync with it -- TODO confirm against that definition.
//
// 0..128: 16-byte-spaced slots; several of them are accessed below with
// 128-bit vld1/vst1.
19 #define psx_gpu_test_mask_offset 0
20 #define psx_gpu_uvrg_offset 16
21 #define psx_gpu_uvrg_dx_offset 32
22 #define psx_gpu_uvrg_dy_offset 48
23 #define psx_gpu_u_block_span_offset 64
24 #define psx_gpu_v_block_span_offset 80
25 #define psx_gpu_r_block_span_offset 96
26 #define psx_gpu_g_block_span_offset 112
27 #define psx_gpu_b_block_span_offset 128
// Note: b_dx_offset (132) lies inside the 16-byte slot that starts at
// b_block_span_offset (128).
29 #define psx_gpu_b_dx_offset 132
// Word-sized slots (read with ldr where they are used in this excerpt).
// dither_table spans 188..203 and is indexed with "y, lsl #2".
31 #define psx_gpu_b_offset 144
32 #define psx_gpu_b_dy_offset 148
33 #define psx_gpu_triangle_area_offset 152
34 #define psx_gpu_texture_window_settings_offset 156
35 #define psx_gpu_current_texture_mask_offset 160
36 #define psx_gpu_viewport_mask_offset 164
37 #define psx_gpu_dirty_textures_4bpp_mask_offset 168
38 #define psx_gpu_dirty_textures_8bpp_mask_offset 172
39 #define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176
40 #define psx_gpu_triangle_color_offset 180
41 #define psx_gpu_primitive_color_offset 184
42 #define psx_gpu_dither_table_offset 188
43 #define psx_gpu_render_block_handler_offset 204
44 #define psx_gpu_texture_page_ptr_offset 208
45 #define psx_gpu_clut_ptr_offset 212
46 #define psx_gpu_vram_ptr_offset 216
// Halfword-sized slots (accessed with ldrh / ldrsh / strh where they are used
// in this excerpt).
48 #define psx_gpu_render_state_base_offset 220
49 #define psx_gpu_render_state_offset 222
50 #define psx_gpu_num_spans_offset 224
51 #define psx_gpu_num_blocks_offset 226
52 #define psx_gpu_offset_x_offset 228
53 #define psx_gpu_offset_y_offset 230
54 #define psx_gpu_clut_settings_offset 232
55 #define psx_gpu_texture_settings_offset 234
56 #define psx_gpu_viewport_start_x_offset 236
57 #define psx_gpu_viewport_start_y_offset 238
58 #define psx_gpu_viewport_end_x_offset 240
59 #define psx_gpu_viewport_end_y_offset 242
60 #define psx_gpu_mask_msb_offset 244
// Byte-sized slots (triangle_winding is read with ldrb; the texture mask
// width/height pair is read with a single vld2.u8).
62 #define psx_gpu_triangle_winding_offset 246
63 #define psx_gpu_display_area_draw_enable_offset 247
64 #define psx_gpu_current_texture_page_offset 248
65 #define psx_gpu_last_8bpp_texture_page_offset 249
66 #define psx_gpu_texture_mask_width_offset 250
67 #define psx_gpu_texture_mask_height_offset 251
68 #define psx_gpu_texture_window_x_offset 252
69 #define psx_gpu_texture_window_y_offset 253
70 #define psx_gpu_primitive_type_offset 254
72 #define psx_gpu_reserved_a_offset 255
// Larger regions that follow the scalar fields; code forms pointers to them
// with "add <ptr>, psx_gpu, #<offset>".
74 #define psx_gpu_blocks_offset 0x0100
75 #define psx_gpu_span_uvrg_offset_offset 0x2100
76 #define psx_gpu_span_edge_data_offset 0x4100
77 #define psx_gpu_span_b_offset_offset 0x5100
79 #define psx_gpu__vram_offset 0x005900
// Halfword field offsets inside one span_edge_data record. Records are 8
// bytes apart (span_edge_data is advanced with "lsl #3" strides below).
81 #define edge_data_left_x_offset 0
82 #define edge_data_num_blocks_offset 2
83 #define edge_data_right_mask_offset 4
84 #define edge_data_y_offset 6
// Register aliases used by compute_all_gradients. Only part of the alias
// list is visible in this excerpt.
//
// gw_by_l shares gw_bx_l's register: both are only the low-word destinations
// of umull, and only the high words (gw_bx_h / gw_by_h) are read afterwards.
128 #define gw_by_l gw_bx_l
// q3-q5 hold uvrg_xxxx0..2 and q6-q8 hold yyyy_uvrg0..2 (the widened
// per-vertex attribute/coordinate vectors).
148 #define uvrg_xxxx0 q3
153 #define uvrg_xxxx1 q4
158 #define uvrg_xxxx2 q5
163 #define yyyy_uvrg0 q6
168 #define yyyy_uvrg1 q7
173 #define yyyy_uvrg2 q8
// Base interpolant vector: initialised from uvrg0 << 16, adjusted, and then
// stored to psx_gpu by compute_all_gradients.
200 #define uvrg_base q11
// function(name) opens each routine in this file, e.g.
// function(compute_all_gradients). Its replacement text is not visible in
// this excerpt.
246 #define function(name) \
// compute_all_gradients
//
// Computes the interpolant gradients and base values of a triangle and stores
// them into the psx_gpu state block.
//  - u, v, r and g are processed four lanes at a time on NEON; b is processed
//    on the ARM integer side, interleaved with the NEON work.
//  - Each gradient is the difference of products written out in the comment
//    further down, scaled by a reciprocal of psx_gpu->triangle_area. The
//    reciprocal comes from a double-precision vdiv that is issued first.
//  - Gradients are multiplied as magnitudes; their sign (combined with the
//    negated triangle_winding byte) is re-applied afterwards with an
//    eor/sub pair.
// r4-r11 and lr are pushed on entry; the final ldmia restores r4-r11 and
// returns by loading pc.
// The register aliases used here (psx_gpu, v_a, v_b, v_c, x0, dx, ...) are
// #defined in parts of the file that are not all visible in this excerpt, and
// a number of instructions of this routine are missing from it as well.
255 function(compute_all_gradients)
256 // First compute the triangle area reciprocal and shift. The division will
257 // happen concurrently with much of the work which follows.
258 @ r12 = psx_gpu->triangle_area
259 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
260 stmdb sp!, { r4 - r11, lr }
262 @ load exponent of 62 into upper half of double
264 clz r14, r12 @ r14 = shift
266 movt r4, #((62 + 1023) << 4)
267 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
269 @ load area normalized into lower half of double
271 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
273 movt r4, #((1022 + 31) << 4)
276 add r4, r4, r12, lsr #11
279 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
281 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
282 // ( d0 * d1 ) - ( d2 * d3 ) =
283 // ( m0 ) - ( m1 ) = gradient
285 // This is split to do 12 elements at a time over three sets: a, b, and c.
286 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
287 // two of the slots are unused.
289 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
// is the gradient: the vmull.s16 / vmlsl.s16 pairs below accumulate into
// 32-bit lanes. (The original continuation of the sentence above is missing
// from this excerpt.)
292 // First type is: uvrg bxxx xxxx
293 // Second type is: yyyy ybyy uvrg
294 // Since x_a and y_c are the same the same variable is used for both.
// x0 is loaded sign-extended because it is later used as a full 32-bit
// multiplicand in "mls b_base, g_bx, x0, b_base". The other coordinates are
// loaded zero-extended so that they can be packed into 16-bit halves with
// orr for the ssub16 operations.
296 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
297 ldrsh x0, [ v_a, #8 ] @ load x0
299 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
300 ldrh x1, [ v_b, #8 ] @ load x1
302 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
303 ldrh x2, [ v_c, #8 ] @ load x2
305 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
306 ldrh y0, [ v_a, #10 ] @ load y0
308 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
309 ldrh y1, [ v_b, #10 ] @ load y1
311 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
312 ldrh y2, [ v_c, #10 ] @ load y2
314 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
315 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
317 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
318 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
320 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
321 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
323 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
324 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
326 ldrb b2, [ v_c, #4 ] @ load b2
327 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
329 ldrb b1, [ v_b, #4 ] @ load b1
330 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
332 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
333 vsub.s16 d0_ab, x1_ab, x0_ab
335 ldrb b0, [ v_a, #4 ] @ load b0
336 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
338 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
339 vsub.s16 d2_ab, x2_ab, x1_ab
341 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
342 vsub.s16 d1_ab, y2_ab, y1_ab
// NOTE: the trailing comment on the next instruction is stale. The
// instruction builds b0_b1 = { b0, b1 }.
344 orr b0_b1, b0, b1, lsl #16 @ b1_b2 = { b1, b2 }
345 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
347 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
348 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
350 vsub.s16 d3_ab, y1_ab, y0_ab
351 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
352 @ ((x2 - X1) * (b1 - b0))
353 vmull.s16 ga_uvrg_x, d0_a, d1_a
354 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
355 @ ((b2 - b1) * (y1 - y0))
356 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
// Scalar sign split: gs_b* becomes 0 or -1 (arithmetic shift by 31) and the
// conditional rsbmi leaves the magnitude in ga_b*.
357 movs gs_bx, ga_bx, asr #31
359 vmull.s16 ga_uvrg_y, d0_b, d1_b
360 rsbmi ga_bx, ga_bx, #0
362 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
363 movs gs_by, ga_by, asr #31
365 vshr.u64 d0, d30, #22
366 mov b_base, b0, lsl #16
368 rsbmi ga_by, ga_by, #0
369 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
371 @ r12 = psx_gpu->triangle_winding_offset
372 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
373 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
375 add b_base, b_base, #0x8000
376 rsb r12, r12, #0 @ r12 = -(triangle->winding)
378 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
379 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
381 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
382 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
384 vorr.u32 uvrg_base, #0x8000
385 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
387 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
388 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
// Magnitude * reciprocal as 64-bit products, then a per-lane shift by
// r_shift and a narrow back to 32 bits.
390 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
391 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
392 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
393 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
395 vshl.u64 gw_rg_x, gw_rg_x, r_shift
396 vshl.u64 gw_uv_x, gw_uv_x, r_shift
397 vshl.u64 gw_rg_y, gw_rg_y, r_shift
398 vshl.u64 gw_uv_y, gw_uv_y, r_shift
// Sign re-application: the sign masks are xor-ed with w_mask, then
// (g ^ mask) - mask negates exactly the lanes whose mask is all ones.
400 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
401 vmovn.u64 g_uv_x, gw_uv_x
403 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
404 vmovn.u64 g_rg_x, gw_rg_x
406 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
407 vmovn.u64 g_uv_y, gw_uv_y
409 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
410 vmovn.u64 g_rg_y, gw_rg_y
412 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
413 mov ga_bx, ga_bx, lsl #13
415 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
416 mov ga_by, ga_by, lsl #13
419 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
421 vshl.u32 g_uvrg_x, g_uvrg_x, #4
422 vshl.u32 g_uvrg_y, g_uvrg_y, #4
424 umull gw_by_l, gw_by_h, ga_by, area_r_s
425 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
427 eor gs_bx, gs_bx, r12
428 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
430 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
431 eor gs_by, gs_by, r12
433 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
434 add store_a, psx_gpu, #psx_gpu_uvrg_offset
436 sub r11, r11, #(32 - 13)
438 add store_b, store_a, #16
// Results are written through two post-incrementing pointers: store_a starts
// at psx_gpu->uvrg and store_b 16 bytes after it. The instruction that sets
// store_inc is not visible in this excerpt.
441 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
442 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
444 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
445 mov g_bx, gw_bx_h, lsr r11
447 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
448 mov g_by, gw_by_h, lsr r11
450 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
451 [ store_b, : 128 ], store_inc
// Same (value ^ mask) - mask conditional negate as above, for the scalar b
// gradients.
452 eor g_bx, g_bx, gs_bx
454 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
455 [ store_b, : 128 ], store_inc
456 sub g_bx, g_bx, gs_bx
459 eor g_by, g_by, gs_by
461 mls b_base, g_bx, x0, b_base
462 sub g_by, g_by, gs_by
467 add g_bx2, g_bx, g_bx
468 add g_bx3, g_bx, g_bx2
// Six consecutive words at store_b: 0/1/2/3 multiples of g_bx (g_bx0 is set
// by an instruction missing from this excerpt -- presumably zero), then
// b_base and g_by.
470 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
472 ldmia sp!, { r4 - r11, pc }
// Register aliases for the setup_spans_* routines. Only part of the alias
// list is visible in this excerpt.
//
// Many names below deliberately map to the same physical register. Each such
// pair can only be valid at different points of a routine, so the order in
// which the setup_spans macros are invoked matters.
489 #define height_minor_a r7
490 #define height_minor_b r8
491 #define height_major r9
// r10 is both the reciprocal table pointer and edge_shift_alt (see below):
// loading edge_shift_alt overwrites the table pointer.
494 #define reciprocal_table_ptr r10
// Scalar state of the alternate (third) edge: a 64-bit x in
// edge_alt_high:edge_alt_low, its per-row delta and its shift.
496 #define edge_alt_low r4
497 #define edge_alt_high r5
498 #define edge_dx_dy_alt r6
499 #define edge_shift_alt r10
// Same registers as edge_alt_low / edge_alt_high.
501 #define edge_dx_dy_alt_low r4
502 #define edge_dx_dy_alt_high r5
// Span output pointers; these reuse r4-r6, i.e. the alternate-edge registers
// above.
504 #define span_edge_data r4
505 #define span_uvrg_offset r5
506 #define span_b_offset r6
// NEON registers for the alternate edge (q0-q2) and their d-register halves.
514 #define alternate_x q0
515 #define alternate_dx_dy q1
516 #define alternate_x_32 q2
518 #define alternate_x_low d0
519 #define alternate_x_high d1
520 #define alternate_dx_dy_low d2
521 #define alternate_dx_dy_high d3
522 #define alternate_x_32_low d4
523 #define alternate_x_32_high d5
// Left/right edge positions, deltas and viewport clamp values.
527 #define left_dx_dy q5
528 #define right_dx_dy q6
530 #define right_edge q8
532 #define left_x_low d6
533 #define left_x_high d7
534 #define right_x_low d8
535 #define right_x_high d9
536 #define left_dx_dy_low d10
537 #define left_dx_dy_high d11
538 #define right_dx_dy_low d12
539 #define right_dx_dy_high d13
540 #define left_edge_low d14
541 #define left_edge_high d15
542 #define right_edge_low d16
543 #define right_edge_high d17
545 #define y_mid_point d18
// left_right_x_16 (q11) packs the 16-bit left x values in its low half (d22)
// and the right x values in its high half (d23).
548 #define left_right_x_16 q11
549 #define span_shifts_y q12
552 #define span_shifts d24
557 #define left_right_x_16_low d22
558 #define left_right_x_16_high d23
// Shares d4 with alternate_x_32_low.
563 #define alternate_x_16 d4
// Shares d6 with left_x_low and height_reciprocals.
566 #define v_clip_low d6
// left_x_32 shares q11 with left_right_x_16; alternate_select shares d24
// with span_shifts.
568 #define right_x_32 q10
569 #define left_x_32 q11
570 #define alternate_select d24
572 #define right_x_32_low d20
573 #define right_x_32_high d21
574 #define left_x_32_low d22
575 #define left_x_32_high d23
// Edge computation working set. edges_dx_dy / edge_shifts are d2 / d3, the
// two halves of q1; edges_xy_left / edges_xy_right are d0 / d1.
578 #define edges_dx_dy d2
579 #define edge_shifts d3
580 #define edge_shifts_64 q2
582 #define edges_xy_left d0
583 #define edges_xy_right d1
585 #define height_reciprocals d6
593 #define heights_b d12
594 #define edges_dx_dy_64 q10
596 #define edges_dx_dy_64_left d20
597 #define edges_dx_dy_64_right d21
// setup_spans_prologue()
//
// Common entry sequence of the setup_spans_* routines:
//  - pushes r4-r11 and lr (undone by setup_spans_epilogue);
//  - loads the signed 16-bit x (offset 8) and y (offset 10) of vertices
//    v_a, v_b and v_c;
//  - loads the uvrg and uvrg_dy vectors from psx_gpu;
//  - points reciprocal_table_ptr at reciprocal_table with a movw/movt pair;
//  - sets every 32-bit lane of c_0x01 to 1.
// Some lines of this macro are not visible in this excerpt.
600 #define setup_spans_prologue() \
601 stmdb sp!, { r4 - r11, lr }; \
603 ldrsh x_a, [ v_a, #8 ]; \
604 ldrsh x_b, [ v_b, #8 ]; \
605 ldrsh x_c, [ v_c, #8 ]; \
606 ldrsh y_a, [ v_a, #10 ]; \
607 ldrsh y_b, [ v_b, #10 ]; \
608 ldrsh y_c, [ v_c, #10 ]; \
610 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
611 vld1.32 { uvrg }, [ temp ]; \
612 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
613 vld1.32 { uvrg_dy }, [ temp ]; \
614 movw reciprocal_table_ptr, :lower16:reciprocal_table; \
615 movt reciprocal_table_ptr, :upper16:reciprocal_table; \
617 vmov.u32 c_0x01, #0x01 \
// setup_spans_load_b()
//
// Loads the scalar b interpolant and its per-row delta (b, b_dy) from the
// psx_gpu state block.
619 #define setup_spans_load_b() \
620 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
621 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
// setup_spans_prologue_b()
//
// Prepares the span emission step:
//  - points span_uvrg_offset, span_edge_data and span_b_offset at their
//    arrays inside psx_gpu;
//  - broadcasts viewport_start_x into left_edge and viewport_end_x into
//    right_edge (vld1 with the all-lanes form), then adds 1 to right_edge;
//  - sets the 16-bit lane constants c_0x0004, c_0x0001, c_0x0007 and
//    c_0xFFFE (the latter via vmvn of 0x0001).
// The three span pointers are #defined as r4/r5/r6, the same registers as
// edge_alt_low / edge_alt_high / edge_dx_dy_alt, so this macro overwrites the
// scalar alternate-edge state.
623 #define setup_spans_prologue_b() \
624 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
625 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
627 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
628 vmov.u16 c_0x0004, #0x0004; \
630 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
631 vmov.u16 c_0x0001, #0x0001; \
633 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
634 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
636 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
637 vadd.u16 right_edge, right_edge, c_0x0001; \
639 vmov.u16 c_0x0007, #0x0007; \
640 vmvn.u16 c_0xFFFE, #0x0001 \
// compute_edge_delta_x2()
//
// Edge setup for two edges that share one height (both NEON lanes use the
// same reciprocal table entry):
//  - temp = reciprocal_table[height] (word load), broadcast into edge_shifts;
//  - height_reciprocals = entry >> 12; vbic.u16 #0xE0 then clears bits 5-7 of
//    each 16-bit lane of edge_shifts;
//  - widths = x_ends - x_starts;
//  - heights_b = (heights - 1) + x_starts * heights;
//  - edges_dx_dy = widths * height_reciprocals;
//  - edges_xy = heights_b * height_reciprocals, widened to 64 bits.
// Expects height, x_starts, x_ends and c_0x01 to be set up by the caller.
643 #define compute_edge_delta_x2() \
644 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
646 vdup.u32 heights, height; \
647 vsub.u32 widths, x_ends, x_starts; \
649 vdup.u32 edge_shifts, temp; \
650 vsub.u32 heights_b, heights, c_0x01; \
651 vshr.u32 height_reciprocals, edge_shifts, #12; \
653 vmla.s32 heights_b, x_starts, heights; \
654 vbic.u16 edge_shifts, #0xE0; \
655 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
656 vmull.s32 edges_xy, heights_b, height_reciprocals \
// Scalar temporaries for the alternate edge in compute_edge_delta_x3.
659 #define height_reciprocal_alt r11
660 #define height_b_alt r12
// compute_edge_delta_x3(start_c, height_a, height_b)
//
// Edge setup for three edges. Two are computed in the NEON lanes exactly as
// in compute_edge_delta_x2, but with a separate height (height_a, height_b)
// and reciprocal table entry per lane. The third ("alternate") edge is
// computed on the ARM integer side, interleaved with the NEON work:
//  - its table entry is indexed by height_minor_b;
//  - width_alt = x_c - start_c;
//  - height_b_alt = (height_minor_b - 1) + height_minor_b * start_c;
//  - height_reciprocal_alt = entry >> 12 and edge_shift_alt = entry & 0x1F;
//  - edge_dx_dy_alt = width_alt * height_reciprocal_alt;
//  - edge_alt_high:edge_alt_low = height_b_alt * height_reciprocal_alt
//    (signed 64-bit product).
// edge_shift_alt is #defined as the same register as reciprocal_table_ptr, so
// the table pointer is overwritten by the third lookup; that lookup is the
// last one in the macro.
662 #define compute_edge_delta_x3(start_c, height_a, height_b) \
663 vmov.u32 heights, height_a, height_b; \
664 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
665 vmov.u32 edge_shifts[0], temp; \
666 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
667 vmov.u32 edge_shifts[1], temp; \
668 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
670 vsub.u32 widths, x_ends, x_starts; \
671 sub width_alt, x_c, start_c; \
673 vsub.u32 heights_b, heights, c_0x01; \
674 sub height_b_alt, height_minor_b, #1; \
676 vshr.u32 height_reciprocals, edge_shifts, #12; \
677 lsr height_reciprocal_alt, edge_shift_alt, #12; \
679 vmla.s32 heights_b, x_starts, heights; \
680 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
682 vbic.u16 edge_shifts, #0xE0; \
683 and edge_shift_alt, edge_shift_alt, #0x1F; \
685 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
686 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
688 vmull.s32 edges_xy, heights_b, height_reciprocals; \
689 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
// Small helpers selected by token pasting (<name>_##direction or
// <name>_##alternate_active) from the larger setup_spans macros.
//
// setup_spans_adjust_y_up / _down: step the y_x4 vector by c_0x0004 (minus
// for up, plus for down).
692 #define setup_spans_adjust_y_up() \
693 vsub.u32 y_x4, y_x4, c_0x0004 \
695 #define setup_spans_adjust_y_down() \
696 vadd.u32 y_x4, y_x4, c_0x0004 \
// setup_spans_adjust_interpolants_up / _down: step uvrg by -uvrg_dy / +uvrg_dy.
// Each macro continues past the line shown; the continuation is not visible
// in this excerpt.
698 #define setup_spans_adjust_interpolants_up() \
699 vsub.u32 uvrg, uvrg, uvrg_dy; \
702 #define setup_spans_adjust_interpolants_down() \
703 vadd.u32 uvrg, uvrg, uvrg_dy; \
// setup_spans_clip_interpolants_increment / _decrement: move the interpolants
// by clip rows at once, b += / -= b_dy * clip and uvrg += / -= uvrg_dy * v_clip.
707 #define setup_spans_clip_interpolants_increment() \
708 mla b, b_dy, clip, b; \
709 vmla.s32 uvrg, uvrg_dy, v_clip \
711 #define setup_spans_clip_interpolants_decrement() \
712 mls b, b_dy, clip, b; \
713 vmls.s32 uvrg, uvrg_dy, v_clip \
// setup_spans_clip_alternate_yes: 64-bit accumulate,
// edge_alt_high:edge_alt_low += edge_dx_dy_alt * clip.
// setup_spans_clip_alternate_no: expands to nothing.
715 #define setup_spans_clip_alternate_yes() \
716 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
718 #define setup_spans_clip_alternate_no() \
// setup_spans_clip(direction, alternate_active)
//
// Skips clip rows: broadcasts clip into v_clip, advances the alternate edge
// (when alternate_active is yes) and the interpolants (direction is
// increment or decrement), then adds edges_dx_dy * v_clip_low, widened to
// 64 bits, to edges_xy.
720 #define setup_spans_clip(direction, alternate_active) \
721 vdup.u32 v_clip, clip; \
722 setup_spans_clip_alternate_##alternate_active(); \
723 setup_spans_clip_interpolants_##direction(); \
724 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
// setup_spans_adjust_edges_alternate_no(left_index, right_index)
//
// Converts the two NEON edges into the form used for span stepping:
//  - sign-extends edge_shifts and edges_dx_dy to 64-bit lanes and shifts both
//    edges_xy and the widened deltas left by the per-edge shift;
//  - left_index / right_index (left or right) are pasted onto edges_xy_ and
//    edges_dx_dy_64_ to choose which lane feeds the left and the right side;
//  - for each side the low half holds x and the high half x + dx_dy, i.e. two
//    consecutive rows;
//  - finally doubles both delta vectors, so one add of left_dx_dy /
//    right_dx_dy advances both lanes by two rows.
727 #define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
728 vmovl.s32 edge_shifts_64, edge_shifts; \
729 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
731 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
732 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
734 vmov left_x_low, edges_xy_##left_index; \
735 vmov right_x_low, edges_xy_##right_index; \
737 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
738 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
739 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
740 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
742 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
743 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
745 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
746 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
// setup_spans_adjust_edges_alternate_yes(left_index, right_index)
//
// Does everything setup_spans_adjust_edges_alternate_no does, then moves the
// scalar alternate edge into NEON registers in the same two-rows-per-register
// form:
//  - y_mid_point = y_b in every 16-bit lane;
//  - edge_alt_high:edge_alt_low is shifted left by edge_shift_alt as a 64-bit
//    value using scalar ops (temp = 32 - shift carries the bits that cross
//    the word boundary), then copied into alternate_x_low;
//  - edge_dx_dy_alt is widened to 64 bits with the same shift applied
//    (high = asr by temp, low = lsl by shift) and copied into both halves of
//    alternate_dx_dy;
//  - alternate_x_high = alternate_x_low + dx_dy, and the delta is doubled.
// edge_dx_dy_alt_low / _high are the same registers as edge_alt_low / _high;
// they are only written after edge_alt has been copied to alternate_x_low.
749 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
750 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
752 vdup.u16 y_mid_point, y_b; \
753 rsb temp, edge_shift_alt, #32; \
755 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
756 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
757 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
758 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
760 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
761 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
762 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
763 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
765 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
766 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
// setup_spans_y_select_up / _down: build the per-row mask alternate_select by
// comparing the 16-bit y values in y_x4 with y_mid_point (signed "less than"
// for up, signed "greater than" for down).
769 #define setup_spans_y_select_up() \
770 vclt.s16 alternate_select, y_x4, y_mid_point \
772 #define setup_spans_y_select_down() \
773 vcgt.s16 alternate_select, y_x4, y_mid_point \
// setup_spans_alternate_select_left / _right: where alternate_select is set,
// replace the left (low half) or right (high half) x values in
// left_right_x_16 with the alternate edge's x values.
776 #define setup_spans_alternate_select_left() \
777 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
779 #define setup_spans_alternate_select_right() \
780 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
// setup_spans_set_x4_alternate_yes(alternate, direction)
//
// Emits four consecutive spans while an alternate (third) edge is active.
// alternate is left or right, direction is up or down.
//  1. Takes the upper 32 bits (vshrn #32) of the 64-bit alternate, left and
//     right x values for two rows, steps all three by their dx_dy, and
//     repeats for the next two rows.
//  2. Narrows to 16 bits: left x goes to the low half and right x to the high
//     half of left_right_x_16, alternate x to alternate_x_16.
//  3. setup_spans_y_select_<direction> builds a per-row mask against
//     y_mid_point; setup_spans_alternate_select_<alternate> substitutes the
//     alternate x on the chosen side where the mask is set.
//  4. Clamps both halves with vmax against left_edge and vmin against
//     right_edge.
//  5. width = right - left. The high half becomes (width + 7) >> 3 and
//     span_shifts becomes 0xFFFE << ((width + 7) & 7), per 16-bit lane.
//  6. vst4.u16 interleaves the four d registers of left_right_x_16 and
//     span_shifts_y into span_edge_data with writeback. The first three
//     correspond to the edge_data left_x / num_blocks / right_mask fields;
//     the fourth is presumably y_x4 (its #define is not visible here).
// Interleaved with the above, uvrg and b are stored once per row to
// span_uvrg_offset / span_b_offset, each store followed by one interpolant
// step in the given direction. y_x4 is stepped at the end.
783 #define setup_spans_set_x4_alternate_yes(alternate, direction) \
784 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
785 vshrn.s64 left_x_32_low, left_x, #32; \
786 vshrn.s64 right_x_32_low, right_x, #32; \
788 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
789 vadd.u64 left_x, left_x, left_dx_dy; \
790 vadd.u64 right_x, right_x, right_dx_dy; \
792 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
793 vshrn.s64 left_x_32_high, left_x, #32; \
794 vshrn.s64 right_x_32_high, right_x, #32; \
796 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
797 vadd.u64 left_x, left_x, left_dx_dy; \
798 vadd.u64 right_x, right_x, right_dx_dy; \
800 vmovn.u32 alternate_x_16, alternate_x_32; \
801 setup_spans_y_select_##direction(); \
802 vmovn.u32 left_right_x_16_low, left_x_32; \
804 vmovn.u32 left_right_x_16_high, right_x_32; \
805 setup_spans_alternate_select_##alternate(); \
807 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
808 str b, [ span_b_offset ], #4; \
809 setup_spans_adjust_interpolants_##direction(); \
811 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
813 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
814 str b, [ span_b_offset ], #4; \
815 setup_spans_adjust_interpolants_##direction(); \
817 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
819 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
820 str b, [ span_b_offset ], #4; \
821 setup_spans_adjust_interpolants_##direction(); \
823 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
824 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
825 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
827 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
828 str b, [ span_b_offset ], #4; \
829 setup_spans_adjust_interpolants_##direction(); \
831 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
832 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
834 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
836 setup_spans_adjust_y_##direction() \
// setup_spans_set_x4_alternate_no(alternate, direction)
//
// Emits four consecutive spans from the left and right edges only. The
// alternate argument is accepted for symmetry with the _yes variant but is
// not used in the visible body; direction is up or down.
//  1. Takes the upper 32 bits (vshrn #32) of the 64-bit left and right x
//     values for two rows, steps both by their dx_dy, and repeats for the
//     next two rows.
//  2. Narrows to 16 bits: left x in the low half, right x in the high half of
//     left_right_x_16.
//  3. Clamps both halves with vmax against left_edge and vmin against
//     right_edge.
//  4. width = right - left. The high half becomes (width + 7) >> 3 and
//     span_shifts becomes 0xFFFE << ((width + 7) & 7), per 16-bit lane.
//  5. vst4.u16 interleaves left_right_x_16 and span_shifts_y into
//     span_edge_data with writeback.
// Interleaved with the above, uvrg and b are stored once per row to
// span_uvrg_offset / span_b_offset, each store followed by one interpolant
// step in the given direction. y_x4 is stepped at the end.
839 #define setup_spans_set_x4_alternate_no(alternate, direction) \
840 vshrn.s64 left_x_32_low, left_x, #32; \
841 vshrn.s64 right_x_32_low, right_x, #32; \
843 vadd.u64 left_x, left_x, left_dx_dy; \
844 vadd.u64 right_x, right_x, right_dx_dy; \
846 vshrn.s64 left_x_32_high, left_x, #32; \
847 vshrn.s64 right_x_32_high, right_x, #32; \
849 vadd.u64 left_x, left_x, left_dx_dy; \
850 vadd.u64 right_x, right_x, right_dx_dy; \
852 vmovn.u32 left_right_x_16_low, left_x_32; \
853 vmovn.u32 left_right_x_16_high, right_x_32; \
855 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
856 str b, [ span_b_offset ], #4; \
857 setup_spans_adjust_interpolants_##direction(); \
859 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
861 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
862 str b, [ span_b_offset ], #4; \
863 setup_spans_adjust_interpolants_##direction(); \
865 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
867 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
868 str b, [ span_b_offset ], #4; \
869 setup_spans_adjust_interpolants_##direction(); \
871 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
872 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
873 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
875 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
876 str b, [ span_b_offset ], #4; \
877 setup_spans_adjust_interpolants_##direction(); \
879 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
880 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
882 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
884 setup_spans_adjust_y_##direction() \
// Scalar temporaries for the alternate-edge adjustment. r11 / r12 are the
// same registers as height_reciprocal_alt / height_b_alt.
887 #define edge_adjust_low r11
888 #define edge_adjust_high r12
// setup_spans_alternate_adjust_yes: 64-bit subtract,
// edge_alt_high:edge_alt_low -= edge_dx_dy_alt * height_minor_a
// (signed 64-bit product; subs/sbc propagate the borrow).
// setup_spans_alternate_adjust_no: expands to nothing.
890 #define setup_spans_alternate_adjust_yes() \
891 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
892 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
893 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
895 #define setup_spans_alternate_adjust_no() \
// setup_spans_down(left_index, right_index, alternate, alternate_active)
//
// Shared body of the span setup routines that walk from y_a towards larger y.
// Several lines of this macro (branches, local labels, part of the y_x4
// construction and whatever follows the final subs) are missing from this
// excerpt; the notes below cover only what is shown.
//  - applies the alternate-edge adjustment (when alternate_active is yes) and
//    loads b / b_dy;
//  - y_c -= viewport_end_y; when the result is > 0, height becomes
//    height - y_c + 1;
//  - clip = viewport_start_y - y_a; height -= clip, y_a += clip, and
//    setup_spans_clip(increment, ...) advances edges and interpolants by clip
//    rows (the condition guarding this step is not visible);
//  - builds y_x4 from y_a packed into 16-bit halves;
//  - converts the edges to stepping form, sets up the span output pointers,
//    stores height to psx_gpu->num_spans and emits four spans, then subtracts
//    4 from height and sets the flags.
// left_index / right_index select edge lanes by token pasting; alternate is
// left, right or none and alternate_active is yes or no.
898 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
899 setup_spans_alternate_adjust_##alternate_active(); \
900 setup_spans_load_b(); \
902 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
903 subs y_c, y_c, temp; \
904 subgt height, height, y_c; \
905 addgt height, height, #1; \
907 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
908 subs clip, temp, y_a; \
911 sub height, height, clip; \
912 add y_a, y_a, clip; \
913 setup_spans_clip(increment, alternate_active); \
919 orr temp, y_a, y_a, lsl #16; \
920 add temp, temp, #(1 << 16); \
922 add y_a, y_a, #(2 << 16); \
923 vmov.u32 y_x4, temp, y_a; \
925 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
927 setup_spans_prologue_b(); \
929 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
932 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
933 subs height, height, #4; \
// setup_spans_alternate_pre_increment_yes: 64-bit add of the sign-extended
// delta, edge_alt_high:edge_alt_low += edge_dx_dy_alt ("asr #31" supplies the
// sign word for the adc).
// setup_spans_alternate_pre_increment_no: expands to nothing.
939 #define setup_spans_alternate_pre_increment_yes() \
940 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
941 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
943 #define setup_spans_alternate_pre_increment_no() \
// setup_spans_up_decrement_yes: subtracts 1 from height when the LE condition
// holds. It relies on the flags left by the instruction that precedes the
// macro invocation.
// setup_spans_up_decrement_no: expands to nothing.
946 #define setup_spans_up_decrement_yes() \
947 suble height, height, #1 \
949 #define setup_spans_up_decrement_no() \
// setup_spans_up(left_index, right_index, alternate, alternate_active)
//
// Shared body of the span setup routines that walk from y_a towards smaller
// y. Several lines of this macro (branches, local labels, part of the y_x4
// construction and whatever follows the final subs) are missing from this
// excerpt; the notes below cover only what is shown.
//  - applies the alternate-edge adjustment (when alternate_active is yes) and
//    loads b / b_dy;
//  - temp = viewport_start_y - y_c; when the result is > 0, height -= temp;
//    setup_spans_up_decrement_<alternate_active> then uses the same flags;
//  - clip = y_a - viewport_end_y; height -= clip, y_a -= clip, and
//    setup_spans_clip(decrement, ...) moves edges and interpolants by clip
//    rows (the condition guarding this step is not visible);
//  - builds y_x4 from y_a packed into 16-bit halves;
//  - adds edges_dx_dy once to edges_xy and pre-increments the alternate edge,
//    converts the edges to stepping form, steps the interpolants up once,
//    sets up the span output pointers, stores height to psx_gpu->num_spans
//    and emits four spans, then subtracts 4 from height and sets the flags.
// The viewport bounds are read zero-extended (ldrh) here, whereas
// setup_spans_down reads them sign-extended (ldrsh).
952 #define setup_spans_up(left_index, right_index, alternate, alternate_active) \
953 setup_spans_alternate_adjust_##alternate_active(); \
954 setup_spans_load_b(); \
957 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
958 subs temp, temp, y_c; \
959 subgt height, height, temp; \
960 setup_spans_up_decrement_##alternate_active(); \
962 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
963 subs clip, y_a, temp; \
966 sub height, height, clip; \
967 sub y_a, y_a, clip; \
968 setup_spans_clip(decrement, alternate_active); \
974 orr temp, y_a, y_a, lsl #16; \
975 sub temp, temp, #(1 << 16); \
977 sub y_a, y_a, #(2 << 16); \
978 vmov.u32 y_x4, temp, y_a; \
980 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
982 setup_spans_alternate_pre_increment_##alternate_active(); \
983 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
985 setup_spans_adjust_interpolants_up(); \
986 setup_spans_prologue_b(); \
988 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
991 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
992 subs height, height, #4; \
// setup_spans_epilogue()
//
// Restores r4-r11 and returns by loading the saved lr into pc. Pairs with the
// stmdb in setup_spans_prologue.
998 #define setup_spans_epilogue() \
999 ldmia sp!, { r4 - r11, pc } \
// setup_spans_up_up(minor, major)
//
// Full routine body for a triangle walked upward from y_a through y_b to y_c:
//  - height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
//    height = y_a - y_c;
//  - both NEON edges start at x_a; they end at x_c (lane 0) and x_b (lane 1);
//  - compute_edge_delta_x3 additionally sets up the alternate edge, which
//    starts at x_b;
//  - setup_spans_up runs with the alternate edge active on the minor side.
// minor / major are left or right. height_major is passed to
// compute_edge_delta_x3 without a visible assignment; it is presumably the
// same register as height (that #define is not visible here).
1002 #define setup_spans_up_up(minor, major) \
1003 setup_spans_prologue(); \
1004 sub height_minor_a, y_a, y_b; \
1005 sub height_minor_b, y_b, y_c; \
1006 sub height, y_a, y_c; \
1008 vdup.u32 x_starts, x_a; \
1009 vmov.u32 x_ends, x_c, x_b; \
1011 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1012 setup_spans_up(major, minor, minor, yes); \
1013 setup_spans_epilogue() \
// Entry points: the two instantiations differ only in which side is minor.
1015 function(setup_spans_up_left)
1016 setup_spans_up_up(left, right)
1018 function(setup_spans_up_right)
1019 setup_spans_up_up(right, left)
// setup_spans_down_down(minor, major)
//
// Full routine body for a triangle walked downward from y_a through y_b to
// y_c:
//  - height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
//    height = y_c - y_a;
//  - both NEON edges start at x_a; they end at x_c (lane 0) and x_b (lane 1);
//  - compute_edge_delta_x3 additionally sets up the alternate edge, which
//    starts at x_b;
//  - setup_spans_down runs with the alternate edge active on the minor side.
// minor / major are left or right. height_major is passed to
// compute_edge_delta_x3 without a visible assignment; it is presumably the
// same register as height (that #define is not visible here).
1022 #define setup_spans_down_down(minor, major) \
1023 setup_spans_prologue(); \
1024 sub height_minor_a, y_b, y_a; \
1025 sub height_minor_b, y_c, y_b; \
1026 sub height, y_c, y_a; \
1028 vdup.u32 x_starts, x_a; \
1029 vmov.u32 x_ends, x_c, x_b; \
1031 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1032 setup_spans_down(major, minor, minor, yes); \
1033 setup_spans_epilogue() \
// Entry points: the two instantiations differ only in which side is minor.
1035 function(setup_spans_down_left)
1036 setup_spans_down_down(left, right)
1038 function(setup_spans_down_right)
1039 setup_spans_down_down(right, left)
// setup_spans_up_flat()
//
// Tail shared by setup_spans_up_a / setup_spans_up_b: height = y_a - y_c, two
// edges with a common height (compute_edge_delta_x2), upward span setup with
// no alternate edge, then return. The caller sets x_starts / x_ends first.
1042 #define setup_spans_up_flat() \
1043 sub height, y_a, y_c; \
1045 compute_edge_delta_x2(); \
1046 setup_spans_up(left, right, none, no); \
1047 setup_spans_epilogue() \
// setup_spans_up_a: the edges start at x_a and x_b and both end at x_c.
1049 function(setup_spans_up_a)
1050 setup_spans_prologue()
1052 vmov.u32 x_starts, x_a, x_b
1053 vdup.u32 x_ends, x_c
1055 setup_spans_up_flat()
// setup_spans_up_b: both edges start at x_a and end at x_b and x_c.
1057 function(setup_spans_up_b)
1058 setup_spans_prologue()
1060 vdup.u32 x_starts, x_a
1061 vmov.u32 x_ends, x_b, x_c
1063 setup_spans_up_flat()
// setup_spans_down_flat()
//
// Tail shared by setup_spans_down_a / setup_spans_down_b: height = y_c - y_a,
// two edges with a common height (compute_edge_delta_x2), downward span setup
// with no alternate edge, then return. The caller sets x_starts / x_ends
// first.
1065 #define setup_spans_down_flat() \
1066 sub height, y_c, y_a; \
1068 compute_edge_delta_x2(); \
1069 setup_spans_down(left, right, none, no); \
1070 setup_spans_epilogue() \
// setup_spans_down_a: the edges start at x_a and x_b and both end at x_c.
1072 function(setup_spans_down_a)
1073 setup_spans_prologue()
1075 vmov.u32 x_starts, x_a, x_b
1076 vdup.u32 x_ends, x_c
1078 setup_spans_down_flat()
// setup_spans_down_b: both edges start at x_a and end at x_b and x_c.
1080 function(setup_spans_down_b)
1081 setup_spans_prologue()
1083 vdup.u32 x_starts, x_a
1084 vmov.u32 x_ends, x_b, x_c
1086 setup_spans_down_flat()
// Registers holding the second edge set ("b") used by setup_spans_up_down.
// q11 / d22 / d23 are also used by left_x_32 / left_right_x_16, and d20 by
// right_x_32_low. edges_dx_dy_and_shifts (q1) is the d2:d3 pair, i.e.
// edges_dx_dy and edge_shifts together; edges_dx_dy_and_shifts_b (q13) is
// likewise the d26:d27 pair, edges_dx_dy_b and edge_shifts_b.
1091 #define edges_xy_b q11
1092 #define edges_dx_dy_b d26
1093 #define edge_shifts_b d27
1094 #define edges_dx_dy_and_shifts_b q13
1095 #define height_increment d20
1097 #define edges_dx_dy_and_shifts q1
1099 #define edges_xy_b_left d22
1100 #define edges_xy_b_right d23
// setup_spans_up_down_load_edge_set_b()
//
// Makes edge set b the working edge set: copies edges_xy_b into edges_xy and
// the b deltas/shifts into edges_dx_dy / edge_shifts with one q-register move.
1102 #define setup_spans_up_down_load_edge_set_b() \
1103 vmov edges_xy, edges_xy_b; \
1104 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
// setup_spans_up_down
//
// Span setup for a triangle whose spans are emitted in two passes starting
// at y_a: first upward (set_x4 with direction up), then downward (direction
// down). The heights are height_minor_a = y_a - y_b,
// height_minor_b = y_c - y_a and height_major = y_c - y_b.
// A second edge set (edges_xy_b, edges_dx_dy_b, edge_shifts_b) is prepared
// before the first pass and later copied into the working registers by
// setup_spans_up_down_load_edge_set_b().
// Branches, local labels and a number of instructions are missing from this
// excerpt, so the control flow between the visible instructions (for example
// what follows each cmp / subs) cannot be read from here.
1107 function(setup_spans_up_down)
1108 setup_spans_prologue()
1110 // s32 middle_y = y_a;
1111 sub height_minor_a, y_a, y_b
1112 sub height_minor_b, y_c, y_a
1113 sub height_major, y_c, y_b
1115 vmov.u32 x_starts, x_a, x_c
1116 vdup.u32 x_ends, x_b
1118 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1121 vmov.u32 height_increment, temp, height_minor_b
1122 vmlal.s32 edges_xy, edges_dx_dy, height_increment
// Build edge set b: the left x comes from the scalar alternate edge, the
// right x from the current right lane. Deltas are negated, then lane 0 of the
// delta and of the shift is replaced by the alternate edge's values.
1124 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1125 vmov edges_xy_b_right, edges_xy_right
1127 vmov edge_shifts_b, edge_shifts
1128 vmov.u32 edge_shifts_b[0], edge_shift_alt
1130 vneg.s32 edges_dx_dy_b, edges_dx_dy
1131 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1135 setup_spans_load_b()
// First pass: clip height_minor_a against the viewport.
1138 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1139 subs temp, temp, y_b
1140 subgt height_minor_a, height_minor_a, temp
1142 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1143 subs clip, y_a, temp
1146 sub height_minor_a, height_minor_a, clip
1148 setup_spans_clip(decrement, no)
1151 cmp height_minor_a, #0
1154 orr temp, y_a, y_a, lsl #16
1155 sub temp, temp, #(1 << 16)
1157 sub y_a, y_a, #(2 << 16)
1158 vmov.u32 y_x4, temp, y_a
1160 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1162 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1164 setup_spans_adjust_edges_alternate_no(left, right);
1165 setup_spans_adjust_interpolants_up()
1166 setup_spans_up_down_load_edge_set_b()
1168 setup_spans_prologue_b()
1172 setup_spans_set_x4_alternate_no(none, up)
1173 subs height_minor_a, height_minor_a, #4
// Adjust the three span output pointers by height_minor_a records
// (strides of 8, 16 and 4 bytes).
1176 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1177 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1178 add span_b_offset, span_b_offset, height_minor_a, lsl #2
// Second pass: reload the interpolants from psx_gpu and clip
// height_minor_b against the viewport.
1181 add temp, psx_gpu, #psx_gpu_uvrg_offset
1182 vld1.32 { uvrg }, [ temp ]
1185 setup_spans_load_b()
1187 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1189 subgt height_minor_b, height_minor_b, y_c
1190 addgt height_minor_b, height_minor_b, #1
1192 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1193 subs clip, temp, y_a
1196 sub height_minor_b, height_minor_b, clip
1198 setup_spans_clip(increment, no)
1201 cmp height_minor_b, #0
1204 orr temp, y_a, y_a, lsl #16
1205 add temp, temp, #(1 << 16)
1207 add y_a, y_a, #(2 << 16)
1208 vmov.u32 y_x4, temp, y_a
1210 setup_spans_adjust_edges_alternate_no(left, right)
// num_spans accumulates: the second pass adds its height to the value stored
// by the first pass.
1212 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1213 add temp, temp, height_minor_b
1214 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1217 setup_spans_set_x4_alternate_no(none, down)
1218 subs height_minor_b, height_minor_b, #4
1222 setup_spans_epilogue()
// Code placed after the return; presumably reached by a branch whose
// instruction and label are not visible in this excerpt.
1225 setup_spans_up_down_load_edge_set_b()
1226 setup_spans_prologue_b()
// The span pointer aliases are re-mapped for the setup_blocks_* routines:
// the setup_spans definitions (r5 / r4 / r6) are dropped and replaced by
// r2 / r3 / r4 below.
1230 #undef span_uvrg_offset
1231 #undef span_edge_data
1232 #undef span_b_offset
// Register aliases for the setup_blocks_* routines. Only part of the alias
// list is visible in this excerpt. As in the span setup code, several names
// share one physical register and are therefore only valid at different
// points of a routine.
1237 #define num_spans r1
1238 #define span_uvrg_offset r2
1239 #define span_edge_data r3
1240 #define span_b_offset r4
1242 #define span_num_blocks r6
1246 #define dither_offset_ptr r10
1247 #define block_ptr_a r11
1249 #define num_blocks r14
// uvrg_dx_ptr / texture_mask_ptr share r2 / r3 with span_uvrg_offset /
// span_edge_data; dither_shift shares r8 with right_mask; dither_row,
// block_ptr_b and block_span_ptr share r10 with dither_offset_ptr.
1251 #define uvrg_dx_ptr r2
1252 #define texture_mask_ptr r3
1253 #define dither_shift r8
1254 #define dither_row r10
1259 #define block_ptr_b r10
1261 #define block_span_ptr r10
1262 #define right_mask r8
1281 #define b_whole_8 d14
1282 #define fb_mask_ptrs d15
// Halves of the narrowed per-channel values (d16-d25).
1295 #define u_whole_low d16
1296 #define u_whole_high d17
1297 #define v_whole_low d18
1298 #define v_whole_high d19
1299 #define r_whole_low d20
1300 #define r_whole_high d21
1301 #define g_whole_low d22
1302 #define g_whole_high d23
1303 #define b_whole_low d24
1304 #define b_whole_high d25
// 8-bit packed channel values. u_whole_8b and r_whole_8 both use d24 and
// g_whole_8 uses d25, i.e. the registers of b_whole_low / b_whole_high.
1309 #define u_whole_8 d26
1310 #define v_whole_8 d27
1311 #define u_whole_8b d24
1312 #define r_whole_8 d24
1313 #define g_whole_8 d25
// uv_whole_8 (q13) is the d26:d27 pair, i.e. u_whole_8 and v_whole_8.
1315 #define uv_whole_8 q13
1316 #define uv_whole_8b q14
// dither_offsets shares q14 with uv_whole_8b; dither_offsets_short (d28) is
// its low half. texture_mask (q15) is the d30:d31 pair.
1318 #define dither_offsets q14
1319 #define texture_mask q15
1320 #define texture_mask_u d30
1321 #define texture_mask_v d31
1323 #define dither_offsets_short d28
1327 #define block_span q10
// draw_mask_edge shares q13 with uv_whole_8.
1332 #define draw_mask q1
1333 #define draw_mask_edge q13
1334 #define test_mask q0
// setup_blocks_texture_swizzled()
//
// Rearranges the nibbles of the 8-bit u / v texture coordinates, per byte
// lane:
//  - u_whole_8b = u_whole_8 & texture_mask_u (a masked copy of the original u);
//  - u_whole_8  = (v_whole_8 << 4) | (u_whole_8 & 0x0F)        (vsli #4);
//  - v_whole_8  = (v_whole_8 & 0xF0) | (u_whole_8b >> 4)       (vsri #4).
// u_whole_8b is d24, which is also r_whole_8 / b_whole_low.
//
// setup_blocks_texture_unswizzled() expands to nothing.
1340 #define setup_blocks_texture_swizzled() \
1341 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1342 vsli.u8 u_whole_8, v_whole_8, #4; \
1343 vsri.u8 v_whole_8, u_whole_8b, #4 \
1345 #define setup_blocks_texture_unswizzled() \
/* setup_blocks_shaded_textured_builder(swizzling)
 *
 * Expands to the routine
 * setup_blocks_shaded_textured_dithered_<swizzling>_indirect. The routine
 * reads per-span records (span_edge_data, span_uvrg_offset, span_b_offset)
 * from psx_gpu and writes render blocks into the psx_gpu block buffer; a
 * block index is scaled by 64 bytes (lsl #6). Each block receives the masked
 * and optionally swizzled 8-bit u/v values, the 8-bit r/g/b values, the
 * widened dither offsets, and fb_mask_ptrs (lane 1 = framebuffer pointer,
 * lane 0 = right_mask on the last block of a span).
 *
 * swizzling: `swizzled` or `unswizzled`; selects
 * setup_blocks_texture_##swizzling().
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * how the cmp/subs results steer execution between the sections below cannot
 * be confirmed from here.
 */
1348 #define setup_blocks_shaded_textured_builder(swizzling) \
/* Entry: load the span count and the u/v/r/g per-pixel x gradients. */ \
1351 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1352 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1353 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1355 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1356 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1358 cmp num_spans, #0; \
/* Save r4-r11 and lr; precompute gradient * 4 and gradient * 8. */ \
1361 stmdb sp!, { r4 - r11, r14 }; \
1362 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1364 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1365 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
/* Broadcast the two consecutive bytes at texture_mask_ptr to every lane */ \
/* of texture_mask_u and texture_mask_v respectively. */ \
1367 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1368 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1370 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1371 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1373 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
/* block_ptr_a = block buffer base + num_blocks * 64. */ \
1374 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1376 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1379 vmov.u8 fb_mask_ptrs, #0; \
/* Per-span setup: read block count, y and left_x from the edge record. */ \
1381 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1382 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1384 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
1385 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
1387 cmp span_num_blocks, #0; \
1390 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
/* Running block total including this span, compared against MAX_BLOCKS. */ \
1391 add num_blocks, span_num_blocks, num_blocks; \
1393 cmp num_blocks, #MAX_BLOCKS; \
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2. b and uvrg are advanced by */ \
/* left_x gradient steps. dither_row is the 32-bit word at */ \
/* dither_offset_ptr + y * 4. */ \
1397 ldr b, [ span_b_offset ]; \
1398 add fb_ptr, fb_ptr, y, lsl #11; \
1400 vdup.u32 v_left_x, left_x; \
1403 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1404 add fb_ptr, fb_ptr, left_x, lsl #1; \
1406 mla b, b_dx, left_x, b; \
/* dither_shift = (left_x & 3) * 8, i.e. a whole-byte rotation amount. */ \
1407 and dither_shift, left_x, #0x03; \
1409 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
/* Rebuild the 1x gradient from the 4x copy. */ \
1410 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1412 mov dither_shift, dither_shift, lsl #3; \
1413 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1416 subs span_num_blocks, span_num_blocks, #1; \
1418 mov dither_row, dither_row, ror dither_shift; \
1419 mov b_dx4, b_dx, lsl #2; \
/* Replicate the rotated dither word, then sign-extend each byte to 16 */ \
/* bits with a left shift of 4 (vshll.s8) to form dither_offsets. */ \
1421 vdup.u32 dither_offsets_short, dither_row; \
1422 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1424 vdup.u32 b_block, b; \
1425 vshll.s8 dither_offsets, dither_offsets_short, #4; \
/* Broadcast the span start values to all four lanes, then add the */ \
/* per-lane offsets read from the five consecutive 16-byte u/v/r/g/b */ \
/* block_span entries. */ \
1427 vdup.u32 u_block, uv[0]; \
1428 mov b_dx8, b_dx, lsl #3; \
1430 vdup.u32 v_block, uv[1]; \
1431 vdup.u32 r_block, rg[0]; \
1432 vdup.u32 g_block, rg[1]; \
1434 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1436 vadd.u32 u_block, u_block, block_span; \
1437 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1439 vadd.u32 v_block, v_block, block_span; \
1440 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1442 vadd.u32 r_block, r_block, block_span; \
1443 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1445 vadd.u32 g_block, g_block, block_span; \
1446 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1448 vadd.u32 b_block, b_block, block_span; \
1449 add block_ptr_b, block_ptr_a, #16; \
/* *_whole_low  = bits 31:16 of each lane of the current values. */ \
/* *_whole_high = bits 31:16 of (current values + 4 * gradient). */ \
1451 vshrn.u32 u_whole_low, u_block, #16; \
1452 vshrn.u32 v_whole_low, v_block, #16; \
1453 vshrn.u32 r_whole_low, r_block, #16; \
1454 vshrn.u32 g_whole_low, g_block, #16; \
1456 vdup.u32 dx4, uv_dx4[0]; \
1457 vshrn.u32 b_whole_low, b_block, #16; \
1459 vaddhn.u32 u_whole_high, u_block, dx4; \
1460 vdup.u32 dx4, uv_dx4[1]; \
1462 vaddhn.u32 v_whole_high, v_block, dx4; \
1463 vdup.u32 dx4, rg_dx4[0]; \
1465 vaddhn.u32 r_whole_high, r_block, dx4; \
1466 vdup.u32 dx4, rg_dx4[1]; \
1468 vaddhn.u32 g_whole_high, g_block, dx4; \
1469 vdup.u32 dx4, b_dx4; \
1471 vaddhn.u32 b_whole_high, b_block, dx4; \
/* Advance every interpolant by 8 * gradient for the next block. */ \
1472 vdup.u32 dx8, uv_dx8[0]; \
1474 vadd.u32 u_block, u_block, dx8; \
1475 vdup.u32 dx8, uv_dx8[1]; \
1477 vadd.u32 v_block, v_block, dx8; \
1478 vdup.u32 dx8, rg_dx8[0]; \
1480 vadd.u32 r_block, r_block, dx8; \
1481 vdup.u32 dx8, rg_dx8[1]; \
1483 vadd.u32 g_block, g_block, dx8; \
1484 vdup.u32 dx8, b_dx8; \
1486 vadd.u32 b_block, b_block, dx8; \
/* Narrow the 16-bit values to 8 bits (low byte of each lane). */ \
1487 vmovn.u16 u_whole_8, u_whole; \
1489 vmovn.u16 v_whole_8, v_whole; \
1491 vmovn.u16 b_whole_8, b_whole; \
1493 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
/* Apply the texture mask to u and v, then the selected swizzle. */ \
1495 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1496 setup_blocks_texture_##swizzling(); \
/* r_whole_8 / g_whole_8 overwrite d24 / d25 only after b_whole has been */ \
/* narrowed and the swizzle has finished with u_whole_8b (also d24). */ \
1498 vmovn.u16 r_whole_8, r_whole; \
1502 vmovn.u16 g_whole_8, g_whole; \
/* Block emission: the stores for the block just computed are interleaved */ \
/* with the interpolation of the next one. vst2.u8 writes u and v bytes */ \
/* interleaved; each store post-increments its pointer by c_32. */ \
1503 vshrn.u32 u_whole_low, u_block, #16; \
1505 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1506 vshrn.u32 v_whole_low, v_block, #16; \
1508 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1509 vshrn.u32 r_whole_low, r_block, #16; \
1511 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1512 vshrn.u32 g_whole_low, g_block, #16; \
1514 vdup.u32 dx4, uv_dx4[0]; \
1515 vshrn.u32 b_whole_low, b_block, #16; \
1517 vaddhn.u32 u_whole_high, u_block, dx4; \
1518 vdup.u32 dx4, uv_dx4[1]; \
1520 vaddhn.u32 v_whole_high, v_block, dx4; \
1521 vdup.u32 dx4, rg_dx4[0]; \
1523 vaddhn.u32 r_whole_high, r_block, dx4; \
1524 vdup.u32 dx4, rg_dx4[1]; \
1526 vaddhn.u32 g_whole_high, g_block, dx4; \
1527 vdup.u32 dx4, b_dx4; \
1529 vaddhn.u32 b_whole_high, b_block, dx4; \
1530 vdup.u32 dx8, uv_dx8[0]; \
1532 vadd.u32 u_block, u_block, dx8; \
1533 vdup.u32 dx8, uv_dx8[1]; \
1535 vadd.u32 v_block, v_block, dx8; \
1536 vdup.u32 dx8, rg_dx8[0]; \
1538 vadd.u32 r_block, r_block, dx8; \
1539 vdup.u32 dx8, rg_dx8[1]; \
1541 vadd.u32 g_block, g_block, dx8; \
1542 vdup.u32 dx8, b_dx8; \
1544 vadd.u32 b_block, b_block, dx8; \
1545 vmovn.u16 u_whole_8, u_whole; \
/* Step the framebuffer pointer by 16 bytes for the next block. */ \
1547 add fb_ptr, fb_ptr, #16; \
1548 vmovn.u16 v_whole_8, v_whole; \
1550 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1551 vmovn.u16 b_whole_8, b_whole; \
1555 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1556 subs span_num_blocks, span_num_blocks, #1; \
1558 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1559 setup_blocks_texture_##swizzling(); \
1561 vmovn.u16 r_whole_8, r_whole; \
1565 vmovn.u16 g_whole_8, g_whole; \
/* Final block of the span: right_mask is broadcast to bytes and tested */ \
/* against test_mask (loaded from offset 0 of psx_gpu), giving an */ \
/* all-ones 16-bit lane wherever a mask bit is set. u/v are interleaved */ \
/* with vzip and the lanes selected by draw_mask are cleared; right_mask */ \
/* is also recorded in fb_mask_ptrs lane 0. */ \
1566 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1568 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1569 vdup.u8 draw_mask, right_mask; \
1571 vmov.u32 fb_mask_ptrs[0], right_mask; \
1572 vtst.u16 draw_mask, draw_mask, test_mask; \
1573 vzip.u8 u_whole_8, v_whole_8; \
1575 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1576 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1577 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1578 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1579 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
/* Advance to the next span record: 16 bytes of uvrg, 4 bytes of b and */ \
/* 8 bytes of edge data; count the span down. */ \
1582 add span_uvrg_offset, span_uvrg_offset, #16; \
1583 add span_b_offset, span_b_offset, #4; \
1585 add span_edge_data, span_edge_data, #8; \
1586 subs num_spans, num_spans, #1; \
/* Exit: write the block total back and return by popping pc. */ \
1588 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1591 ldmia sp!, { r4 - r11, pc }; \
/* Flush path: texture_mask, uvrg_dx4 and r0-r3/r12/lr are preserved */ \
/* around the call to flush_render_block_buffer. Afterwards uvrg_dx8 is */ \
/* rebuilt as 2 * uvrg_dx4, fb_mask_ptrs is cleared, num_blocks restarts */ \
/* at this span's count and block_ptr_a returns to the buffer base. */ \
/* NOTE(review): the entry to and exit from this path are not visible. */ \
1594 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1595 vpush { texture_mask }; \
1596 vpush { uvrg_dx4 }; \
1598 stmdb sp!, { r0 - r3, r12, r14 }; \
1599 bl flush_render_block_buffer; \
1600 ldmia sp!, { r0 - r3, r12, r14 }; \
1602 vpop { uvrg_dx4 }; \
1603 vpop { texture_mask }; \
1605 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1606 vmov.u8 fb_mask_ptrs, #0; \
1608 mov num_blocks, span_num_blocks; \
1609 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1613 setup_blocks_shaded_textured_builder(swizzled)
1614 setup_blocks_shaded_textured_builder(unswizzled)
/* setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Expands to the routine
 * setup_blocks_unshaded_textured_dithered_<swizzling>_indirect. It has the
 * same per-span structure as the shaded textured builder, but only u and v
 * are interpolated; no r/g/b values are computed or stored. Each 64-byte
 * block receives the masked and optionally swizzled u/v bytes, the widened
 * dither offsets and fb_mask_ptrs.
 *
 * swizzling: `swizzled` or `unswizzled`; selects
 * setup_blocks_texture_##swizzling().
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the control flow between the sections below cannot be confirmed from here.
 */
1617 #define setup_blocks_unshaded_textured_builder(swizzling) \
/* Entry: load the span count and the per-pixel x gradients. */ \
1620 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1621 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1622 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1624 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1625 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1627 cmp num_spans, #0; \
/* Save r4-r11 and lr; precompute gradient * 4 and gradient * 8. */ \
1630 stmdb sp!, { r4 - r11, r14 }; \
1631 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1633 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
/* Broadcast the two consecutive mask bytes to texture_mask_u / _v. */ \
1635 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1636 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1638 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1639 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
/* block_ptr_a = block buffer base + num_blocks * 64. */ \
1641 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1643 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1646 vmov.u8 fb_mask_ptrs, #0; \
/* Per-span setup: read block count, y and left_x from the edge record. */ \
1648 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1649 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1651 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
1652 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
1654 cmp span_num_blocks, #0; \
1657 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
/* Running block total including this span, compared against MAX_BLOCKS. */ \
1658 add num_blocks, span_num_blocks, num_blocks; \
1660 cmp num_blocks, #MAX_BLOCKS; \
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2; dither_row is the 32-bit */ \
/* word at dither_offset_ptr + y * 4. */ \
1664 add fb_ptr, fb_ptr, y, lsl #11; \
1666 vdup.u32 v_left_x, left_x; \
1669 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1670 add fb_ptr, fb_ptr, left_x, lsl #1; \
/* dither_shift = (left_x & 3) * 8, i.e. a whole-byte rotation amount. */ \
1672 and dither_shift, left_x, #0x03; \
1674 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
/* Rebuild the 1x gradient from the 4x copy, then advance uvrg by left_x */ \
/* gradient steps. */ \
1675 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1677 mov dither_shift, dither_shift, lsl #3; \
1678 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1681 subs span_num_blocks, span_num_blocks, #1; \
1683 mov dither_row, dither_row, ror dither_shift; \
/* Replicate the rotated dither word, then sign-extend each byte to 16 */ \
/* bits with a left shift of 4 to form dither_offsets. */ \
1685 vdup.u32 dither_offsets_short, dither_row; \
1686 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1688 vshll.s8 dither_offsets, dither_offsets_short, #4; \
/* Broadcast the u/v start values and add the per-lane span offsets. */ \
1690 vdup.u32 u_block, uv[0]; \
1692 vdup.u32 v_block, uv[1]; \
1693 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1695 vadd.u32 u_block, u_block, block_span; \
1696 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1698 vadd.u32 v_block, v_block, block_span; \
1699 add block_ptr_b, block_ptr_a, #16; \
/* *_whole_low  = bits 31:16 of each lane of the current values. */ \
/* *_whole_high = bits 31:16 of (current values + 4 * gradient). */ \
1701 vshrn.u32 u_whole_low, u_block, #16; \
1702 vshrn.u32 v_whole_low, v_block, #16; \
1704 vdup.u32 dx4, uv_dx4[0]; \
1706 vaddhn.u32 u_whole_high, u_block, dx4; \
1707 vdup.u32 dx4, uv_dx4[1]; \
1709 vaddhn.u32 v_whole_high, v_block, dx4; \
/* Advance u and v by 8 * gradient for the next block. */ \
1710 vdup.u32 dx8, uv_dx8[0]; \
1712 vadd.u32 u_block, u_block, dx8; \
1713 vdup.u32 dx8, uv_dx8[1]; \
1715 vadd.u32 v_block, v_block, dx8; \
/* Narrow to 8 bits, apply the texture mask and the selected swizzle. */ \
1716 vmovn.u16 u_whole_8, u_whole; \
1718 vmovn.u16 v_whole_8, v_whole; \
1721 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1723 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1724 setup_blocks_texture_##swizzling(); \
/* Block emission, interleaved with the interpolation of the next block. */ \
/* block_ptr_b is stepped by 32 without a store because this variant */ \
/* writes no r/g data. */ \
/* NOTE(review): b_whole_8 is not written by any visible instruction in */ \
/* this builder; it appears to be stored only to fill the slot next to */ \
/* fb_mask_ptrs - confirm consumers ignore it. */ \
1729 vshrn.u32 u_whole_low, u_block, #16; \
1731 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1732 vshrn.u32 v_whole_low, v_block, #16; \
1734 add block_ptr_b, block_ptr_b, #32; \
1735 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1737 vdup.u32 dx4, uv_dx4[0]; \
1738 vaddhn.u32 u_whole_high, u_block, dx4; \
1739 vdup.u32 dx4, uv_dx4[1]; \
1741 vaddhn.u32 v_whole_high, v_block, dx4; \
1742 vdup.u32 dx8, uv_dx8[0]; \
1744 vadd.u32 u_block, u_block, dx8; \
1745 vdup.u32 dx8, uv_dx8[1]; \
1747 vadd.u32 v_block, v_block, dx8; \
1748 vmovn.u16 u_whole_8, u_whole; \
/* Step the framebuffer pointer by 16 bytes for the next block. */ \
1750 add fb_ptr, fb_ptr, #16; \
1751 vmovn.u16 v_whole_8, v_whole; \
1753 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1756 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1757 subs span_num_blocks, span_num_blocks, #1; \
1759 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1760 setup_blocks_texture_##swizzling(); \
/* Final block of the span: build draw_mask from right_mask and */ \
/* test_mask, interleave u/v with vzip, clear the lanes selected by */ \
/* draw_mask and record right_mask in fb_mask_ptrs lane 0. */ \
1765 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1767 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1768 vdup.u8 draw_mask, right_mask; \
1770 vmov.u32 fb_mask_ptrs[0], right_mask; \
1771 vtst.u16 draw_mask, draw_mask, test_mask; \
1772 vzip.u8 u_whole_8, v_whole_8; \
1774 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1775 add block_ptr_b, block_ptr_b, #32; \
1776 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1777 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1778 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
/* Advance to the next span record and count the span down. */ \
1781 add span_uvrg_offset, span_uvrg_offset, #16; \
1782 add span_edge_data, span_edge_data, #8; \
1783 subs num_spans, num_spans, #1; \
/* Exit: write the block total back and return by popping pc. */ \
1785 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1788 ldmia sp!, { r4 - r11, pc }; \
/* Flush path: preserve texture_mask, uvrg_dx4 and r0-r3/r12/lr around */ \
/* flush_render_block_buffer, then rebuild uvrg_dx8 = 2 * uvrg_dx4, clear */ \
/* fb_mask_ptrs and restart at the buffer base with this span's count. */ \
/* NOTE(review): the entry to and exit from this path are not visible. */ \
1791 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1792 vpush { texture_mask }; \
1793 vpush { uvrg_dx4 }; \
1795 stmdb sp!, { r0 - r3, r12, r14 }; \
1796 bl flush_render_block_buffer; \
1797 ldmia sp!, { r0 - r3, r12, r14 }; \
1799 vpop { uvrg_dx4 }; \
1800 vpop { texture_mask }; \
1802 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1803 vmov.u8 fb_mask_ptrs, #0; \
1805 mov num_blocks, span_num_blocks; \
1806 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1810 setup_blocks_unshaded_textured_builder(swizzled)
1811 setup_blocks_unshaded_textured_builder(unswizzled)
/* setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Emits render blocks filled with a single colour. The colour is taken from
 * psx_gpu's triangle_color field, reduced to 5 bits per channel and packed as
 * r | (g << 5) | (b << 10), then replicated to every 16-bit lane of `colors`.
 * Each block stores a draw mask, the colour vector and fb_mask_ptrs.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the control flow between the sections below cannot be confirmed from here.
 */
1816 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1817 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
/* draw_mask = 0 for blocks that are not the last one of a span. */
1818 veor.u32 draw_mask, draw_mask, draw_mask
1823 stmdb sp!, { r4 - r11, r14 }
1824 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
/* Take bits 7:3, 15:11 and 23:19 of the colour word as r, g and b. */
1826 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1828 ubfx color_r, color, #3, #5
1829 ubfx color_g, color, #11, #5
1830 ubfx color_b, color, #19, #5
1832 orr color, color_r, color_b, lsl #10
1833 orr color, color, color_g, lsl #5
1835 vdup.u16 colors, color
/* block_ptr_a = block buffer base + num_blocks * 64. */
1837 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1838 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1840 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1841 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
/* Per-span setup: block count, y and left_x from the edge record. */
1844 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1845 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1847 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1849 cmp span_num_blocks, #0
1852 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
/* Running block total including this span, compared against MAX_BLOCKS. */
1853 add num_blocks, span_num_blocks, num_blocks
1855 cmp num_blocks, #MAX_BLOCKS
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2. */
1859 add fb_ptr, fb_ptr, y, lsl #11
1862 add fb_ptr, fb_ptr, left_x, lsl #1
1865 subs span_num_blocks, span_num_blocks, #1
1867 add block_ptr_b, block_ptr_a, #16
1870 vmov.u32 fb_mask_ptrs[1], fb_ptr
/* Block emission: zero draw mask, colour vector, then fb_mask_ptrs. */
/* NOTE(review): b_whole_8 is not written by any visible instruction in this
 * routine; it appears to be stored only to fill the slot next to
 * fb_mask_ptrs - confirm consumers ignore it. */
1874 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1875 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1876 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
/* Step the framebuffer pointer by 16 bytes and block_ptr_b to the next block. */
1878 add fb_ptr, fb_ptr, #16
1879 add block_ptr_b, block_ptr_b, #32
1883 vmov.u32 fb_mask_ptrs[1], fb_ptr
1884 subs span_num_blocks, span_num_blocks, #1
/* Final block of the span: draw_mask_edge has an all-ones 16-bit lane
 * wherever the broadcast right_mask shares a bit with test_mask. */
1889 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1891 vdup.u8 draw_mask_edge, right_mask
1892 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1894 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1895 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1896 add block_ptr_b, block_ptr_b, #32
1897 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
/* Advance to the next 8-byte edge record and count the span down. */
1900 add span_edge_data, span_edge_data, #8
1901 subs num_spans, num_spans, #1
/* Exit: write the block total back and return by popping pc. */
1903 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1906 ldmia sp!, { r4 - r11, pc }
/* Flush path: preserve r0-r3/r12/lr around flush_render_block_buffer, then
 * reload test_mask, re-zero draw_mask and restart at the buffer base with
 * this span's block count.
 * NOTE(review): the entry to and exit from this path are not visible. */
1911 stmdb sp!, { r0 - r3, r12, r14 }
1912 bl flush_render_block_buffer
1913 ldmia sp!, { r0 - r3, r12, r14 }
1917 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1918 veor.u32 draw_mask, draw_mask, draw_mask
1920 mov num_blocks, span_num_blocks
1921 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
/* Aliases for the direct-to-framebuffer routines: mask_msb_scalar is r14
 * (lr, which is saved on entry before being reused); msb_mask is q15 with
 * halves d30/d31; pixels_low is d16. */
1925 #define mask_msb_scalar r14
1927 #define msb_mask q15
1929 #define pixels_low d16
1931 #define msb_mask_low d30
1932 #define msb_mask_high d31
/* setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Single-colour fill that writes pixels straight to the framebuffer rather
 * than to the block buffer. The colour is packed as
 * r | (g << 5) | (b << 10) and ORed with the halfword read from psx_gpu's
 * mask_msb field.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the loop structure around the stores below cannot be confirmed from here.
 */
1937 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1938 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1943 stmdb sp!, { r4 - r11, r14 }
/* Take bits 7:3, 15:11 and 23:19 of the colour word as r, g and b. */
1945 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1947 ubfx color_r, color, #3, #5
1948 ubfx color_g, color, #11, #5
1950 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1951 ubfx color_b, color, #19, #5
1953 orr color, color_r, color_b, lsl #10
1954 orr color, color, color_g, lsl #5
1955 orr color, color, mask_msb_scalar
1957 vdup.u16 colors, color
1959 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
/* Per-span setup: block count, y and left_x from the edge record. */
1962 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1963 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1965 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1967 cmp span_num_blocks, #0
1970 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2. */
1972 add fb_ptr, fb_ptr, y, lsl #11
1973 subs span_num_blocks, span_num_blocks, #1
1975 add fb_ptr, fb_ptr, left_x, lsl #1
/* Full block: store the colour vector and advance fb_ptr by the store size. */
1979 vst1.u32 { colors }, [ fb_ptr ]!
1980 subs span_num_blocks, span_num_blocks, #1
/* Final block of the span: right_mask is read as a byte and inverted. The
 * visible per-pixel step stores one halfword, advances fb_ptr by 2 and
 * shifts the inverted mask right by one with the flags updated. */
1985 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1986 eor right_mask, right_mask, #0xFF
1989 strh color, [ fb_ptr ], #2
1990 movs right_mask, right_mask, lsr #1
/* Advance to the next 8-byte edge record and count the span down. */
1994 add span_edge_data, span_edge_data, #8
1995 subs num_spans, num_spans, #1
/* NOTE(review): num_blocks is not written by any visible instruction in this
 * routine - confirm the value stored here is intended. */
1997 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2000 ldmia sp!, { r4 - r11, pc }
/* Register aliases for the shaded untextured setup routines below.
 *
 *   - gb_whole_8 is q6, i.e. g_whole_8 (d12) + b_whole_8 (d13), so g and b
 *     can be processed with a single 128-bit operation
 *   - dither_offsets is dropped and redefined as q14, with
 *     dither_offsets_low (d28) as its low half
 *   - draw_mask is q15, test_mask q10, block_span q5, d128_0x7 q13
 *
 * NOTE(review): d8-d14 are callee-saved under the ARM procedure call
 * standard; no save/restore of them is visible in the routines of this
 * listing - confirm callers tolerate this.
 */
2007 #define rg_dx_ptr r2
2025 #undef dither_offsets
2045 #define r_whole_low d6
2046 #define r_whole_high d7
2047 #define g_whole_low d8
2048 #define g_whole_high d9
2049 #define b_whole_low d10
2050 #define b_whole_high d11
2052 #define gb_whole_8 q6
2054 #define g_whole_8 d12
2055 #define b_whole_8 d13
2057 #define r_whole_8 d14
2069 #define block_span q5
2077 #define d128_0x7 q13
2081 #define dither_offsets q14
2082 #define draw_mask q15
2084 #define dither_offsets_low d28
2087 #define test_mask q10
/* Dither helpers for the shaded untextured builders, selected by the
 * `dithering` argument via setup_blocks_shaded_untextured_dither_{a,b}_##dithering().
 *
 * dithered:
 *   a: saturating unsigned add of dither_offsets to the 8-bit r, g and b
 *      vectors (r uses the low half, g/b the full 128-bit register)
 *   b: saturating unsigned subtract of the d64_4 / d128_4 constants
 * The builders add d128_4 to dither_offsets before use, so a followed by b
 * applies the raw dither byte with clamping at both ends of the 0..255 range
 * (assuming d64_4 holds the same per-byte value as d128_4 - TODO confirm, it
 * is defined outside this chunk).
 *
 * undithered: both helpers are declared with no instructions of their own.
 */
2090 #define setup_blocks_shaded_untextured_dither_a_dithered() \
2091 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2092 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2094 #define setup_blocks_shaded_untextured_dither_b_dithered() \
2095 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2096 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2098 #define setup_blocks_shaded_untextured_dither_a_undithered() \
2100 #define setup_blocks_shaded_untextured_dither_b_undithered() \
/* setup_blocks_shaded_untextured_indirect_builder(dithering)
 *
 * Expands to the routine
 * setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect. It
 * interpolates r, g and b across each span and, for every 64-byte block,
 * stores a draw mask, the packed 16-bit pixels and the framebuffer pointer
 * (at byte offset 44 of the block).
 *
 * Pixel packing: r is shifted right by 3, the low 3 bits of g and b are
 * cleared, then pixels = r * 1 + g * 4 + b * 128 in 16-bit lanes, which
 * places the 5-bit channels at bits 4:0, 9:5 and 14:10.
 *
 * dithering: `dithered` or `undithered`; selects the dither_a / dither_b
 * helper macros.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the control flow between the sections below cannot be confirmed from here.
 */
2103 #define setup_blocks_shaded_untextured_indirect_builder(dithering) \
/* Entry: load the span count and the r/g gradient pair, which is the */ \
/* second 64 bits of the uvrg_dx vector. */ \
2106 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2107 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2108 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2110 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2112 cmp num_spans, #0; \
/* Save r4-r11 and lr; precompute gradient * 4 and gradient * 8. */ \
2115 stmdb sp!, { r4 - r11, r14 }; \
2116 vshl.u32 rg_dx4, rg_dx, #2; \
2118 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2119 vshl.u32 rg_dx8, rg_dx, #3; \
2121 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2123 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2124 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2126 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
/* block_ptr_a = block buffer base + num_blocks * 64. */ \
2127 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2129 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
/* Per-byte constants: packing multipliers 1 and 128, dither bias 4 and */ \
/* the low-3-bit mask 0x7. */ \
2130 vmov.u8 d64_1, #1; \
2132 vmov.u8 d128_4, #4; \
2133 vmov.u8 d64_128, #128; \
2135 vmov.u8 d128_0x7, #0x7; \
/* Per-span setup: block count, y and left_x from the edge record. */ \
2138 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2139 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2141 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
2142 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
2144 cmp span_num_blocks, #0; \
2147 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
/* Running block total including this span, compared against MAX_BLOCKS. */ \
2148 add num_blocks, span_num_blocks, num_blocks; \
2150 cmp num_blocks, #MAX_BLOCKS; \
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2; b and rg are advanced by */ \
/* left_x gradient steps; dither_row is the 32-bit word at */ \
/* dither_offset_ptr + y * 4. */ \
2154 ldr b, [ span_b_offset ]; \
2155 add fb_ptr, fb_ptr, y, lsl #11; \
2157 vdup.u32 v_left_x, left_x; \
2160 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2161 add fb_ptr, fb_ptr, left_x, lsl #1; \
2163 mla b, b_dx, left_x, b; \
/* dither_shift = (left_x & 3) * 8, i.e. a whole-byte rotation amount. */ \
2164 and dither_shift, left_x, #0x03; \
2166 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
/* Rebuild the 1x gradient from the 4x copy. */ \
2167 vshr.u32 rg_dx, rg_dx4, #2; \
2169 mov dither_shift, dither_shift, lsl #3; \
2170 vmla.u32 rg, rg_dx, v_left_x; \
2173 subs span_num_blocks, span_num_blocks, #1; \
2175 mov dither_row, dither_row, ror dither_shift; \
2176 mov b_dx4, b_dx, lsl #2; \
/* Replicate the rotated dither word to all lanes and add 4 to each byte. */ \
2178 vdup.u32 dither_offsets, dither_row; \
2179 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2181 vdup.u32 b_block, b; \
2182 vadd.u8 dither_offsets, dither_offsets, d128_4; \
/* Broadcast the r/g/b start values and add the per-lane offsets read */ \
/* from the three consecutive 16-byte r/g/b block_span entries. */ \
2184 mov b_dx8, b_dx, lsl #3; \
2185 vdup.u32 r_block, rg[0]; \
2186 vdup.u32 g_block, rg[1]; \
2188 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2190 vadd.u32 r_block, r_block, block_span; \
2191 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2193 vadd.u32 g_block, g_block, block_span; \
2194 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2196 vadd.u32 b_block, b_block, block_span; \
2197 add block_ptr_b, block_ptr_a, #16; \
/* *_whole_low  = bits 31:16 of each lane of the current values. */ \
/* *_whole_high = bits 31:16 of (current values + 4 * gradient). */ \
2199 vshrn.u32 r_whole_low, r_block, #16; \
2200 vshrn.u32 g_whole_low, g_block, #16; \
2201 vshrn.u32 b_whole_low, b_block, #16; \
2202 vdup.u32 dx4, rg_dx4[0]; \
2204 vaddhn.u32 r_whole_high, r_block, dx4; \
2205 vdup.u32 dx4, rg_dx4[1]; \
2207 vaddhn.u32 g_whole_high, g_block, dx4; \
2208 vdup.u32 dx4, b_dx4; \
2210 vaddhn.u32 b_whole_high, b_block, dx4; \
/* Advance r, g and b by 8 * gradient for the next block. */ \
2211 vdup.u32 dx8, rg_dx8[0]; \
2213 vadd.u32 r_block, r_block, dx8; \
2214 vdup.u32 dx8, rg_dx8[1]; \
2216 vadd.u32 g_block, g_block, dx8; \
2217 vdup.u32 dx8, b_dx8; \
2219 vadd.u32 b_block, b_block, dx8; \
/* Narrow the 16-bit values to 8 bits (low byte of each lane). */ \
2221 vmovn.u16 r_whole_8, r_whole; \
2222 vmovn.u16 g_whole_8, g_whole; \
2223 vmovn.u16 b_whole_8, b_whole; \
/* draw_mask = 0 for blocks that are not the last one of a span. */ \
2226 veor.u32 draw_mask, draw_mask, draw_mask; \
/* Block emission: dither and pack the current 8-bit values while the */ \
/* next block's values are interpolated. */ \
2229 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2230 vshrn.u32 r_whole_low, r_block, #16; \
2232 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2233 vshrn.u32 g_whole_low, g_block, #16; \
2235 vshrn.u32 b_whole_low, b_block, #16; \
/* Record the framebuffer pointer at byte offset 44 of the block. */ \
2236 str fb_ptr, [ block_ptr_a, #44 ]; \
2238 vdup.u32 dx4, rg_dx4[0]; \
/* Reduce to 5 significant bits: r >> 3; g and b keep bits 7:3 in place. */ \
2239 vshr.u8 r_whole_8, r_whole_8, #3; \
2240 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2242 vaddhn.u32 r_whole_high, r_block, dx4; \
2243 vdup.u32 dx4, rg_dx4[1]; \
2245 vaddhn.u32 g_whole_high, g_block, dx4; \
2246 vdup.u32 dx4, b_dx4; \
2248 vaddhn.u32 b_whole_high, b_block, dx4; \
2249 vdup.u32 dx8, rg_dx8[0]; \
/* pixels = r * 1 + g * 4 + b * 128, widened to 16-bit lanes. */ \
2251 vmull.u8 pixels, r_whole_8, d64_1; \
2252 vmlal.u8 pixels, g_whole_8, d64_4; \
2253 vmlal.u8 pixels, b_whole_8, d64_128; \
2255 vadd.u32 r_block, r_block, dx8; \
2256 vdup.u32 dx8, rg_dx8[1]; \
2258 vadd.u32 g_block, g_block, dx8; \
2259 vdup.u32 dx8, b_dx8; \
2261 vadd.u32 b_block, b_block, dx8; \
/* Step the framebuffer pointer by 16 bytes for the next block. */ \
2262 add fb_ptr, fb_ptr, #16; \
2264 vmovn.u16 r_whole_8, r_whole; \
2265 vmovn.u16 g_whole_8, g_whole; \
2266 vmovn.u16 b_whole_8, b_whole; \
/* Store mask and pixels; both pointers post-increment by c_64. */ \
2268 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2269 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2273 subs span_num_blocks, span_num_blocks, #1; \
/* Final block of the span: same dither and pack steps, with draw_mask */ \
/* built from right_mask and test_mask (all-ones 16-bit lane wherever a */ \
/* mask bit is set). */ \
2277 str fb_ptr, [ block_ptr_a, #44 ]; \
2278 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2280 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2281 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2283 vshr.u8 r_whole_8, r_whole_8, #3; \
2284 vdup.u8 draw_mask, right_mask; \
2286 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2287 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2289 vtst.u16 draw_mask, draw_mask, test_mask; \
2291 vmull.u8 pixels, r_whole_8, d64_1; \
2292 vmlal.u8 pixels, g_whole_8, d64_4; \
2293 vmlal.u8 pixels, b_whole_8, d64_128; \
2295 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2296 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
/* Advance to the next span record and count the span down. */ \
2299 add span_uvrg_offset, span_uvrg_offset, #16; \
2300 add span_b_offset, span_b_offset, #4; \
2302 add span_edge_data, span_edge_data, #8; \
2303 subs num_spans, num_spans, #1; \
/* Exit: write the block total back and return by popping pc. */ \
2305 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2308 ldmia sp!, { r4 - r11, pc }; \
/* Flush path: only r0-r3/r12/lr are preserved around the call; the NEON */ \
/* constants are rebuilt afterwards and rg_dx8 is recomputed as */ \
/* 2 * rg_dx4. No NEON register is pushed in the visible lines here. */ \
/* NOTE(review): the entry to and exit from this path are not visible. */ \
2311 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2314 stmdb sp!, { r0 - r3, r12, r14 }; \
2315 bl flush_render_block_buffer; \
2316 ldmia sp!, { r0 - r3, r12, r14 }; \
2320 vmov.u8 d64_1, #1; \
2321 vmov.u8 d128_4, #4; \
2322 vmov.u8 d64_128, #128; \
2323 vmov.u8 d128_0x7, #0x7; \
2325 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2327 mov num_blocks, span_num_blocks; \
2328 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2332 setup_blocks_shaded_untextured_indirect_builder(undithered)
2333 setup_blocks_shaded_untextured_indirect_builder(dithered)
/* Aliases for the direct shaded untextured builder: mask_msb_ptr is r14 (lr,
 * saved on entry before being reused), draw_mask becomes q0 and pixels_low is
 * d16. */
2338 #define mask_msb_ptr r14
2340 #define draw_mask q0
2341 #define pixels_low d16
/* setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to the routine
 * setup_blocks_shaded_untextured_<dithering>_unswizzled_direct. It
 * interpolates r, g and b across each span like the indirect builder, but
 * writes the packed 16-bit pixels straight to the framebuffer. The pixel
 * accumulator starts from msb_mask (the mask_msb halfword replicated to all
 * lanes) and r * 1 + g * 4 + b * 128 is added to it.
 *
 * dithering: `dithered` or `undithered`; selects the dither_a / dither_b
 * helper macros.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the loop structure around the stores below cannot be confirmed from here.
 */
2345 #define setup_blocks_shaded_untextured_direct_builder(dithering) \
/* Entry: load the span count and the r/g gradient pair, which is the */ \
/* second 64 bits of the uvrg_dx vector. */ \
2348 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
2349 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2350 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2352 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2354 cmp num_spans, #0; \
/* Save r4-r11 and lr; precompute gradient * 4 and gradient * 8. */ \
2357 stmdb sp!, { r4 - r11, r14 }; \
2358 vshl.u32 rg_dx4, rg_dx, #2; \
2360 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2361 vshl.u32 rg_dx8, rg_dx, #3; \
2363 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2364 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2366 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
/* Per-byte constants: packing multipliers 1 and 128, dither bias 4 and */ \
/* the low-3-bit mask 0x7. */ \
2367 vmov.u8 d64_1, #1; \
2369 vmov.u8 d128_4, #4; \
2370 vmov.u8 d64_128, #128; \
2372 vmov.u8 d128_0x7, #0x7; \
/* Replicate the mask_msb halfword to every 16-bit lane of msb_mask. */ \
2373 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
2374 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
/* Per-span setup: block count, y and left_x from the edge record. */ \
2377 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2378 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2380 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
2381 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
2383 cmp span_num_blocks, #0; \
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2; b and rg are advanced by */ \
/* left_x gradient steps; dither_row is the 32-bit word at */ \
/* dither_offset_ptr + y * 4. */ \
2386 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2387 add fb_ptr, fb_ptr, y, lsl #11; \
2389 ldr b, [ span_b_offset ]; \
2390 vdup.u32 v_left_x, left_x; \
2393 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2394 add fb_ptr, fb_ptr, left_x, lsl #1; \
2396 mla b, b_dx, left_x, b; \
/* dither_shift = (left_x & 3) * 8, i.e. a whole-byte rotation amount. */ \
2397 and dither_shift, left_x, #0x03; \
2399 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
/* Rebuild the 1x gradient from the 4x copy. */ \
2400 vshr.u32 rg_dx, rg_dx4, #2; \
2402 mov dither_shift, dither_shift, lsl #3; \
2403 vmla.u32 rg, rg_dx, v_left_x; \
2405 subs span_num_blocks, span_num_blocks, #1; \
2407 mov dither_row, dither_row, ror dither_shift; \
2408 mov b_dx4, b_dx, lsl #2; \
/* Replicate the rotated dither word to all lanes and add 4 to each byte. */ \
2410 vdup.u32 dither_offsets, dither_row; \
2411 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2413 vdup.u32 b_block, b; \
2414 vadd.u8 dither_offsets, dither_offsets, d128_4; \
/* Broadcast the r/g/b start values and add the per-lane span offsets. */ \
2416 mov b_dx8, b_dx, lsl #3; \
2417 vdup.u32 r_block, rg[0]; \
2418 vdup.u32 g_block, rg[1]; \
2420 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2422 vadd.u32 r_block, r_block, block_span; \
2423 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2425 vadd.u32 g_block, g_block, block_span; \
2426 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2428 vadd.u32 b_block, b_block, block_span; \
/* NOTE(review): block_ptr_a is not written by any visible instruction */ \
/* in this builder and block_ptr_b is not read afterwards - this looks */ \
/* like a leftover from the indirect variant; confirm. */ \
2429 add block_ptr_b, block_ptr_a, #16; \
/* *_whole_low  = bits 31:16 of each lane of the current values. */ \
/* *_whole_high = bits 31:16 of (current values + 4 * gradient). */ \
2431 vshrn.u32 r_whole_low, r_block, #16; \
2432 vshrn.u32 g_whole_low, g_block, #16; \
2433 vshrn.u32 b_whole_low, b_block, #16; \
2434 vdup.u32 dx4, rg_dx4[0]; \
2436 vaddhn.u32 r_whole_high, r_block, dx4; \
2437 vdup.u32 dx4, rg_dx4[1]; \
2439 vaddhn.u32 g_whole_high, g_block, dx4; \
2440 vdup.u32 dx4, b_dx4; \
2442 vaddhn.u32 b_whole_high, b_block, dx4; \
/* Advance r, g and b by 8 * gradient for the next block. */ \
2443 vdup.u32 dx8, rg_dx8[0]; \
2445 vadd.u32 r_block, r_block, dx8; \
2446 vdup.u32 dx8, rg_dx8[1]; \
2448 vadd.u32 g_block, g_block, dx8; \
2449 vdup.u32 dx8, b_dx8; \
2451 vadd.u32 b_block, b_block, dx8; \
/* Narrow the 16-bit values to 8 bits (low byte of each lane). */ \
2453 vmovn.u16 r_whole_8, r_whole; \
2454 vmovn.u16 g_whole_8, g_whole; \
2455 vmovn.u16 b_whole_8, b_whole; \
/* Full block: dither and pack the current values while the next */ \
/* block's values are interpolated. */ \
2460 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2461 vshrn.u32 r_whole_low, r_block, #16; \
2463 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2464 vshrn.u32 g_whole_low, g_block, #16; \
2466 vshrn.u32 b_whole_low, b_block, #16; \
2468 vdup.u32 dx4, rg_dx4[0]; \
/* Reduce to 5 significant bits: r >> 3; g and b keep bits 7:3 in place. */ \
2469 vshr.u8 r_whole_8, r_whole_8, #3; \
2470 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2472 vaddhn.u32 r_whole_high, r_block, dx4; \
2473 vdup.u32 dx4, rg_dx4[1]; \
/* Seed the pixel accumulator with msb_mask. */ \
2475 vmov pixels, msb_mask; \
2476 vaddhn.u32 g_whole_high, g_block, dx4; \
2477 vdup.u32 dx4, b_dx4; \
2479 vaddhn.u32 b_whole_high, b_block, dx4; \
2480 vdup.u32 dx8, rg_dx8[0]; \
/* pixels += r * 1 + g * 4 + b * 128, in 16-bit lanes. */ \
2482 vmlal.u8 pixels, r_whole_8, d64_1; \
2483 vmlal.u8 pixels, g_whole_8, d64_4; \
2484 vmlal.u8 pixels, b_whole_8, d64_128; \
2486 vadd.u32 r_block, r_block, dx8; \
2487 vdup.u32 dx8, rg_dx8[1]; \
2489 vadd.u32 g_block, g_block, dx8; \
2490 vdup.u32 dx8, b_dx8; \
2492 vadd.u32 b_block, b_block, dx8; \
2494 vmovn.u16 r_whole_8, r_whole; \
2495 vmovn.u16 g_whole_8, g_whole; \
2496 vmovn.u16 b_whole_8, b_whole; \
/* Write 16 bytes of pixels to the framebuffer and advance fb_ptr. */ \
2498 vst1.u32 { pixels }, [ fb_ptr ]!; \
2499 subs span_num_blocks, span_num_blocks, #1; \
/* Final block of the span: pack as above; right_mask is read as a byte */ \
/* and inverted. */ \
2503 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2505 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2506 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2508 vshr.u8 r_whole_8, r_whole_8, #3; \
2509 vmov pixels, msb_mask; \
2510 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2511 eor right_mask, right_mask, #0xFF; \
2513 vmlal.u8 pixels, r_whole_8, d64_1; \
2514 vmlal.u8 pixels, g_whole_8, d64_4; \
2515 vmlal.u8 pixels, b_whole_8, d64_128; \
/* Per-pixel step: store the lowest 16-bit lane, advance fb_ptr by 2, */ \
/* rotate pixels down by one lane and shift the inverted mask right by */ \
/* one with the flags updated. */ \
2518 vst1.u16 { pixels_low[0] }, [ fb_ptr ]!; \
2519 vext.16 pixels, pixels, #1; \
2520 movs right_mask, right_mask, lsr #1; \
/* Advance to the next span record and count the span down. */ \
2524 add span_uvrg_offset, span_uvrg_offset, #16; \
2525 add span_b_offset, span_b_offset, #4; \
2527 add span_edge_data, span_edge_data, #8; \
2528 subs num_spans, num_spans, #1; \
/* Exit: restore r4-r11 and return by popping pc. */ \
2532 ldmia sp!, { r4 - r11, pc } \
2534 setup_blocks_shaded_untextured_direct_builder(undithered)
2535 setup_blocks_shaded_untextured_direct_builder(dithered)
/* Register aliases for the texture_blocks_* routines below.
 * Core registers: block_ptr r1, num_blocks r2, current_texture_mask r5,
 * dirty_textures_mask r6, pixels_d r10, texture_ptr r11, clut_ptr r12.
 * NEON: clut_low_a/b are d2/d3, clut_high_a/b are d4/d5, and
 * texels_low/high are d6/d7. */
2544 #define block_ptr r1
2545 #define num_blocks r2
2558 #define texture_ptr r11
2572 #define pixels_d r10
2576 #define clut_ptr r12
2577 #define current_texture_mask r5
2578 #define dirty_textures_mask r6
2582 #define clut_low_a d2
2583 #define clut_low_b d3
2584 #define clut_high_a d4
2585 #define clut_high_b d5
2590 #define texels_low d6
2591 #define texels_high d7
/* texture_blocks_untextured: entry point for the untextured case.
 * NOTE(review): no instructions for this routine are present in this
 * listing, so its behaviour cannot be described from here. */
2595 function(texture_blocks_untextured)
/* texture_blocks_4bpp
 *
 * For each block in the psx_gpu block buffer, reads eight 16-bit offsets
 * from the start of the block, fetches one byte per offset from the texture
 * page, translates the eight bytes through the CLUT held in NEON registers
 * and writes the resulting eight 16-bit texels back over the start of the
 * block.
 *
 * The CLUT is loaded as 32 bytes and de-interleaved with vuzp so that one
 * register holds the even bytes and the other the odd bytes; vtbl then
 * looks up each half separately. A two-register vtbl table covers indices
 * 0..15 and returns 0 for larger indices.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the loop and the route into the cache-update path cannot be confirmed.
 */
2601 function(texture_blocks_4bpp)
2602 stmdb sp!, { r3 - r11, r14 }
2603 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2605 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2606 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
/* Load the 32 CLUT bytes and split them into even and odd bytes. */
2608 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2609 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]
2611 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2612 vuzp.u8 clut_a, clut_b
/* Test whether any dirty 4bpp texture bit overlaps the current texture. */
2614 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
2615 tst dirty_textures_mask, current_texture_mask
/* Per block: four words hold eight 16-bit offsets. uxtah adds the low
 * halfword (or, with ror #16, the high halfword) to texture_ptr. */
2621 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2623 uxtah uv_0, texture_ptr, uv_01
2624 uxtah uv_1, texture_ptr, uv_01, ror #16
2626 uxtah uv_2, texture_ptr, uv_23
2627 uxtah uv_3, texture_ptr, uv_23, ror #16
2629 uxtah uv_4, texture_ptr, uv_45
2630 ldrb pixel_0, [ uv_0 ]
2632 uxtah uv_5, texture_ptr, uv_45, ror #16
2633 ldrb pixel_1, [ uv_1 ]
2635 uxtah uv_6, texture_ptr, uv_67
2636 ldrb pixel_2, [ uv_2 ]
2638 uxtah uv_7, texture_ptr, uv_67, ror #16
2639 ldrb pixel_3, [ uv_3 ]
2641 ldrb pixel_4, [ uv_4 ]
2642 subs num_blocks, num_blocks, #1
/* Pack the eight fetched bytes into two words: pixels 0-3 and 4-7. */
2644 ldrb pixel_5, [ uv_5 ]
2645 orr pixels_a, pixel_0, pixel_1, lsl #8
2647 ldrb pixel_6, [ uv_6 ]
2648 orr pixels_b, pixel_4, pixel_5, lsl #8
2650 ldrb pixel_7, [ uv_7 ]
2651 orr pixels_a, pixels_a, pixel_2, lsl #16
2653 orr pixels_b, pixels_b, pixel_6, lsl #16
2654 orr pixels_a, pixels_a, pixel_3, lsl #24
2656 orr pixels_b, pixels_b, pixel_7, lsl #24
2657 vmov.u32 texels, pixels_a, pixels_b
/* Look up the low and high byte of each CLUT entry, then store them
 * interleaved over the start of the block; block_ptr post-increments by
 * c_64. */
2659 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2660 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2662 vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
2665 ldmia sp!, { r3 - r11, pc }
/* Cache-update path: block_ptr and num_blocks (r1, r2) are preserved around
 * the call.
 * NOTE(review): the CLUT lives in d2-d5, which are caller-saved under the
 * ARM procedure call standard - confirm update_texture_4bpp_cache leaves
 * them intact or that they are reloaded before further use. */
2668 stmdb sp!, { r1 - r2 }
2669 bl update_texture_4bpp_cache
2672 ldmia sp!, { r1 - r2 }
/* texture_blocks_8bpp
 *
 * For each block in the psx_gpu block buffer, reads eight 16-bit offsets
 * from the start of the block, fetches one byte per offset from the texture
 * page, doubles it to form a byte offset and loads the 16-bit CLUT entry at
 * clut_ptr + offset. The eight entries are packed two per word and written
 * back over the start of the block.
 *
 * NOTE(review): this listing contains no labels or branch instructions, so
 * the loop and the route into the cache-update path cannot be confirmed.
 */
2678 function(texture_blocks_8bpp)
2679 stmdb sp!, { r3 - r11, r14 }
2680 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2682 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2683 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2685 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2686 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
/* Test whether any dirty 8bpp texture bit overlaps the current texture. */
2688 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
2689 tst dirty_textures_mask, current_texture_mask
/* Per block: four words hold eight 16-bit offsets. uxtah adds the low
 * halfword (or, with ror #16, the high halfword) to texture_ptr. */
2695 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2697 uxtah uv_0, texture_ptr, uv_01
2698 uxtah uv_1, texture_ptr, uv_01, ror #16
2700 uxtah uv_2, texture_ptr, uv_23
2701 uxtah uv_3, texture_ptr, uv_23, ror #16
2703 uxtah uv_4, texture_ptr, uv_45
2704 ldrb pixel_0, [ uv_0 ]
2706 uxtah uv_5, texture_ptr, uv_45, ror #16
2707 ldrb pixel_1, [ uv_1 ]
2709 uxtah uv_6, texture_ptr, uv_67
2710 ldrb pixel_2, [ uv_2 ]
2712 uxtah uv_7, texture_ptr, uv_67, ror #16
2713 ldrb pixel_3, [ uv_3 ]
/* Double each fetched index (CLUT entries are read as halfwords), then
 * replace it with the CLUT entry it selects. */
2715 ldrb pixel_4, [ uv_4 ]
2716 add pixel_0, pixel_0, pixel_0
2718 ldrb pixel_5, [ uv_5 ]
2719 add pixel_1, pixel_1, pixel_1
2721 ldrb pixel_6, [ uv_6 ]
2722 add pixel_2, pixel_2, pixel_2
2724 ldrb pixel_7, [ uv_7 ]
2725 add pixel_3, pixel_3, pixel_3
2727 ldrh pixel_0, [ clut_ptr, pixel_0 ]
2728 add pixel_4, pixel_4, pixel_4
2730 ldrh pixel_1, [ clut_ptr, pixel_1 ]
2731 add pixel_5, pixel_5, pixel_5
2733 ldrh pixel_2, [ clut_ptr, pixel_2 ]
2734 add pixel_6, pixel_6, pixel_6
2736 ldrh pixel_3, [ clut_ptr, pixel_3 ]
2737 add pixel_7, pixel_7, pixel_7
/* Pack pairs of texels: a = 0|1, c = 2|3, b = 4|5, d = 6|7. */
2739 ldrh pixel_4, [ clut_ptr, pixel_4 ]
2740 orr pixels_a, pixel_0, pixel_1, lsl #16
2742 ldrh pixel_5, [ clut_ptr, pixel_5 ]
2743 orr pixels_c, pixel_2, pixel_3, lsl #16
2745 ldrh pixel_6, [ clut_ptr, pixel_6 ]
2746 subs num_blocks, num_blocks, #1
2748 ldrh pixel_7, [ clut_ptr, pixel_7 ]
2749 orr pixels_b, pixel_4, pixel_5, lsl #16
2751 orr pixels_d, pixel_6, pixel_7, lsl #16
/* NOTE(review): stm writes registers in ascending register-number order
 * regardless of list order, so the a, c, b, d layout in memory relies on
 * the register assignments of pixels_a/c/b (defined outside this chunk)
 * being ordered below pixels_d (r10) accordingly - confirm. */
2752 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
/* Step to the next 64-byte block. */
2754 add block_ptr, block_ptr, #64
2757 ldmia sp!, { r3 - r11, pc }
/* Cache-update path: block_ptr, num_blocks and clut_ptr (r1, r2, r12) are
 * preserved around the call to update_texture_8bpp_cache. */
2760 stmdb sp!, { r1 - r2, r12 }
2762 bl update_texture_8bpp_cache
2764 ldmia sp!, { r1 - r2, r12 }
/*
 * Register aliases in effect for the code that follows;
 * texture_blocks_16bpp uses all three.
 */
2794 #define block_ptr r1
2795 #define num_blocks r2
2839 #define texture_ptr r12
/*
 * texture_blocks_16bpp
 *
 * Walks num_blocks blocks starting at psx_gpu + psx_gpu_blocks_offset. The
 * first 16 bytes of each block hold eight 16-bit coordinates with u in bits
 * 0-7 and v in bits 8-15. Each is turned into the byte offset
 * (u + v * 1024) * 2, a halfword is fetched from texture_ptr at that offset,
 * and the eight halfwords replace the coordinates in the block.
 * Blocks are 64 bytes apart.
 *
 * NOTE(review): the loop label and the conditional branch that consumes the
 * flags from the subs are not present in this listing.
 */
2844 function(texture_blocks_16bpp)
2845 stmdb sp!, { r3 - r11, r14 }
2846 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2848 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2849 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
/* Coordinates 0 and 1; the loop counter is decremented up front. */
2852 ldrh uv_0, [ block_ptr ]
2853 subs num_blocks, num_blocks, #1
2855 ldrh uv_1, [ block_ptr, #2 ]
/* v stays in bits 8-15 here; "lsl #2" below turns it into v * 1024. */
2857 and v_0, uv_0, #0xFF00
2858 and v_1, uv_1, #0xFF00
2860 and u_0, uv_0, #0xFF
2861 and u_1, uv_1, #0xFF
2863 add uv_0, u_0, v_0, lsl #2
2864 ldrh uv_2, [ block_ptr, #4 ]
2866 add uv_1, u_1, v_1, lsl #2
2867 ldrh uv_3, [ block_ptr, #6 ]
/* Double the texel index to get a byte offset (16-bit texels). */
2869 add uv_0, uv_0, uv_0
2870 add uv_1, uv_1, uv_1
/* Coordinates 2 and 3, same computation. */
2872 and v_2, uv_2, #0xFF00
2873 and v_3, uv_3, #0xFF00
2875 and u_2, uv_2, #0xFF
2876 and u_3, uv_3, #0xFF
2878 add uv_2, u_2, v_2, lsl #2
2879 ldrh uv_4, [ block_ptr, #8 ]
2881 add uv_3, u_3, v_3, lsl #2
2882 ldrh uv_5, [ block_ptr, #10 ]
2884 add uv_2, uv_2, uv_2
2885 add uv_3, uv_3, uv_3
/* Coordinates 4 and 5. */
2887 and v_4, uv_4, #0xFF00
2888 and v_5, uv_5, #0xFF00
2890 and u_4, uv_4, #0xFF
2891 and u_5, uv_5, #0xFF
2893 add uv_4, u_4, v_4, lsl #2
2894 ldrh uv_6, [ block_ptr, #12 ]
2896 add uv_5, u_5, v_5, lsl #2
2897 ldrh uv_7, [ block_ptr, #14 ]
/* Texel fetches start here, interleaved with the work for coordinates 6 and 7. */
2899 add uv_4, uv_4, uv_4
2900 ldrh pixel_0, [ texture_ptr, uv_0 ]
2902 add uv_5, uv_5, uv_5
2903 ldrh pixel_1, [ texture_ptr, uv_1 ]
2905 and v_6, uv_6, #0xFF00
2906 ldrh pixel_2, [ texture_ptr, uv_2 ]
2908 and v_7, uv_7, #0xFF00
2909 ldrh pixel_3, [ texture_ptr, uv_3 ]
2911 and u_6, uv_6, #0xFF
2912 ldrh pixel_4, [ texture_ptr, uv_4 ]
2914 and u_7, uv_7, #0xFF
2915 ldrh pixel_5, [ texture_ptr, uv_5 ]
2917 add uv_6, u_6, v_6, lsl #2
2918 add uv_7, u_7, v_7, lsl #2
2920 add uv_6, uv_6, uv_6
2921 add uv_7, uv_7, uv_7
/* Pack pairs of texels into 32-bit words (even texel in the low half). */
2923 orr pixels_a, pixel_0, pixel_1, lsl #16
2924 orr pixels_b, pixel_2, pixel_3, lsl #16
2926 ldrh pixel_6, [ texture_ptr, uv_6 ]
2927 orr pixels_c, pixel_4, pixel_5, lsl #16
2929 ldrh pixel_7, [ texture_ptr, uv_7 ]
2930 orr pixels_d, pixel_6, pixel_7, lsl #16
/*
 * Write the eight texels over the coordinates and step to the next block.
 * STM stores in ascending register-number order; the pixels_a..pixels_d
 * aliases are defined outside this listing.
 */
2932 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2933 add block_ptr, block_ptr, #64
/* Restore r3-r11 and return by popping the saved r14 into pc. */
2937 ldmia sp!, { r3 - r11, pc }
/*
 * Register aliases for the shade_blocks_textured_modulated_* code below.
 *
 * Several names share a register: color_ptr and mask_msb_ptr are both r2, and
 * the d-register names are the halves of the q-register names defined next to
 * them (colors_r/colors_g = q5, colors_b/draw_mask_bits = q6,
 * texels_r/texels_g = q7, pixels_g_low/pixels_b_low = q9,
 * msb_mask_low/msb_mask_high = q13). pixels_r_low (d16) is the low half of
 * fb_pixels (q8).
 */
2952 #undef msb_mask_high
/* Scalar registers. */
2960 #define num_blocks r1
2961 #define color_ptr r2
2962 #define mask_msb_ptr r2
2964 #define block_ptr_load_a r0
2965 #define block_ptr_store r3
2966 #define block_ptr_load_b r12
2971 #define draw_mask_bits_scalar r5
/* 128-bit constants and working vectors. */
2973 #define d128_0x07 q0
2974 #define d128_0x1F q1
2975 #define d128_0x8000 q2
2976 #define test_mask q3
2978 #define colors_rg q5
2979 #define colors_b_dm_bits q6
2980 #define texels_rg q7
2983 #define pixels_b q10
2985 #define zero_mask q4
2986 #define draw_mask q12
2987 #define msb_mask q13
2989 #define fb_pixels q8
2991 #define pixels_gb_low q9
/* 64-bit views. */
2993 #define colors_r d10
2994 #define colors_g d11
2995 #define colors_b d12
2996 #define draw_mask_bits d13
2997 #define texels_r d14
2998 #define texels_g d15
2999 #define pixels_r_low d16
3000 #define pixels_g_low d18
3001 #define pixels_b_low d19
3002 #define msb_mask_low d26
3003 #define msb_mask_high d27
3008 #define texels_b d31
/*
 * Variant helpers for shade_blocks_textured_modulated_builder. The builder
 * pastes its shading / dithering / target arguments onto these names to pick
 * one variant of each step. c_32 and c_48 are post-increment registers set up
 * outside this listing (presumably holding 32 and 48 - TODO confirm).
 */
/* indirect target: results are stored back into the block array. */
3010 #define shade_blocks_textured_modulated_prologue_indirect() \
3012 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
/* direct target: broadcast the 16-bit value at psx_gpu_mask_msb_offset to all lanes of msb_mask. */
3014 #define shade_blocks_textured_modulated_prologue_direct() \
3015 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3016 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \
/* shaded: empty body (colours are loaded per block by load_rg/load_bdm). */
3018 #define shade_blocks_textured_modulated_prologue_shaded() \
/* unshaded: load one 32-bit colour and broadcast its bytes 0, 1, 2 to colors_r, colors_g, colors_b. */
3020 #define shade_blocks_textured_modulated_prologue_unshaded() \
3021 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
3022 vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
3023 vdup.u8 colors_g, colors_r[1]; \
3024 vdup.u8 colors_b, colors_r[2]; \
3025 vdup.u8 colors_r, colors_r[0] \
/* dithered: preload the accumulator from block_ptr_load_b without advancing it. */
3028 #define shade_blocks_textured_modulated_load_dithered(target) \
3029 vld1.u32 { target }, [ block_ptr_load_b, :128 ] \
/* dithered, last channel: same load, then advance block_ptr_load_b by c_32. */
3031 #define shade_blocks_textured_modulated_load_last_dithered(target) \
3032 vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \
/* undithered: empty body. */
3034 #define shade_blocks_textured_modulated_load_undithered(target) \
/* undithered, last channel: only advance block_ptr_load_b by 32 bytes. */
3036 #define shade_blocks_textured_modulated_load_last_undithered(target) \
3037 add block_ptr_load_b, block_ptr_load_b, #32 \
/* dithered: pixels_X += texels_X * colors_X (widening to 16 bits). */
3039 #define shade_blocks_textured_modulate_dithered(channel) \
3040 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
/* undithered: pixels_X = texels_X * colors_X (widening to 16 bits). */
3042 #define shade_blocks_textured_modulate_undithered(channel) \
3043 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
/* indirect: store draw_mask and advance block_ptr_store past it ("!" writeback); offset is unused. */
3046 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
3047 vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \
/*
 * direct: fetch the destination pointer stored at block_ptr_load_b +
 * (offset - 64), read the eight destination pixels, and copy them into pixels
 * wherever draw_mask bits are set (vbit), so masked lanes keep their
 * existing contents when pixels is stored.
 */
3049 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
3050 ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
3051 vld1.u32 { fb_pixels }, [ fb_ptr ]; \
3052 vbit.u16 pixels, fb_pixels, draw_mask \
/* indirect: store pixels into the block array and advance block_ptr_store by c_48. */
3054 #define shade_blocks_textured_modulated_store_pixels_indirect() \
3055 vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \
/* direct: store pixels at fb_ptr. */
3057 #define shade_blocks_textured_modulated_store_pixels_direct() \
3058 vst1.u32 { pixels }, [ fb_ptr ] \
/* shaded: load per-pixel r and g colours, advance block_ptr_load_b by c_32. */
3061 #define shade_blocks_textured_modulated_load_rg_shaded() \
3062 vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \
/* unshaded: nothing to load, only advance block_ptr_load_b by 32 bytes. */
3064 #define shade_blocks_textured_modulated_load_rg_unshaded() \
3065 add block_ptr_load_b, block_ptr_load_b, #32 \
/* shaded: load per-pixel b colours plus the 8 bytes holding the draw mask bits, advance block_ptr_load_a by c_32. */
3067 #define shade_blocks_textured_modulated_load_bdm_shaded() \
3068 vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \
/* unshaded: load only the word at +8 (same location as draw_mask_bits above) and advance by 32 bytes. */
3070 #define shade_blocks_textured_modulated_load_bdm_unshaded() \
3071 ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
3072 add block_ptr_load_a, block_ptr_load_a, #32 \
/* shaded: broadcast 16-bit lane 0 of draw_mask_bits to all lanes of draw_mask. */
3074 #define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3075 vdup.u16 draw_mask, draw_mask_bits[0] \
/* unshaded: broadcast the low 16 bits of the scalar to all lanes of draw_mask. */
3077 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3078 vdup.u16 draw_mask, draw_mask_bits_scalar \
/* indirect: empty body. */
3081 #define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
/* direct: OR msb_mask into every output pixel. */
3083 #define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3084 vorr.u16 pixels, pixels, msb_mask \
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Expands to the function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 * and selects the _<shading>, _<dithering> and _<target> helper variants by
 * token pasting.
 *
 * For each block of eight 16-bit texels the generated code:
 *   - splits each texel into 5-bit fields at bits 0-4 (r), 5-9 (g) and
 *     10-14 (b);
 *   - multiplies each field by the matching 8-bit colour, either as a plain
 *     widening multiply or accumulated onto a preloaded per-channel value
 *     (dithered variant);
 *   - narrows each product with an unsigned-saturating shift right by 4,
 *     keeps the top five bits of each and repacks them at bits 0-4 / 5-9 /
 *     10-14 on top of the texel's original bit 15;
 *   - builds draw_mask: all-ones in lanes where the block's mask bits
 *     intersect test_mask, or where the texel is zero;
 *   - hands draw_mask and pixels to the _<target> store helpers.
 *
 * The code is software pipelined: the first section processes block 0 up to
 * the repack, the middle section finishes block N while starting block N+1,
 * and the last section finishes the final block.
 *
 * NOTE(review): the loop labels and conditional branches after each subs are
 * not present in this listing, and d64_1/d64_4/d64_128, pixels, texels,
 * pixels_r, pixels_g, c_32 and fb_ptr are defined outside it.
 */
3087 #define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3090 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
3091 stmdb sp!, { r4 - r5, lr }; \
3092 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3094 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; /* test mask lives at offset 0 */ \
3096 shade_blocks_textured_modulated_prologue_##target(); \
3097 shade_blocks_textured_modulated_prologue_##shading(); \
3099 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
3102 add block_ptr_load_b, block_ptr_load_a, #16; /* second read pointer, 16 bytes ahead */ \
3103 vmov.u8 d64_1, #1; /* multipliers for the final repack: r*1, g*4, b*128 */ \
3104 vmov.u8 d64_4, #4; \
3105 vmov.u8 d64_128, #128; \
3107 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; /* block 0 texels */ \
3108 vmov.u8 d128_0x07, #0x07; \
3110 shade_blocks_textured_modulated_load_rg_##shading(); \
3111 vmov.u8 d128_0x1F, #0x1F; \
3113 shade_blocks_textured_modulated_load_bdm_##shading(); \
3114 vmov.u16 d128_0x8000, #0x8000; \
3116 vmovn.u16 texels_r, texels; /* low byte: r in bits 0-4 */ \
3117 vshrn.u16 texels_g, texels, #5; /* g in bits 0-4 of the narrowed byte */ \
3119 vshrn.u16 texels_b, texels, #7; /* b in bits 3-7 of the narrowed byte */ \
3120 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3122 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3123 vtst.u16 draw_mask, draw_mask, test_mask; /* lane = all ones where mask bits hit test_mask */ \
3125 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3126 vand.u8 texels_rg, texels_rg, d128_0x1F; /* strip the neighbouring field bits from r and g */ \
3128 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3129 vshr.u8 texels_b, texels_b, #3; /* b down to bits 0-4 */ \
3131 shade_blocks_textured_modulate_##dithering(r); \
3132 shade_blocks_textured_modulate_##dithering(g); \
3133 shade_blocks_textured_modulate_##dithering(b); \
3135 vand.u16 pixels, texels, d128_0x8000; /* start the output from the texel's bit 15 */ \
3136 vceq.u16 zero_mask, texels, #0; /* all-zero texels are never drawn */ \
3138 vqshrun.s16 pixels_r_low, pixels_r, #4; /* saturate each product to 8 bits */ \
3139 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3140 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3142 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3143 vorr.u16 draw_mask, draw_mask, zero_mask; \
3144 vshr.u8 pixels_r_low, pixels_r_low, #3; /* r: top five bits moved to bits 0-4 */ \
3145 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; /* g, b: top five bits kept at bits 3-7 */ \
3147 subs num_blocks, num_blocks, #1; /* branch on these flags is not in this listing */ \
3153 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; /* next block's texels */ \
3154 shade_blocks_textured_modulated_load_rg_##shading(); \
3155 vshrn.u16 texels_g, texels, #5; \
3157 shade_blocks_textured_modulated_load_bdm_##shading(); \
3158 vshrn.u16 texels_b, texels, #7; \
3160 vmovn.u16 texels_r, texels; \
3161 vmlal.u8 pixels, pixels_r_low, d64_1; /* finish previous block: add r at bits 0-4 */ \
3163 vmlal.u8 pixels, pixels_g_low, d64_4; /* g: bits 3-7 times 4 lands at bits 5-9 */ \
3164 vmlal.u8 pixels, pixels_b_low, d64_128; /* b: bits 3-7 times 128 lands at bits 10-14 */ \
3165 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3167 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3168 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3170 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3171 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3173 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3174 vtst.u16 draw_mask, draw_mask, test_mask; \
3176 shade_blocks_textured_modulated_store_pixels_##target(); /* previous block written out */ \
3177 vshr.u8 texels_b, texels_b, #3; \
3179 shade_blocks_textured_modulate_##dithering(r); \
3180 shade_blocks_textured_modulate_##dithering(g); \
3181 shade_blocks_textured_modulate_##dithering(b); \
3183 vand.u16 pixels, texels, d128_0x8000; \
3184 vceq.u16 zero_mask, texels, #0; \
3186 subs num_blocks, num_blocks, #1; /* branch on these flags is not in this listing */ \
3188 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3189 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3190 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3192 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3193 vorr.u16 draw_mask, draw_mask, zero_mask; \
3194 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3195 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3200 vmlal.u8 pixels, pixels_r_low, d64_1; /* final block: repack and store */ \
3201 vmlal.u8 pixels, pixels_g_low, d64_4; \
3202 vmlal.u8 pixels, pixels_b_low, d64_128; \
3204 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3205 shade_blocks_textured_modulated_store_pixels_##target(); \
3207 ldmia sp!, { r4 - r5, pc } \
/*
 * Instantiate all eight shading x dithering x target combinations, e.g. the
 * first line defines shade_blocks_shaded_textured_modulated_dithered_direct.
 */
3210 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3211 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3212 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3213 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3215 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3216 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3217 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3218 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
/*
 * Register aliases for the unmodulated and untextured shade_blocks functions
 * below.
 *
 * Shared registers: color_ptr and draw_mask_store_ptr are both r3,
 * draw_mask_bits_ptr and draw_mask_ptr are both r12, and pixel_store_ptr and
 * fb_ptr_next are both r14 (the link register, so functions using them must
 * save the return address first). draw_mask_low/high are the halves of
 * draw_mask (q2); msb_mask_low/high are d14/d15.
 */
3236 #undef msb_mask_high
/* Scalar registers. */
3239 #define num_blocks r1
3240 #define mask_msb_ptr r2
3241 #define color_ptr r3
3243 #define block_ptr_load r0
3244 #define draw_mask_store_ptr r3
3245 #define draw_mask_bits_ptr r12
3246 #define draw_mask_ptr r12
3247 #define pixel_store_ptr r14
3249 #define fb_ptr_cmp r4
3252 #define fb_ptr_next r14
/* Vector registers. */
3256 #define test_mask q0
3258 #define draw_mask q2
3259 #define zero_mask q3
3260 #define draw_mask_combined q4
3261 #define fb_pixels q5
3262 #define fb_pixels_next q6
3265 #define draw_mask_low d4
3266 #define draw_mask_high d5
3267 #define msb_mask_low d14
3268 #define msb_mask_high d15
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * For each of num_blocks blocks: copies the eight 16-bit texels at block
 * offset 0 to block offset 16 unchanged, and overwrites block offset 0 with
 * a draw mask that is all-ones in lanes where the 16-bit mask bits at block
 * offset 40 intersect test_mask, or where the texel is zero.
 * Pointers advance by c_64 per block (register set up outside this listing;
 * presumably 64 - TODO confirm).
 *
 * NOTE(review): the return address is stored at [sp, #-4] without moving sp,
 * i.e. below the stack pointer, where anything that pushes onto this stack
 * asynchronously could overwrite it. The matching reload/return, the loop
 * label and the conditional branches are not present in this listing, so the
 * save has been left as is - review against the full file.
 */
3271 function(shade_blocks_textured_unmodulated_indirect)
/* r14 is reused as pixel_store_ptr below, so stash the return address. */
3272 str r14, [ sp, #-4 ]
3273 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3275 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3276 add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
/* test_mask is read from offset 0 of the state structure. */
3278 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3279 add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3282 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
/* Block 0: load texels and mask bits, compute both mask terms, copy texels. */
3284 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3285 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3286 [ draw_mask_bits_ptr, :16 ], c_64
3287 vceq.u16 zero_mask, pixels, #0
3289 vtst.u16 draw_mask, draw_mask, test_mask
3290 vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
/* Branch on these flags is not in this listing. */
3292 subs num_blocks, num_blocks, #1
/*
 * Pipelined body: combine the previous block's mask terms before they are
 * overwritten, process the next block, then store the previous block's mask.
 */
3296 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3297 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3299 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3300 [ draw_mask_bits_ptr, :16 ], c_64
3301 vceq.u16 zero_mask, pixels, #0
3303 vtst.u16 draw_mask, draw_mask, test_mask
3304 vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
3306 vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
3307 subs num_blocks, num_blocks, #1
/* Final block: combine and store its draw mask. */
3312 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3313 vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * For each of num_blocks blocks: reads the eight 16-bit texels at block
 * offset 0, ORs msb_mask into them, and writes them to the destination
 * pointer stored at block offset 44. Lanes are left untouched in the
 * destination when the 16-bit mask bits at block offset 40 intersect
 * test_mask, or when the texel is zero (the zero test is made before
 * msb_mask is applied).
 *
 * The loop is pipelined: the next block's destination pixels are normally
 * read before the current block is stored. fb_ptr_cmp measures the distance
 * between the two destinations so that close/overlapping ones can take the
 * store-then-load sequence at the end of this function instead.
 *
 * NOTE(review): the labels, the conditional branches, and the compare that
 * consumes fb_ptr_cmp (the blend builders in this file compare the same
 * quantity against 28) are not present in this listing. c_64 and fb_ptr are
 * defined outside it.
 */
3320 function(shade_blocks_textured_unmodulated_direct)
/* r14 doubles as fb_ptr_next and r4 as fb_ptr_cmp, so save both. */
3321 stmdb sp!, { r4, r14 }
3322 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3324 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3325 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
/* Broadcast the 16-bit mask MSB value to every lane. */
3327 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3330 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3331 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
/* Block 0: mask bits, destination pointer, texels, destination pixels. */
3333 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3334 [ draw_mask_bits_ptr, :16 ], c_64
3335 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3337 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3338 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3339 vceq.u16 zero_mask, pixels, #0
3340 vtst.u16 draw_mask, draw_mask, test_mask
/* Branch on these flags is not in this listing. */
3342 subs num_blocks, num_blocks, #1
/* Loop body: the "next" block becomes the current one. */
3346 mov fb_ptr, fb_ptr_next
3347 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3349 vorr.u16 pixels, pixels, msb_mask
3351 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3352 vmov fb_pixels, fb_pixels_next
3354 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3355 [ draw_mask_bits_ptr, :16 ], c_64
/* vbif: take the new pixel wherever the combined mask bit is clear. */
3356 vbif.u16 fb_pixels, pixels, draw_mask_combined
3358 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
/* Distance between consecutive destinations, biased by 14 bytes. */
3360 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3361 add fb_ptr_cmp, fb_ptr_cmp, #14
/* Usual order: read the next destination, then store the current block. */
3365 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3366 vceq.u16 zero_mask, pixels, #0
3368 vst1.u16 { fb_pixels }, [ fb_ptr ]
3369 vtst.u16 draw_mask, draw_mask, test_mask
3372 subs num_blocks, num_blocks, #1
/*
 * Final block. The loop applies msb_mask at its top, which never runs for
 * the last block loaded, so apply it here as well; otherwise the final block
 * (or the only block when num_blocks is 1) would be written without the mask
 * bit. zero_mask was already computed from the unmodified texels.
 */
3376 vorr.u16 draw_mask_combined, draw_mask, zero_mask
vorr.u16 pixels, pixels, msb_mask
3377 vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3379 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3381 ldmia sp!, { r4, pc }
/*
 * Alternate order for close destinations: store the current block first so
 * that the load of the next destination observes it.
 */
3384 vst1.u16 { fb_pixels }, [ fb_ptr ]
3385 vceq.u16 zero_mask, pixels, #0
3387 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3388 vtst.u16 draw_mask, draw_mask, test_mask
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * NOTE(review): no instructions for this entry point appear in this listing;
 * as shown, it is immediately followed by the entry of
 * shade_blocks_unshaded_untextured_direct. Confirm the intended body against
 * the full file.
 */
3393 function(shade_blocks_unshaded_untextured_indirect)
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * Writes one fixed group of eight 16-bit pixels to every block's
 * destination. The pixels are loaded once from offset 16 of the first block
 * and ORed with msb_mask; for each of num_blocks blocks the destination
 * pointer is read from block offset 44 and the 128-bit draw mask from block
 * offset 0. Lanes whose draw mask bits are set keep the destination's
 * existing contents.
 *
 * The loop is pipelined like shade_blocks_textured_unmodulated_direct:
 * fb_ptr_cmp measures the distance between consecutive destinations, and the
 * two-instruction sequence at the end stores before loading.
 *
 * NOTE(review): the labels, conditional branches and the compare that
 * consumes fb_ptr_cmp are not present in this listing; c_64 and fb_ptr are
 * defined outside it.
 */
3398 function(shade_blocks_unshaded_untextured_direct)
/* r14 doubles as fb_ptr_next and r4 as fb_ptr_cmp, so save both. */
3399 stmdb sp!, { r4, r14 }
3400 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3402 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3403 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3405 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3406 add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
/* Points at the destination-pointer field of block 0. */
3408 add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
/* The pixel data is loaded once and reused for every block. */
3409 vld1.u16 { pixels }, [ color_ptr, :128 ]
3412 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3414 vorr.u16 pixels, pixels, msb_mask
/* Branch on these flags is not in this listing. */
3415 subs num_blocks, num_blocks, #1
/* Fetch the destination pointer and step to the next block's field. */
3417 ldr fb_ptr_next, [ block_ptr_load ], #64
3419 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
/* Loop body: the "next" block becomes the current one. */
3423 vmov fb_pixels, fb_pixels_next
3424 mov fb_ptr, fb_ptr_next
3425 ldr fb_ptr_next, [ block_ptr_load ], #64
/* vbif: take the new pixel wherever the draw mask bit is clear. */
3427 vbif.u16 fb_pixels, pixels, draw_mask
3428 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
/* Distance between consecutive destinations, biased by 14 bytes. */
3430 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3431 add fb_ptr_cmp, fb_ptr_cmp, #14
/* Usual order: read the next destination, then store the current block. */
3435 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3436 vst1.u16 { fb_pixels }, [ fb_ptr ]
3439 subs num_blocks, num_blocks, #1
/* Final block: merge, store, restore r4 and return. */
3443 vbif.u16 fb_pixels_next, pixels, draw_mask
3444 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3446 ldmia sp!, { r4, pc }
/* Alternate order for close destinations: store first, then load. */
3449 vst1.u16 { fb_pixels }, [ fb_ptr ]
3450 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
/*
 * Register aliases for the blend_blocks_* builders below.
 *
 * Many names map to the same register, so only one name per register can be
 * live at a time within a given builder:
 *   q5  pixels_no_msb / pixels_rb
 *   q7  fb_pixels_no_msb / pixels_mg / pixels_fourth
 *   q8  d128_0x8000 / d128_0x7C1F
 *   q9  d128_0x0421 / d128_0x03E0
 *   q10 fb_pixels_next / fb_pixels_rb
 *   q11 blend_pixels_next / fb_pixels_g / fb_pixels_masked
 *   q12 pixels_next / d128_0x1C07
 *   q13 draw_mask_next / d128_0x00E0 / d128_0x80E0
 * msb_mask_low/high are d0/d1. fb_ptr_next is r14 (the link register).
 */
3454 #undef draw_mask_ptr
/* Scalar registers. */
3461 #define num_blocks r1
3462 #define msb_mask_ptr r2
3463 #define pixel_ptr r3
3464 #define draw_mask_ptr r0
3467 #define fb_ptr_next r14
3468 #define fb_ptr_cmp r4
3476 #undef msb_mask_high
3477 #undef draw_mask_next
3480 #undef fb_pixels_next
/* Vector registers. */
3483 #define draw_mask q1
3485 #define fb_pixels q3
3486 #define blend_pixels q4
3487 #define pixels_no_msb q5
3488 #define blend_mask q6
3489 #define fb_pixels_no_msb q7
3490 #define d128_0x8000 q8
3491 #define d128_0x0421 q9
3492 #define fb_pixels_next q10
3493 #define blend_pixels_next q11
3494 #define pixels_next q12
3495 #define draw_mask_next q13
3496 #define write_mask q14
3498 #define pixels_rb q5
3499 #define pixels_mg q7
3501 #define d128_0x7C1F q8
3502 #define d128_0x03E0 q9
3503 #define fb_pixels_rb q10
3504 #define fb_pixels_g q11
3505 #define fb_pixels_masked q11
3506 #define d128_0x83E0 q15
3507 #define pixels_fourth q7
3508 #define d128_0x1C07 q12
3509 #define d128_0x00E0 q13
3510 #define d128_0x80E0 q13
3512 #define msb_mask_low d0
3513 #define msb_mask_high d1
/*
 * Variant helpers for blend_blocks_average_builder, selected by its
 * texturing (textured / untextured) and mask_evaluate (on / off) arguments.
 */
/* textured: blend_mask lane = all ones where bit 15 of the source pixel is set (negative as s16). */
3515 #define blend_blocks_average_set_blend_mask_textured(source) \
3516 vclt.s16 blend_mask, source, #0 \
/* textured: force bit 15 in the blended result. */
3518 #define blend_blocks_average_set_stp_bit_textured() \
3519 vorr.u16 blend_pixels, #0x8000 \
/* textured: where blend_mask is clear, replace the blended value with the unblended source pixel. */
3521 #define blend_blocks_average_combine_textured(source) \
3522 vbif.u16 blend_pixels, source, blend_mask \
/* untextured: the three steps above have empty bodies. */
3524 #define blend_blocks_average_set_blend_mask_untextured(source) \
3526 #define blend_blocks_average_set_stp_bit_untextured() \
3528 #define blend_blocks_average_combine_untextured(source) \
/* mask on: write_mask lane = all ones where bit 15 of the destination pixel is set. */
3530 #define blend_blocks_average_mask_set_on() \
3531 vclt.s16 write_mask, fb_pixels_next, #0 \
/* mask on: current draw mask = next draw mask OR write_mask. */
3533 #define blend_blocks_average_mask_copy_on() \
3534 vorr.u16 draw_mask, draw_mask_next, write_mask \
/* mask on, final block: fold write_mask into draw_mask_next in place. */
3536 #define blend_blocks_average_mask_copy_b_on() \
3537 vorr.u16 draw_mask_next, draw_mask_next, write_mask \
/* mask off: no write_mask is computed (empty body). */
3539 #define blend_blocks_average_mask_set_off() \
/* mask off: plain copy of the draw mask. */
3541 #define blend_blocks_average_mask_copy_off() \
3542 vmov draw_mask, draw_mask_next \
/* mask off, final block: empty body. */
3544 #define blend_blocks_average_mask_copy_b_off() \
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Expands to the function blend_blocks_<texturing>_average_<mask_evaluate>.
 *
 * For each of num_blocks blocks the generated code reads the eight 16-bit
 * source pixels at block offset 16, the 128-bit draw mask at block offset 0
 * and the destination pointer at block offset 44, and computes per lane
 *
 *   blend = ((fb & ~0x8000) + ((src & ~0x8000) - ((src ^ fb) & 0x0421))) >> 1
 *
 * with a halving add. 0x0421 selects bit 0 of each 5-bit field (bits 0, 5,
 * 10), so the subtraction removes the per-field low-bit difference before
 * the halving (presumably so that no bit is carried from one field into its
 * neighbour - TODO confirm). msb_mask is then ORed in and the result is
 * written to the destination wherever the draw mask bit is clear.
 *
 * textured variant: only lanes whose source bit 15 is set are blended (and
 * get bit 15 forced on); other lanes pass the source pixel through.
 * mask_evaluate == on: lanes whose destination bit 15 is set are added to
 * the draw mask and therefore left untouched.
 *
 * The loop is pipelined. fb_ptr_cmp = (next destination - current
 * destination) + 14 is compared against 28; the sequence after the return
 * stores the current block before loading the next destination.
 *
 * Fix: that store-then-load sequence reloads fb_pixels_next but did not
 * recompute write_mask from it, unlike the normal path. With
 * mask_evaluate == on the next block then used the write mask left over from
 * an earlier block. The mask_set step is now repeated after the reload (it
 * expands to nothing when mask_evaluate == off).
 *
 * NOTE(review): the labels and branches are not present in this listing, so
 * the path structure described above is inferred from instruction order;
 * pixels, fb_ptr and c_64 are defined outside it.
 */
3546 #define blend_blocks_average_builder(texturing, mask_evaluate) \
3549 function(blend_blocks_##texturing##_average_##mask_evaluate) \
3550 stmdb sp!, { r4, r14 }; /* r4 = fb_ptr_cmp, r14 = fb_ptr_next */ \
3551 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3552 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3554 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3555 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3557 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3560 vmov.u16 d128_0x8000, #0x8000; \
3561 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3562 ldr fb_ptr_next, [ pixel_ptr, #28 ]; /* destination pointer at block offset 44 */ \
3564 vmov.u16 d128_0x0421, #0x0400; /* 0x0421 built in two steps: 0x0400 | 0x0021 */ \
3565 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3567 vorr.u16 d128_0x0421, #0x0021; \
3568 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3570 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3571 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3572 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3573 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3574 blend_blocks_average_mask_set_##mask_evaluate(); \
3575 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
3577 subs num_blocks, num_blocks, #1; /* branch on these flags is not in this listing */ \
3581 mov fb_ptr, fb_ptr_next; /* loop body: "next" becomes current */ \
3582 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3584 vmov pixels, pixels_next; \
3585 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3587 vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; /* halving add = average */ \
3589 blend_blocks_average_mask_copy_##mask_evaluate(); \
3590 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3592 blend_blocks_average_set_blend_mask_##texturing(pixels); \
3593 blend_blocks_average_set_stp_bit_##texturing(); \
3594 vmov fb_pixels, fb_pixels_next; \
3595 blend_blocks_average_combine_##texturing(pixels); \
3597 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3598 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3599 cmp fb_ptr_cmp, #28; /* unsigned test for next destination within 14 bytes either side */ \
3602 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; /* usual order: load next, then store current */ \
3603 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3605 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3606 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3608 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3609 vbif.u16 fb_pixels, blend_pixels, draw_mask; /* write only where draw mask is clear */ \
3611 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
3612 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3613 blend_blocks_average_mask_set_##mask_evaluate(); \
3614 vst1.u16 { fb_pixels }, [ fb_ptr ]; \
3617 subs num_blocks, num_blocks, #1; \
3621 blend_blocks_average_mask_copy_b_##mask_evaluate(); /* final block */ \
3622 vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
3624 blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
3625 blend_blocks_average_set_stp_bit_##texturing(); \
3626 blend_blocks_average_combine_##texturing(pixels_next); \
3628 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3629 vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
3630 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3632 ldmia sp!, { r4, pc }; \
3635 vorr.u16 blend_pixels, blend_pixels, msb_mask; /* close destinations: store current first */ \
3636 vbif.u16 fb_pixels, blend_pixels, draw_mask; \
3637 vst1.u16 { fb_pixels }, [ fb_ptr ]; \
3639 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
blend_blocks_average_mask_set_##mask_evaluate(); /* refresh write_mask from the reloaded destination */ \
3640 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3641 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3642 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3643 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3644 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
/*
 * Instantiate the four average-blend functions, e.g. the first line defines
 * blend_blocks_textured_average_off.
 */
3648 blend_blocks_average_builder(textured, off)
3649 blend_blocks_average_builder(untextured, off)
3650 blend_blocks_average_builder(textured, on)
3651 blend_blocks_average_builder(untextured, on)
/* Mask-evaluation variants for the blend_blocks_add_* builders. */
/* on: write_mask lane = all ones where bit 15 of the destination pixel is set. */
3654 #define blend_blocks_add_mask_set_on() \
3655 vclt.s16 write_mask, fb_pixels, #0 \
/* on: add those lanes to the draw mask so they are left untouched. */
3657 #define blend_blocks_add_mask_copy_on() \
3658 vorr.u16 draw_mask, draw_mask, write_mask \
/* off: both steps have empty bodies. */
3660 #define blend_blocks_add_mask_set_off() \
3662 #define blend_blocks_add_mask_copy_off() \
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Expands to the function blend_blocks_textured_add_<mask_evaluate>.
 *
 * For each of num_blocks blocks the generated code reads the eight 16-bit
 * source pixels at block offset 16, the 128-bit draw mask at block offset 0
 * and the destination pointer at block offset 44, then adds source and
 * destination per field with clamping:
 *   - the destination is first ANDed with blend_mask (all ones in lanes
 *     whose source bit 15 is set), so other lanes add zero and the source
 *     passes through unchanged;
 *   - the fields selected by 0x7C1F (bits 0-4 and 10-14) are summed in one
 *     vector and clamped per byte with vmin.u8 against 0x7C1F;
 *   - the field selected by 0x03E0 (bits 5-9) is summed together with bit 15
 *     of the source (after msb_mask has been ORed into the source) and
 *     clamped with vmin.u16 against 0x83E0;
 *   - the two sums are ORed together, destination pixels are re-inserted
 *     wherever the draw mask bit is set (vbit), and the result is stored.
 * mask_evaluate == on additionally protects lanes whose destination bit 15
 * is set.
 *
 * The loop is pipelined. fb_ptr_cmp = (next destination - current
 * destination) + 14 is compared against 28; the sequence after the return
 * stores the current block before loading the next destination.
 *
 * NOTE(review): the labels and branches are not present in this listing;
 * pixels, fb_ptr and c_64 are defined outside it.
 */
3665 #define blend_blocks_add_textured_builder(mask_evaluate) \
3668 function(blend_blocks_textured_add_##mask_evaluate) \
3669 stmdb sp!, { r4, r14 }; /* r4 = fb_ptr_cmp, r14 = fb_ptr_next */ \
3670 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3671 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3673 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3674 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3676 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3679 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant is built in two steps */ \
3680 vmov.u16 d128_0x03E0, #0x0300; \
3681 vmov.u16 d128_0x83E0, #0x8000; \
3682 vorr.u16 d128_0x03E0, #0x00E0; \
3683 vorr.u16 d128_0x7C1F, #0x001F; \
3684 vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
3686 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3687 ldr fb_ptr_next, [ pixel_ptr, #28 ]; /* destination pointer at block offset 44 */ \
3688 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3689 vclt.s16 blend_mask, pixels, #0; /* lanes with source bit 15 set */ \
3690 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3691 blend_blocks_add_mask_set_##mask_evaluate(); \
3692 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3694 blend_blocks_add_mask_copy_##mask_evaluate(); \
3695 vorr.u16 pixels, pixels, msb_mask; \
3696 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; /* zero the destination in non-blended lanes */ \
3697 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3698 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3699 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3700 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3701 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3702 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp */ \
3703 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3705 subs num_blocks, num_blocks, #1; /* branch on these flags is not in this listing */ \
3709 mov fb_ptr, fb_ptr_next; /* loop body: "next" becomes current */ \
3711 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3713 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3714 vclt.s16 blend_mask, pixels, #0; \
3716 vorr.u16 pixels, pixels, msb_mask; \
3717 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3718 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3720 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* keep destination where draw mask is set */ \
3721 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3723 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3724 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3725 cmp fb_ptr_cmp, #28; /* unsigned test for next destination within 14 bytes either side */ \
3728 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; /* usual order: load next, then store current */ \
3729 blend_blocks_add_mask_set_##mask_evaluate(); \
3730 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3731 blend_blocks_add_mask_copy_##mask_evaluate(); \
3732 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3733 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3734 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3737 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3738 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3739 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3740 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3741 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3743 subs num_blocks, num_blocks, #1; \
3747 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; /* final block */ \
3748 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3749 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3751 ldmia sp!, { r4, pc }; \
3754 vst1.u16 { blend_pixels }, [ fb_ptr ]; /* close destinations: store current first */ \
3755 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3757 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3758 blend_blocks_add_mask_set_##mask_evaluate(); \
3759 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3760 blend_blocks_add_mask_copy_##mask_evaluate(); \
3761 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Expands to the function blend_blocks_untextured_add_<mask_evaluate>.
 *
 * Same block layout and pipelining as blend_blocks_add_textured_builder, but
 * every lane is blended:
 *   - the fields selected by 0x7C1F (bits 0-4 and 10-14) of source and
 *     destination are summed and clamped per byte with vmin.u8;
 *   - the field selected by 0x03E0 (bits 5-9) is summed and clamped with
 *     vmin.u16 against 0x03E0;
 *   - the sums are ORed together, msb_mask is ORed in, destination pixels
 *     are re-inserted wherever the draw mask bit is set, and the result is
 *     stored.
 * mask_evaluate == on additionally protects lanes whose destination bit 15
 * is set.
 *
 * NOTE(review): the labels and branches are not present in this listing;
 * pixels, pixels_g, fb_ptr and c_64 are defined outside it.
 */
3765 #define blend_blocks_add_untextured_builder(mask_evaluate) \
3768 function(blend_blocks_untextured_add_##mask_evaluate) \
3769 stmdb sp!, { r4, r14 }; /* r4 = fb_ptr_cmp, r14 = fb_ptr_next */ \
3770 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3771 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3773 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3774 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3776 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3779 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant is built in two steps */ \
3780 vmov.u16 d128_0x03E0, #0x0300; \
3781 vorr.u16 d128_0x7C1F, #0x001F; \
3782 vorr.u16 d128_0x03E0, #0x00E0; \
3784 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3785 ldr fb_ptr_next, [ pixel_ptr, #28 ]; /* destination pointer at block offset 44 */ \
3786 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3787 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3788 blend_blocks_add_mask_set_##mask_evaluate(); \
3789 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3791 blend_blocks_add_mask_copy_##mask_evaluate(); \
3792 vand.u16 pixels_g, pixels, d128_0x03E0; \
3793 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3794 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3795 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3796 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3797 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp */ \
3798 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3800 subs num_blocks, num_blocks, #1; /* branch on these flags is not in this listing */ \
3804 mov fb_ptr, fb_ptr_next; /* loop body: "next" becomes current */ \
3806 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3808 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3810 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3811 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3812 vand.u16 pixels_g, pixels, d128_0x03E0; \
3814 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* keep destination where draw mask is set */ \
3815 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3817 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3818 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3819 cmp fb_ptr_cmp, #28; /* unsigned test for next destination within 14 bytes either side */ \
3822 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; /* usual order: load next, then store current */ \
3823 blend_blocks_add_mask_set_##mask_evaluate(); \
3824 blend_blocks_add_mask_copy_##mask_evaluate(); \
3825 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3826 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3827 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3830 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3831 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3832 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3833 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3834 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3836 subs num_blocks, num_blocks, #1; \
3840 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; /* final block */ \
3841 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3842 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3843 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3845 ldmia sp!, { r4, pc }; \
3848 vst1.u16 { blend_pixels }, [ fb_ptr ]; /* close destinations: store current first */ \
3849 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3851 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3852 blend_blocks_add_mask_set_##mask_evaluate(); \
3853 blend_blocks_add_mask_copy_##mask_evaluate(); \
3854 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
/*
 * Instantiate the four additive-blend functions:
 * blend_blocks_textured_add_off/on and blend_blocks_untextured_add_off/on.
 */
3858 blend_blocks_add_textured_builder(off)
3859 blend_blocks_add_textured_builder(on)
3860 blend_blocks_add_untextured_builder(off)
3861 blend_blocks_add_untextured_builder(on)
/*
 * Variant helpers for blend_blocks_subtract_builder, selected by its
 * texturing (textured / untextured) and mask_evaluate (on / off) arguments.
 */
/* textured: blend_mask lane = all ones where bit 15 of the next source pixel is set. */
3863 #define blend_blocks_subtract_set_blend_mask_textured() \
3864 vclt.s16 blend_mask, pixels_next, #0 \
/* textured: where blend_mask is clear, replace the blended value with the unblended pixel. */
3866 #define blend_blocks_subtract_combine_textured() \
3867 vbif.u16 blend_pixels, pixels, blend_mask \
/* textured: force bit 15 in the blended result. */
3869 #define blend_blocks_subtract_set_stb_textured() \
3870 vorr.u16 blend_pixels, #0x8000 \
/* textured: pixels = next source pixels with msb_mask ORed in. */
3872 #define blend_blocks_subtract_msb_mask_textured() \
3873 vorr.u16 pixels, pixels_next, msb_mask \
/* untextured: blend-mask and combine steps have empty bodies. */
3875 #define blend_blocks_subtract_set_blend_mask_untextured() \
3877 #define blend_blocks_subtract_combine_untextured() \
/* untextured: OR msb_mask straight into the blended result. */
3879 #define blend_blocks_subtract_set_stb_untextured() \
3880 vorr.u16 blend_pixels, blend_pixels, msb_mask \
/* untextured: empty body. */
3882 #define blend_blocks_subtract_msb_mask_untextured() \
/* mask on: write_mask lane = all ones where bit 15 of the destination pixel is set. */
3885 #define blend_blocks_subtract_mask_set_on() \
3886 vclt.s16 write_mask, fb_pixels, #0 \
/* mask on: current draw mask = next draw mask OR write_mask. */
3888 #define blend_blocks_subtract_mask_copy_on() \
3889 vorr.u16 draw_mask, draw_mask_next, write_mask \
/* mask off: no write_mask is computed (empty body). */
3891 #define blend_blocks_subtract_mask_set_off() \
/* mask off: plain copy of the draw mask. */
3893 #define blend_blocks_subtract_mask_copy_off() \
3894 vmov draw_mask, draw_mask_next \
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits function blend_blocks_<texturing>_subtract_<mask_evaluate>.
 * The generated code reads num_blocks from psx_gpu and steps two pointers
 * through the block buffer at psx_gpu + psx_gpu_blocks_offset:
 *   draw_mask_ptr - block + 0,  the per-block draw mask
 *   pixel_ptr     - block + 16, the per-block pixels; the framebuffer
 *                   pointer is read from [ pixel_ptr, #28 ]
 * Both are post-incremented by c_64 (presumably 64, the block stride -
 * confirm against its definition).
 * Per block, the framebuffer pixels are split into the 0x7C1F and 0x03E0
 * bit fields, the matching fields of the block pixels are subtracted with
 * saturation, the fields are recombined, and every bit whose draw_mask bit
 * is set is restored from the original framebuffer value before the store.
 * Steps that depend on texturing / mask_evaluate come from the
 * blend_blocks_subtract_* fragment macros.
 */
3897 #define blend_blocks_subtract_builder(texturing, mask_evaluate) \
3900 function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
3901 stmdb sp!, { r4, r14 }; /* save r4 and lr; lr is popped into pc on exit */ \
3902 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3903 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3905 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3906 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; /* replicate one halfword to all lanes */ \
3908 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3911 vmov.u16 d128_0x7C1F, #0x7C00; /* constants are built in two steps: */ \
3912 vmov.u16 d128_0x03E0, #0x0300; /* high byte first, then OR the low byte */ \
3913 vorr.u16 d128_0x7C1F, #0x001F; \
3914 vorr.u16 d128_0x03E0, #0x00E0; \
3916 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; /* prime: first block's mask */ \
3917 ldr fb_ptr_next, [ pixel_ptr, #28 ]; /* first block's framebuffer pointer */ \
3918 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; /* first block's pixels */ \
3919 blend_blocks_subtract_set_blend_mask_##texturing(); \
3920 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3921 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3922 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3924 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3925 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3926 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3927 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; /* per-byte saturating subtract */ \
3928 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; /* per-halfword saturating subtract */ \
3930 subs num_blocks, num_blocks, #1; \
3934 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3935 mov fb_ptr, fb_ptr_next; /* current store target = previously fetched pointer */ \
3936 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3938 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3939 blend_blocks_subtract_msb_mask_##texturing(); \
3941 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3942 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; /* recombine the two fields */ \
3943 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3944 blend_blocks_subtract_set_stb_##texturing(); \
3945 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3946 blend_blocks_subtract_combine_##texturing(); \
3947 blend_blocks_subtract_set_blend_mask_##texturing(); \
3948 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* keep framebuffer bits where draw_mask is set */ \
3950 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; /* distance from current to next store target */ \
3951 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3952 cmp fb_ptr_cmp, #28; /* sets flags on (distance + 14) vs 28 */ \
3955 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; /* next framebuffer data is read before the store below */ \
3956 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3957 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3958 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3959 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3960 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3961 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3964 subs num_blocks, num_blocks, #1; \
3968 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3970 blend_blocks_subtract_msb_mask_##texturing(); \
3971 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3972 blend_blocks_subtract_set_stb_##texturing(); \
3973 blend_blocks_subtract_combine_##texturing(); \
3974 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3975 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; /* final block is stored through fb_ptr_next */ \
3977 ldmia sp!, { r4, pc }; /* restore r4 and return */ \
3980 vst1.u16 { blend_pixels }, [ fb_ptr ]; /* this sequence stores first, then reloads the framebuffer */ \
3981 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3982 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3983 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3984 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3985 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3986 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
/*
 * Instantiate the subtract blender for every combination of
 * texturing (textured / untextured) and mask_evaluate (off / on).
 */
3990 blend_blocks_subtract_builder(textured, off)
3991 blend_blocks_subtract_builder(textured, on)
3992 blend_blocks_subtract_builder(untextured, off)
3993 blend_blocks_subtract_builder(untextured, on)
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits function blend_blocks_textured_add_fourth_<mask_evaluate>.
 * The generated code reads num_blocks from psx_gpu and walks the block
 * buffer at psx_gpu + psx_gpu_blocks_offset (draw mask at block + 0, pixels
 * at block + 16, framebuffer pointer at [ pixel_ptr, #28 ]).
 * Per block:
 *   - blend_mask is all-ones in lanes where the block pixel has bit 15 set
 *   - the block pixel is arithmetically shifted right by 2 and masked to
 *     0x1C07 and 0x80E0
 *   - the framebuffer pixel is ANDed with blend_mask, split into its 0x7C1F
 *     and 0x03E0 fields, and the shifted pixel fields are added
 *   - the sums are limited with vmin (per byte against 0x7C1F, per halfword
 *     against 0x83E0) and ORed back together
 *   - bits whose draw_mask bit is set are restored from the framebuffer
 *     before the store
 * blend_blocks_add_mask_set_<mask_evaluate>() and
 * blend_blocks_add_mask_copy_<mask_evaluate>() must already be defined when
 * this macro is instantiated.
 */
3996 #define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
3999 function(blend_blocks_textured_add_fourth_##mask_evaluate) \
4000 stmdb sp!, { r4, r14 }; /* save r4 and lr; lr is popped into pc on exit */ \
4001 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4002 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4004 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4005 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; /* replicate one halfword to all lanes */ \
4007 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4010 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant: high byte here ... */ \
4011 vmov.u16 d128_0x03E0, #0x0300; \
4012 vmov.u16 d128_0x83E0, #0x8300; \
4013 vmov.u16 d128_0x1C07, #0x1C00; \
4014 vmov.u16 d128_0x80E0, #0x8000; \
4015 vorr.u16 d128_0x7C1F, #0x001F; /* ... low byte ORed in here */ \
4016 vorr.u16 d128_0x03E0, #0x00E0; \
4017 vorr.u16 d128_0x83E0, #0x00E0; \
4018 vorr.u16 d128_0x1C07, #0x0007; \
4019 vorr.u16 d128_0x80E0, #0x00E0; \
4021 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4022 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4023 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4024 vclt.s16 blend_mask, pixels, #0; /* all-ones where pixel bit 15 is set */ \
4025 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4026 blend_blocks_add_mask_set_##mask_evaluate(); \
4027 vshr.s16 pixels_fourth, pixels, #2; /* arithmetic shift: bit 15 is preserved */ \
4029 blend_blocks_add_mask_copy_##mask_evaluate(); \
4030 vorr.u16 pixels, pixels, msb_mask; \
4031 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4032 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; /* framebuffer contributes only where blend_mask is set */ \
4033 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4034 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
4035 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
4036 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4037 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
4038 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte upper limit */ \
4039 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; /* per-halfword upper limit */ \
4041 subs num_blocks, num_blocks, #1; \
4045 mov fb_ptr, fb_ptr_next; /* current store target = previously fetched pointer */ \
4047 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4049 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4050 vclt.s16 blend_mask, pixels, #0; \
4052 vshr.s16 pixels_fourth, pixels, #2; \
4053 vorr.u16 pixels, pixels, msb_mask; \
4054 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; /* recombine the two fields */ \
4055 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4057 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* keep framebuffer bits where draw_mask is set */ \
4058 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4060 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; /* distance from current to next store target */ \
4061 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4062 cmp fb_ptr_cmp, #28; /* sets flags on (distance + 14) vs 28 */ \
4065 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; /* next framebuffer data is read before the store below */ \
4066 blend_blocks_add_mask_set_##mask_evaluate(); \
4067 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
4068 blend_blocks_add_mask_copy_##mask_evaluate(); \
4069 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4070 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
4071 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4074 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
4075 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4076 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
4077 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4078 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
4080 subs num_blocks, num_blocks, #1; \
4084 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4085 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4086 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; /* final block is stored through fb_ptr_next */ \
4088 ldmia sp!, { r4, pc }; /* restore r4 and return */ \
4091 vst1.u16 { blend_pixels }, [ fb_ptr ]; /* this sequence stores first, then reloads the framebuffer */ \
4092 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4094 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4095 blend_blocks_add_mask_set_##mask_evaluate(); \
4096 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
4097 blend_blocks_add_mask_copy_##mask_evaluate(); \
4098 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits function blend_blocks_untextured_add_fourth_<mask_evaluate>.
 * Same block walk as the textured variant (draw mask at block + 0, pixels at
 * block + 16, framebuffer pointer at [ pixel_ptr, #28 ]), but without a
 * per-pixel blend mask: every framebuffer pixel is split into its 0x7C1F
 * and 0x03E0 fields, the block pixel shifted right by 2 (masked to 0x1C07
 * and 0x00E0) is added, the sums are limited with vmin, recombined, ORed
 * with msb_mask, and bits whose draw_mask bit is set are restored from the
 * framebuffer before the store.
 * blend_blocks_add_mask_set_<mask_evaluate>() and
 * blend_blocks_add_mask_copy_<mask_evaluate>() must already be defined when
 * this macro is instantiated.
 */
4102 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4105 function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4106 stmdb sp!, { r4, r14 }; /* save r4 and lr; lr is popped into pc on exit */ \
4107 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4108 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4110 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4111 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; /* replicate one halfword to all lanes */ \
4113 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4116 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant: high byte here ... */ \
4117 vmov.u16 d128_0x03E0, #0x0300; \
4118 vmov.u16 d128_0x83E0, #0x8300; \
4119 vmov.u16 d128_0x1C07, #0x1C00; \
4120 vmov.u16 d128_0x00E0, #0x00E0; /* single-byte constant, no second step needed */ \
4121 vorr.u16 d128_0x7C1F, #0x001F; /* ... low byte ORed in here */ \
4122 vorr.u16 d128_0x03E0, #0x00E0; \
4123 vorr.u16 d128_0x83E0, #0x00E0; \
4124 vorr.u16 d128_0x1C07, #0x0007; \
4126 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4127 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4128 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4129 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4130 blend_blocks_add_mask_set_##mask_evaluate(); \
4131 vshr.s16 pixels_fourth, pixels, #2; \
4132 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4134 blend_blocks_add_mask_copy_##mask_evaluate(); \
4135 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4136 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4137 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4138 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4139 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4140 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte upper limit */ \
4141 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; /* per-halfword upper limit */ \
4143 subs num_blocks, num_blocks, #1; \
4147 mov fb_ptr, fb_ptr_next; /* current store target = previously fetched pointer */ \
4149 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4151 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4153 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; /* recombine the two fields */ \
4154 vshr.s16 pixels_fourth, pixels, #2; \
4155 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4156 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4158 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* keep framebuffer bits where draw_mask is set */ \
4159 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4161 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; /* distance from current to next store target */ \
4162 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4163 cmp fb_ptr_cmp, #28; /* sets flags on (distance + 14) vs 28 */ \
4166 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; /* next framebuffer data is read before the store below */ \
4167 blend_blocks_add_mask_set_##mask_evaluate(); \
4168 blend_blocks_add_mask_copy_##mask_evaluate(); \
4169 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4170 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4171 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4174 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4175 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4176 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4177 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4178 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4180 subs num_blocks, num_blocks, #1; \
4184 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4185 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4186 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4187 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; /* final block is stored through fb_ptr_next */ \
4189 ldmia sp!, { r4, pc }; /* restore r4 and return */ \
4192 vst1.u16 { blend_pixels }, [ fb_ptr ]; /* this sequence stores first, then reloads the framebuffer */ \
4193 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4195 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4196 blend_blocks_add_mask_set_##mask_evaluate(); \
4197 blend_blocks_add_mask_copy_##mask_evaluate(); \
4198 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
/*
 * Instantiate the add-fourth blenders: textured and untextured, each with
 * mask_evaluate off and on.
 */
4202 blend_blocks_add_fourth_textured_builder(off)
4203 blend_blocks_add_fourth_textured_builder(on)
4204 blend_blocks_add_fourth_untextured_builder(off)
4205 blend_blocks_add_fourth_untextured_builder(on)
4207 // TODO: Optimize this more. Need a scene that actually uses it for
// blend_blocks_textured_unblended_on
// Reads num_blocks from psx_gpu and walks the block buffer at
// psx_gpu + psx_gpu_blocks_offset (draw mask at block + 0, pixels at
// block + 16, framebuffer pointer at [ pixel_ptr, #28 ]).
// For each block the framebuffer pixels are loaded, write_mask is set to
// all-ones in lanes whose framebuffer pixel has bit 15 set, and the block's
// pixels are written only where neither draw_mask nor write_mask is set.
4212 function(blend_blocks_textured_unblended_on)
4213 stmdb sp!, { r4, r14 }  // save r4 and lr; lr is popped into pc on exit
4214 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4215 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4217 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
4218 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]  // replicate one halfword to all lanes
4220 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4223 ldr fb_ptr, [ pixel_ptr, #28 ]  // framebuffer pointer of the first block
4224 vld1.u16 { fb_pixels }, [ fb_ptr ]
4225 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
4226 vclt.s16 write_mask, fb_pixels, #0  // all-ones where fb pixel bit 15 is set
4227 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4229 subs num_blocks, num_blocks, #1
4233 vorr.u16 draw_mask, draw_mask, write_mask  // combine both masks
4234 vbif.u16 fb_pixels, pixels, draw_mask  // take block pixel bits where the mask is clear
4235 vst1.u16 { fb_pixels }, [ fb_ptr ]
4237 ldr fb_ptr, [ pixel_ptr, #28 ]  // fetch the next block's inputs
4238 vld1.u16 { fb_pixels }, [ fb_ptr ]
4239 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
4240 vclt.s16 write_mask, fb_pixels, #0
4241 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4243 subs num_blocks, num_blocks, #1
4247 vorr.u16 draw_mask, draw_mask, write_mask
4248 vbif.u16 fb_pixels, pixels, draw_mask
4249 vst1.u16 { fb_pixels }, [ fb_ptr ]
4251 ldmia sp!, { r4, pc }  // restore r4 and return
4254 function(blend_blocks_textured_unblended_off)
4264 vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
// Stack parameter offsets (relative to sp on entry) and scratch register
// names used by render_block_fill_body.
4284 #define parameter_width_offset 0
4285 #define parameter_height_offset 4
4291 #define left_unaligned r14
4292 #define right_unaligned r4
4294 #define num_unaligned r2
4295 #define num_width r6
// render_block_fill_body
// Fills a rectangle with a single 16-bit color.
// Inputs visible here: psx_gpu, x, y, color in registers; width at [ sp ]
// and height at [ sp, #4 ] on entry (both are read before the register save).
// The start address is vram_ptr + y * 2048 + x * 2. The color is repacked
// from bits 3-7, 11-15 and 19-23 of `color` into a 5:5:5 halfword. Each row
// is split into a leading part up to the next multiple-of-8 x position
// (halfword stores), 8-pixel groups (128-bit aligned vector stores) and a
// trailing remainder (halfword stores).
4303 function(render_block_fill_body)
4304 ldr vram_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
4305 ldr height, [ sp, #parameter_height_offset ]
4307 add vram_ptr, vram_ptr, y, lsl #11  // + y * 2048 bytes
4308 ldr width, [ sp, #parameter_width_offset ]
4310 add vram_ptr, vram_ptr, x, lsl #1  // + x * 2 bytes
4311 stmdb sp!, { r4 - r6, r14 }
4313 ubfx color_r, color, #3, #5  // bits 3-7
4314 ubfx color_g, color, #11, #5  // bits 11-15
4316 ubfx color_b, color, #19, #5  // bits 19-23
4317 orr color, color_r, color_g, lsl #5
4319 orr color, color, color_b, lsl #10  // color = r | g << 5 | b << 10
4320 add left_unaligned, x, #0x7
4322 bic left_unaligned, left_unaligned, #0x7  // x rounded up to a multiple of 8
4323 vdup.u16 colors, color  // replicate the color to every lane
4325 sub left_unaligned, left_unaligned, x  // pixels before the first aligned group
4328 sub pitch, pitch, width, lsl #1  // row advance minus the bytes written per row
4329 sub width, width, left_unaligned
4331 and right_unaligned, width, #0x7  // pixels after the last full group
4332 bic width, width, #0x7
4335 mov num_width, width, lsr #3  // number of 8-pixel groups
4337 movs num_unaligned, left_unaligned  // sets Z when there is no leading part
4341 strh color, [ vram_ptr ], #2
4343 subs num_unaligned, num_unaligned, #1
4347 vst1.u32 { colors }, [ vram_ptr, :128 ]!
4348 subs num_width, num_width, #1
4351 movs num_unaligned, right_unaligned  // sets Z when there is no trailing part
4355 strh color, [ vram_ptr ], #2
4357 subs num_unaligned, num_unaligned, #1
4361 add vram_ptr, vram_ptr, pitch  // move to the start of the next row
4362 subs height, height, #1
4365 ldmia sp!, { r4 - r6, pc }  // restore registers and return
// Register names for the tiled sprite setup code that follows.
// Several names intentionally map to the same register (for example r10 is
// offset_u_right, texture_offset and texture_mask_rev; r11 is width_rounded,
// tiles_remaining and control_mask), so names sharing a register must not be
// live at the same time.
4376 #undef dirty_textures_mask
4378 #undef current_texture_mask
4389 #define offset_u_right r10
4390 #define width_rounded r11
4391 #define height_rounded r12
4393 #define texture_offset_base r1
4394 #define tile_width r2
4395 #define tile_height r3
4396 #define num_blocks r4
4398 #define sub_tile_height r6
4400 #define texture_mask r8
4401 #define column_data r9
4402 #define texture_offset r10
4403 #define tiles_remaining r11
4404 #define fb_ptr_advance_column r12
4405 #define texture_block_ptr r14
4407 #define texture_page_ptr r3
4408 #define left_block_mask r4
4409 #define right_block_mask r5
4410 #define texture_mask_rev r10
4411 #define control_mask r11
4413 #define dirty_textures_mask r4
4415 #define current_texture_mask r6
4429 #define draw_masks_fb_ptrs q1
4431 #define draw_mask_fb_ptr_left d2
4432 #define draw_mask_fb_ptr_right d3
4434 #define clut_low_a d4
4435 #define clut_low_b d5
4436 #define clut_high_a d6
4437 #define clut_high_b d7
4439 #define block_masks d8
4440 #define block_masks_shifted d9
4445 #define texels_low d10
4446 #define texels_high d11
// Helper called (blgt) from setup_sprite_tile_add_blocks(single).
// Calls flush_render_block_buffer with r0-r3, r12 and lr preserved, then
// resets `block` to the start of the block buffer and sets num_blocks to
// sub_tile_height.
4449 setup_sprite_flush_blocks_single:
4452 stmdb sp!, { r0 - r3, r12, r14 }
4453 bl flush_render_block_buffer
4454 ldmia sp!, { r0 - r3, r12, r14 }
4458 add block, psx_gpu, #psx_gpu_blocks_offset
4460 mov num_blocks, sub_tile_height
// Helper called (blgt) from setup_sprite_tile_add_blocks(double).
// Calls flush_render_block_buffer with r0-r3, r12 and lr preserved, then
// resets `block` to the start of the block buffer and sets num_blocks to
// sub_tile_height * 2.
4464 setup_sprite_flush_blocks_double:
4467 stmdb sp!, { r0 - r3, r12, r14 }
4468 bl flush_render_block_buffer
4469 ldmia sp!, { r0 - r3, r12, r14 }
4473 add block, psx_gpu, #psx_gpu_blocks_offset
4475 mov num_blocks, sub_tile_height, lsl #1
// Wrapper around update_texture_4bpp_cache that preserves r0-r3 and
// returns to the caller by popping the saved lr into pc.
4479 setup_sprite_update_texture_4bpp_cache:
4480 stmdb sp!, { r0 - r3, r14 }
4481 bl update_texture_4bpp_cache
4482 ldmia sp!, { r0 - r3, pc }
// Wrapper around update_texture_8bpp_cache that preserves r0-r3 and
// returns to the caller by popping the saved lr into pc.
4485 setup_sprite_update_texture_8bpp_cache:
4486 stmdb sp!, { r0 - r3, r14 }
4487 bl update_texture_8bpp_cache
4488 ldmia sp!, { r0 - r3, pc }
/*
 * setup_sprite_tiled_initialize_4bpp()
 * Loads the CLUT vectors from clut_ptr and de-interleaves them with vuzp.u8,
 * then calls setup_sprite_update_texture_4bpp_cache when
 * current_texture_mask has any bit in common with the 4bpp dirty mask.
 */
4491 #define setup_sprite_tiled_initialize_4bpp() \
4492 ldr dirty_textures_mask, \
4493 [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
4494 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4496 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4497 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4499 tst current_texture_mask, dirty_textures_mask; /* Z clear when the masks overlap */ \
4500 vuzp.u8 clut_a, clut_b; /* de-interleave bytes; NEON op, flags from tst stay valid */ \
4502 blne setup_sprite_update_texture_4bpp_cache \
/*
 * setup_sprite_tiled_initialize_8bpp()
 * Calls setup_sprite_update_texture_8bpp_cache when current_texture_mask
 * has any bit in common with the 8bpp dirty mask.
 */
4504 #define setup_sprite_tiled_initialize_8bpp() \
4505 ldr dirty_textures_mask, \
4506 [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
4507 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4509 tst current_texture_mask, dirty_textures_mask; /* Z clear when the masks overlap */ \
4510 blne setup_sprite_update_texture_8bpp_cache \
/*
 * Block accounting helpers for the tile macros.
 * setup_sprite_block_count_<type>() supplies the final operand of the add in
 * setup_sprite_tile_add_blocks(type); after the add, num_blocks is compared
 * with MAX_BLOCKS and setup_sprite_flush_blocks_<type> is called when it is
 * greater (signed).
 */
4513 #define setup_sprite_tile_setup_block_no(side, offset, texture_mode) \
4515 #define setup_sprite_block_count_single() \
4518 #define setup_sprite_block_count_double() \
4519 sub_tile_height, lsl #1 \
4521 #define setup_sprite_tile_add_blocks(type) \
4522 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4523 cmp num_blocks, #MAX_BLOCKS; \
4525 blgt setup_sprite_flush_blocks_##type \
/*
 * setup_sprite_tile_full_4bpp(edge)
 * Emits two blocks (left and right half) per tile row.
 * For each half: 8 bytes are loaded from
 * texture_page_ptr + (texture_offset & texture_mask), looked up through the
 * low and high CLUT byte tables with vtbl, and stored interleaved (vst2.u8)
 * at block + 0; the 8-byte draw mask / fb_ptr pair is stored at block + 40;
 * block then advances to the next 64-byte block.
 * The right half uses texture_offset + 8 and fb_ptr + 16.
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 in total.
 * Afterwards texture_offset is advanced by 0xF00 and num_blocks is written
 * back to psx_gpu. The edge argument is not referenced.
 */
4528 #define setup_sprite_tile_full_4bpp(edge) \
4529 setup_sprite_tile_add_blocks(double); \
4532 and texture_block_ptr, texture_offset, texture_mask; \
4533 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; /* lane 1 carries the framebuffer pointer */ \
4536 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4537 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4539 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4540 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4542 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; /* interleave low/high bytes into 16-bit pixels */ \
4543 add texture_block_ptr, texture_offset, #8; /* address setup for the right half */ \
4545 and texture_block_ptr, texture_block_ptr, texture_mask; \
4546 add block, block, #40; \
4548 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4549 add fb_ptr, fb_ptr, #16; \
4551 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; /* stored at block + 40 */ \
4552 add block, block, #24; /* 40 + 24 = 64: next block */ \
4554 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4555 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4558 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4559 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4561 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4562 add block, block, #40; \
4564 add texture_offset, texture_offset, #0x10; \
4565 add fb_ptr, fb_ptr, #(2048 - 16); /* together with the +16 above: +2048 per row */ \
4567 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4568 add block, block, #24; \
4570 subs sub_tile_height, sub_tile_height, #1; \
4573 add texture_offset, texture_offset, #0xF00; \
4574 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
/*
 * setup_sprite_tile_half_4bpp(edge)
 * Emits one block per tile row using draw_mask_fb_ptr_<edge>.
 * 8 bytes are loaded from texture_page_ptr + (texture_offset & texture_mask),
 * looked up through the low and high CLUT byte tables with vtbl, and stored
 * interleaved at block + 0; the draw mask / fb_ptr pair is stored at
 * block + 40. Per row texture_offset advances by 0x10 and fb_ptr by 2048.
 * Afterwards texture_offset is advanced by 0xF00 and num_blocks is written
 * back to psx_gpu.
 */
4577 #define setup_sprite_tile_half_4bpp(edge) \
4578 setup_sprite_tile_add_blocks(single); \
4581 and texture_block_ptr, texture_offset, texture_mask; \
4582 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; /* lane 1 carries the framebuffer pointer */ \
4585 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4586 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4588 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4589 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4591 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4592 add block, block, #40; \
4594 add texture_block_ptr, texture_page_ptr, texture_block_ptr; /* NOTE(review): result is not read by any later line of this macro - confirm whether it is needed */ \
4595 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; /* stored at block + 40 */ \
4597 add block, block, #24; /* 40 + 24 = 64: next block */ \
4598 add texture_offset, texture_offset, #0x10; \
4600 add fb_ptr, fb_ptr, #2048; \
4601 subs sub_tile_height, sub_tile_height, #1; \
4605 add texture_offset, texture_offset, #0xF00; \
4606 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
/*
 * setup_sprite_tile_full_8bpp(edge)
 * Emits two blocks (left and right half) per tile row.
 * `block` is biased by +16 for the duration of the macro, so for each half
 * the 8 texel bytes loaded from
 * texture_page_ptr + (texture_offset & texture_mask) are stored at
 * block + 16 and the draw mask / fb_ptr pair at block + 40 (16 + 24); the
 * next +40 reaches the following block's + 16. No CLUT lookup is done here.
 * The right half uses texture_offset + 8 and fb_ptr + 16.
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 in total.
 * Afterwards the +16 bias is removed, texture_offset is advanced by 0xF00
 * and num_blocks is written back to psx_gpu. The edge argument is not
 * referenced.
 */
4609 #define setup_sprite_tile_full_8bpp(edge) \
4610 setup_sprite_tile_add_blocks(double); \
4611 add block, block, #16; /* bias block to the texel slot */ \
4614 and texture_block_ptr, texture_offset, texture_mask; \
4615 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; /* lane 1 carries the framebuffer pointer */ \
4618 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4619 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4621 add texture_block_ptr, texture_offset, #8; /* address setup for the right half */ \
4622 vst1.u32 { texels }, [ block, :64 ]; \
4624 and texture_block_ptr, texture_block_ptr, texture_mask; \
4625 add block, block, #24; \
4627 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4629 add fb_ptr, fb_ptr, #16; \
4630 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4632 add block, block, #40; \
4633 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4636 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4637 vst1.u32 { texels }, [ block, :64 ]; \
4638 add block, block, #24; \
4640 add texture_offset, texture_offset, #0x10; \
4641 add fb_ptr, fb_ptr, #(2048 - 16); /* together with the +16 above: +2048 per row */ \
4643 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4644 add block, block, #40; \
4646 subs sub_tile_height, sub_tile_height, #1; \
4649 sub block, block, #16; /* remove the bias */ \
4650 add texture_offset, texture_offset, #0xF00; \
4651 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
/*
 * setup_sprite_tile_half_8bpp(edge)
 * Emits one block per tile row using draw_mask_fb_ptr_<edge>.
 * `block` is biased by +16 for the duration of the macro: the 8 texel bytes
 * loaded from texture_page_ptr + (texture_offset & texture_mask) are stored
 * at block + 16 and the draw mask / fb_ptr pair at block + 40. No CLUT
 * lookup is done here. Per row texture_offset advances by 0x10 and fb_ptr
 * by 2048. Afterwards the bias is removed, texture_offset is advanced by
 * 0xF00 and num_blocks is written back to psx_gpu.
 */
4654 #define setup_sprite_tile_half_8bpp(edge) \
4655 setup_sprite_tile_add_blocks(single); \
4656 add block, block, #16; /* bias block to the texel slot */ \
4659 and texture_block_ptr, texture_offset, texture_mask; \
4660 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; /* lane 1 carries the framebuffer pointer */ \
4663 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4664 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4666 vst1.u32 { texels }, [ block, :64 ]; \
4667 add block, block, #24; \
4669 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4670 add block, block, #40; \
4672 add texture_offset, texture_offset, #0x10; \
4673 add fb_ptr, fb_ptr, #2048; \
4675 subs sub_tile_height, sub_tile_height, #1; \
4678 sub block, block, #16; /* remove the bias */ \
4679 add texture_offset, texture_offset, #0xF00; \
4680 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
/*
 * Column entry / exit adjustments, selected by edge_mode (half / full) and,
 * for half columns, by edge (left / right).
 * pre_adjust sets texture_offset from texture_offset_base; for a right half
 * column it additionally skips 8 texture bytes and 16 framebuffer bytes.
 * post_adjust undoes the framebuffer skip for a right half column; the
 * other post_adjust variants show no instruction here.
 */
4683 #define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4684 add texture_offset, texture_offset_base, #8; \
4685 add fb_ptr, fb_ptr, #16 \
4687 #define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4688 mov texture_offset, texture_offset_base \
4690 #define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4691 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4693 #define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4694 mov texture_offset, texture_offset_base \
4696 #define setup_sprite_tile_column_edge_post_adjust_half_right() \
4697 sub fb_ptr, fb_ptr, #16 \
4699 #define setup_sprite_tile_column_edge_post_adjust_half_left() \
4701 #define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4702 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4704 #define setup_sprite_tile_column_edge_post_adjust_full(edge) \
/*
 * setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)
 * Emits one tile column that is a single tile high: sub_tile_height is
 * taken directly from column_data, and the tile macro
 * setup_sprite_tile_<edge_mode>_<texture_mode> runs once between the
 * pre- and post-adjust steps.
 */
4707 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode) \
4708 mov sub_tile_height, column_data; \
4709 setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \
4710 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4711 setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \
/*
 * setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)
 * Emits one tile column spanning several tiles. column_data is unpacked as:
 *   bits 0-7   rows of the first tile
 *   bits 8-15  rows of the last tile
 *   bits 16+   tile counter (tiles_remaining)
 * The tile macro is expanded three times: for the first tile, for a
 * 16-row tile, and for the last tile.
 */
4713 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode) \
4714 and sub_tile_height, column_data, #0xFF; /* rows in the first tile */ \
4715 mov tiles_remaining, column_data, lsr #16; \
4716 setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \
4717 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4719 subs tiles_remaining, tiles_remaining, #1; \
4723 mov sub_tile_height, #16; /* full-height tile */ \
4724 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4725 subs tiles_remaining, tiles_remaining, #1; \
4729 uxtb sub_tile_height, column_data, ror #8; /* rows in the last tile (bits 8-15) */ \
4730 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4731 setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \
/*
 * Build column_data for the column_height macros and load texture_page_ptr.
 * single: column_data = height.
 * multi:  column_data = (16 - offset_v)
 *                     | (((height_rounded & 0xF) + 1) << 8)
 *                     | ((tile_height - 1) << 16)
 *         height_rounded and tile_height are modified in the process.
 */
4734 #define setup_sprite_column_data_single() \
4735 mov column_data, height; \
4736 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \
4738 #define setup_sprite_column_data_multi() \
4739 and height_rounded, height_rounded, #0xF; \
4740 rsb column_data, offset_v, #16; /* column_data = 16 - offset_v */ \
4742 add height_rounded, height_rounded, #1; \
4743 sub tile_height, tile_height, #1; \
4745 orr column_data, column_data, tile_height, lsl #16; \
4746 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
4748 orr column_data, column_data, height_rounded, lsl #8 \
/*
 * setup_sprite_tile_column_width_single(...)
 * Defines the entry label
 * setup_sprite_<texture_mode>_single_<multi_height>_<edge_mode>_<edge>
 * for sprites that are one tile column wide. The two 32-bit halves of
 * block_masks are ORed together, bytes 0 and 1 of the result are replicated
 * into the left and right draw mask vectors, one column is emitted, and the
 * routine returns by restoring r4-r11 and pc.
 */
4750 #define setup_sprite_tile_column_width_single(texture_mode, multi_height, \
4752 setup_sprite_##texture_mode##_single_##multi_height##_##edge_mode##_##edge: \
4753 setup_sprite_column_data_##multi_height(); \
4754 vext.32 block_masks_shifted, block_masks, block_masks, #1; /* swap the two 32-bit halves */ \
4755 vorr.u32 block_masks, block_masks, block_masks_shifted; /* combine both edge masks */ \
4756 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4757 vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \
4759 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, \
4761 ldmia sp!, { r4 - r11, pc } \
/*
 * setup_sprite_tiled_advance_column()
 * Steps texture_offset_base to the next tile column: adds 0x100 and, when
 * bits 8-11 become zero, subtracts 0x100 + 0xF00.
 */
4763 #define setup_sprite_tiled_advance_column() \
4764 add texture_offset_base, texture_offset_base, #0x100; \
4765 tst texture_offset_base, #0xF00; /* Z set when bits 8-11 are all zero */ \
4766 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
/*
 * setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, ...)
 * Defines the entry label
 * setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode>
 * for sprites spanning several tile columns. Three column expansions are
 * emitted: the first column with draw masks from bytes 0 / 1 of block_masks,
 * a full column with the draw mask vectors cleared to zero, and the last
 * column with draw masks from bytes 4 / 5 of block_masks.
 * fb_ptr_advance_column (32 - height * 2048) moves fb_ptr from the end of
 * one column to the top of the next. The routine returns by restoring
 * r4-r11 and pc.
 */
4768 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
4770 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode: \
4771 setup_sprite_column_data_##multi_height(); \
4772 mov fb_ptr_advance_column, #32; \
4774 sub fb_ptr_advance_column, height, lsl #11; /* 32 - height * 2048 */ \
4775 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4777 vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \
4778 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm); \
4780 subs tile_width, tile_width, #2; \
4781 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4783 vmov.u8 draw_masks_fb_ptrs, #0; /* clears both draw mask vectors (q1 = d2:d3) */ \
4787 setup_sprite_tiled_advance_column(); \
4788 setup_sprite_tile_column_height_##multi_height(full, none, tm); \
4789 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4790 subs tile_width, tile_width, #1; \
4794 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4795 vdup.u8 draw_mask_fb_ptr_right, block_masks[5]; \
4797 setup_sprite_tiled_advance_column(); \
4798 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm); \
4799 ldmia sp!, { r4 - r11, pc } \
4807 // [ sp + 4 ]: width
4808 // [ sp + 8 ]: height
4809 // [ sp + 12 ]: color (unused)
/*
 * setup_sprite_tiled_builder(texture_mode)
 *
 * Emits every column-width / column-height / edge variant for texture_mode,
 * followed by the entry point function setup_sprite_<texture_mode>.
 * The entry point saves r4-r11 and lr (36 bytes), so the stack arguments are
 * read at [ sp, #36 ] (v), [ sp, #40 ] (width) and [ sp, #44 ] (height).
 * It computes the framebuffer start pointer, the tile counts, the packed
 * texture offset and mask, and the left / right edge masks, then dispatches
 * through the .word table indexed by control_mask:
 *   bit 0 (0x1) - tile_width  == 1
 *   bit 1 (0x2) - tile_height == 1
 *   bit 2 (0x4) - low byte of left_block_mask   == 0xFF
 *   bit 3 (0x8) - second byte of right_block_mask == 0xFF
 */
4811 #define setup_sprite_tiled_builder(texture_mode) \
4813 setup_sprite_tile_column_width_multi(texture_mode, multi, full, full); \
4814 setup_sprite_tile_column_width_single(texture_mode, multi, full, none); \
4815 setup_sprite_tile_column_width_multi(texture_mode, single, full, full); \
4816 setup_sprite_tile_column_width_single(texture_mode, single, full, none); \
4817 setup_sprite_tile_column_width_multi(texture_mode, multi, half, full); \
4818 setup_sprite_tile_column_width_single(texture_mode, multi, half, right); \
4819 setup_sprite_tile_column_width_multi(texture_mode, single, half, full); \
4820 setup_sprite_tile_column_width_single(texture_mode, single, half, right); \
4821 setup_sprite_tile_column_width_multi(texture_mode, multi, full, half); \
4822 setup_sprite_tile_column_width_single(texture_mode, multi, half, left); \
4823 setup_sprite_tile_column_width_multi(texture_mode, single, full, half); \
4824 setup_sprite_tile_column_width_single(texture_mode, single, half, left); \
4825 setup_sprite_tile_column_width_multi(texture_mode, multi, half, half); \
4826 setup_sprite_tile_column_width_multi(texture_mode, single, half, half); \
4830 function(setup_sprite_##texture_mode) \
4831 stmdb sp!, { r4 - r11, r14 }; /* 9 registers = 36 bytes */ \
4832 setup_sprite_tiled_initialize_##texture_mode(); \
4834 ldr v, [ sp, #36 ]; \
4835 and offset_u, u, #0xF; /* u position inside its 16-texel tile */ \
4837 ldr width, [ sp, #40 ]; \
4838 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
4840 ldr height, [ sp, #44 ]; \
4841 add fb_ptr, fb_ptr, y, lsl #11; /* + y * 2048 bytes */ \
4843 add fb_ptr, fb_ptr, x, lsl #1; /* + x * 2 bytes */ \
4844 and offset_v, v, #0xF; /* v position inside its 16-texel tile */ \
4846 sub fb_ptr, fb_ptr, offset_u, lsl #1; /* back up to the tile's left edge */ \
4847 add width_rounded, offset_u, width; \
4849 add height_rounded, offset_v, height; \
4850 add width_rounded, width_rounded, #15; \
4852 add height_rounded, height_rounded, #15; \
4853 mov tile_width, width_rounded, lsr #4; /* number of 16-wide tile columns */ \
4855 /* texture_offset_base = VH-VL-00-00 */\
4856 mov texture_offset_base, v, lsl #8; \
4857 and offset_u_right, width_rounded, #0xF; \
4859 /* texture_offset_base = VH-UH-UL-00 */\
4860 bfi texture_offset_base, u, #4, #8; \
4861 movw right_block_mask, #0xFFFE; \
4863 /* texture_offset_base = VH-UH-VL-00 */\
4864 bfi texture_offset_base, v, #4, #4; \
4865 movw left_block_mask, #0xFFFF; \
4867 mov tile_height, height_rounded, lsr #4; /* number of 16-high tile rows */ \
4868 mvn left_block_mask, left_block_mask, lsl offset_u; /* ~(0xFFFF << offset_u) */ \
4870 /* texture_mask = HH-HL-WH-WL */\
4871 ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
4872 mov right_block_mask, right_block_mask, lsl offset_u_right; \
4874 /* texture_mask_rev = WH-WL-HH-HL */\
4875 rev16 texture_mask_rev, texture_mask; \
4876 vmov block_masks, left_block_mask, right_block_mask; \
4878 /* texture_mask = HH-HL-HL-WL */\
4879 bfi texture_mask, texture_mask_rev, #4, #4; \
4880 /* texture_mask_rev = 00-00-00-WH */\
4881 mov texture_mask_rev, texture_mask_rev, lsr #12; \
4883 /* texture_mask = HH-WH-HL-WL */\
4884 bfi texture_mask, texture_mask_rev, #8, #4; \
4885 and left_block_mask, left_block_mask, #0xFF; \
4887 mov control_mask, #0; \
4888 cmp left_block_mask, #0xFF; \
4890 uxtb right_block_mask, right_block_mask, ror #8; /* bits 8-15; does not change the flags */ \
4891 orreq control_mask, control_mask, #0x4; \
4893 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4894 cmp right_block_mask, #0xFF; \
4896 orreq control_mask, control_mask, #0x8; \
4897 cmp tile_width, #1; \
4899 add block, psx_gpu, #psx_gpu_blocks_offset; \
4900 orreq control_mask, control_mask, #0x1; \
4902 cmp tile_height, #1; \
4903 add block, block, num_blocks, lsl #6; /* blocks are 64 bytes each */ \
4905 orreq control_mask, control_mask, #0x2; \
4906 ldr pc, [ pc, control_mask, lsl #2 ]; /* jump through the table below */ \
4909 .word setup_sprite_##texture_mode##_multi_multi_full_full; \
4910 .word setup_sprite_##texture_mode##_single_multi_full_none; \
4911 .word setup_sprite_##texture_mode##_multi_single_full_full; \
4912 .word setup_sprite_##texture_mode##_single_single_full_none; \
4913 .word setup_sprite_##texture_mode##_multi_multi_half_full; \
4914 .word setup_sprite_##texture_mode##_single_multi_half_right; \
4915 .word setup_sprite_##texture_mode##_multi_single_half_full; \
4916 .word setup_sprite_##texture_mode##_single_single_half_right; \
4917 .word setup_sprite_##texture_mode##_multi_multi_full_half; \
4918 .word setup_sprite_##texture_mode##_single_multi_half_left; \
4919 .word setup_sprite_##texture_mode##_multi_single_full_half; \
4920 .word setup_sprite_##texture_mode##_single_single_half_left; \
4921 .word setup_sprite_##texture_mode##_multi_multi_half_half; \
4923 .word setup_sprite_##texture_mode##_multi_single_half_half \
// Instantiate the tiled sprite setup code for both paletted texture modes.
4926 setup_sprite_tiled_builder(4bpp);
4927 setup_sprite_tiled_builder(8bpp);
// Register names used by texture_sprite_blocks_8bpp.
4935 #define block_ptr r0
4936 #define num_blocks r1
4938 #define texel_shift_mask r3
4939 #define block_pixels_a r4
4940 #define block_pixels_b r5
4949 #define texels_01 r6
4950 #define texels_23 r7
4951 #define texels_45 r8
4952 #define texels_67 r9
// texture_sprite_blocks_8bpp
// Converts the 8 one-byte texel indices stored at block + 16 .. block + 23
// of each queued block into 8 halfwords looked up in the table at clut_ptr,
// and writes them to block + 0 .. block + 15. Blocks are 64 bytes apart and
// num_blocks is read from psx_gpu.
// Each index is extracted already multiplied by 2 (mask 0xFF << 1 with the
// shift amount reduced by one), so it can be used directly as a byte offset
// for ldrh.
4954 function(texture_sprite_blocks_8bpp)
4955 stmdb sp!, { r4 - r11, r14 }
4956 movw texel_shift_mask, #(0xFF << 1)
4958 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4959 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
4961 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
4962 ldr block_pixels_a, [ block_ptr, #16 ]  // texel indices 0-3 of the first block
4965 and texel_0, texel_shift_mask, block_pixels_a, lsl #1  // index 0 * 2
4966 ldr block_pixels_b, [ block_ptr, #20 ]  // texel indices 4-7
4968 and texel_1, texel_shift_mask, block_pixels_a, lsr #7  // index 1 * 2
4969 ldrh texel_0, [ clut_ptr, texel_0 ]
4971 and texel_2, texel_shift_mask, block_pixels_a, lsr #15  // index 2 * 2
4972 ldrh texel_1, [ clut_ptr, texel_1 ]
4974 and texel_3, texel_shift_mask, block_pixels_a, lsr #23  // index 3 * 2
4975 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]  // read the following block's indices 0-3 ahead of time
4977 ldrh texel_2, [ clut_ptr, texel_2 ]
4978 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
4980 ldrh texel_3, [ clut_ptr, texel_3 ]
4981 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
4983 ldrh texel_4, [ clut_ptr, texel_4 ]
4984 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
4986 ldrh texel_5, [ clut_ptr, texel_5 ]
4987 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
4989 ldrh texel_6, [ clut_ptr, texel_6 ]
4990 orr texels_01, texel_0, texel_1, lsl #16  // pack two halfwords per word
4992 ldrh texel_7, [ clut_ptr, texel_7 ]
4993 orr texels_23, texel_2, texel_3, lsl #16
4995 orr texels_45, texel_4, texel_5, lsl #16
4996 str texels_01, [ block_ptr, #0 ]
4998 orr texels_67, texel_6, texel_7, lsl #16
4999 str texels_23, [ block_ptr, #4 ]
5001 subs num_blocks, num_blocks, #1
5002 str texels_45, [ block_ptr, #8 ]
5004 str texels_67, [ block_ptr, #12 ]
5005 add block_ptr, block_ptr, #64  // next block
5009 ldmia sp!, { r4 - r11, pc }  // restore registers and return
// Register names for the 16bpp sprite setup code that follows.
// Some names share a register (r8: left_offset / texture_offset,
// r9: width_rounded / blocks_remaining, r2: texture_mask /
// texture_mask_width, r3: texture_page_ptr / texture_mask_height,
// r4: num_blocks / left_mask_bits), so names sharing a register must not be
// live at the same time.
5012 #undef width_rounded
5015 #undef texture_offset
5024 #define left_offset r8
5025 #define width_rounded r9
5026 #define right_width r10
5027 #define block_width r11
5029 #define texture_offset_base r1
5030 #define texture_mask r2
5031 #define texture_page_ptr r3
5032 #define num_blocks r4
5035 #define texture_offset r8
5036 #define blocks_remaining r9
5037 #define fb_ptr_pitch r12
5038 #define texture_block_ptr r14
5040 #define texture_mask_width r2
5041 #define texture_mask_height r3
5042 #define left_mask_bits r4
5043 #define right_mask_bits r5
5047 #undef block_masks_shifted
5050 #define block_masks d0
5051 #define block_masks_shifted d1
5052 #define draw_mask_fb_ptr d2
// Flush helper for the 16bpp sprite setup code.
// Calls flush_render_block_buffer with r0-r3, r12 and lr preserved, then
// resets `block` to the start of the block buffer.
5056 setup_sprites_16bpp_flush_single:
5059 stmdb sp!, { r0 - r3, r12, r14 }
5060 bl flush_render_block_buffer
5061 ldmia sp!, { r0 - r3, r12, r14 }
5065 add block, psx_gpu, #psx_gpu_blocks_offset
// Flush helper for the 16bpp sprite setup code, row variant.
// Calls flush_render_block_buffer with r0-r3, r12 and lr preserved, then
// resets `block` to the start of the block buffer and sets num_blocks to
// block_width.
5070 setup_sprites_16bpp_flush_row:
5073 stmdb sp!, { r0 - r3, r12, r14 }
5074 bl flush_render_block_buffer
5075 ldmia sp!, { r0 - r3, r12, r14 }
5079 add block, psx_gpu, #psx_gpu_blocks_offset
5080 mov num_blocks, block_width
// setup_sprite_16bpp
//
// Builds 64-byte render blocks for a 16bpp sprite and appends them to the
// block array at psx_gpu + psx_gpu_blocks_offset, keeping the halfword at
// psx_gpu_num_blocks_offset up to date.
//
// Every block written here receives:
//   offset 0  - texture data fetched with a 128-bit aligned vld1
//   offset 40 - a 64-bit value: low word = one mask byte replicated,
//               high word = the framebuffer pointer for that block
//
// width and height are read from the stack at sp+40 / sp+44 (after the
// 36-byte register push). The aliases psx_gpu, x, y, u, v, width, height,
// fb_ptr, block and texels are defined outside this section.
//
// NOTE(review): branch targets and loop-back branches are not visible in
// this section. The path descriptions below are inferred from the two ldmia
// returns and the flag-setting subs instructions - confirm against the full
// file.
5084 function(setup_sprite_16bpp)
5085 stmdb sp!, { r4 - r11, r14 }
5086 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
// fb_ptr = vram + y * 2048 + x * 2 (byte offsets)
5089 add fb_ptr, fb_ptr, y, lsl #11
5091 ldr width, [ sp, #40 ]
5092 add fb_ptr, fb_ptr, x, lsl #1
5094 ldr height, [ sp, #44 ]
// left_offset = u & 7: pixels between the 8-pixel block boundary and u
5095 and left_offset, u, #0x7
// texture_offset_base = u * 2 + v * 2048, a byte offset from texture_page_ptr
5097 add texture_offset_base, u, u
5098 add width_rounded, width, #7
5100 add texture_offset_base, v, lsl #11
5101 mov left_mask_bits, #0xFF
5103 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
// width_rounded = width + 7 + left_offset
5104 add width_rounded, width_rounded, left_offset
5106 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
// Back fb_ptr up by left_offset pixels so it matches the block start
5107 sub fb_ptr, fb_ptr, left_offset, lsl #1
// texture_mask = mask_width * 2 + mask_height * 2048, the same byte layout
// as texture_offset_base
5109 add texture_mask, texture_mask_width, texture_mask_width
5110 mov right_mask_bits, #0xFE
5112 and right_width, width_rounded, #0x7
// left_mask_bits = ~(0xFF << left_offset): in the low byte, bits
// 0 .. left_offset-1 are set
5113 mvn left_mask_bits, left_mask_bits, lsl left_offset
5115 add texture_mask, texture_mask_height, lsl #11
// block_width = width_rounded / 8: blocks per sprite row
5116 mov block_width, width_rounded, lsr #3
// right_mask_bits = 0xFE << right_width
5118 mov right_mask_bits, right_mask_bits, lsl right_width
5119 movw fb_ptr_pitch, #(2048 + 16)
// fb_ptr_pitch = 2048 + 16 - block_width * 16: added after the last block
// of a row to reach the first block of the next row
5121 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
// block_masks = { low word: left_mask_bits, high word: right_mask_bits }
5122 vmov block_masks, left_mask_bits, right_mask_bits
// r4 is reused: left_mask_bits now lives in block_masks
5124 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5125 add block, psx_gpu, #psx_gpu_blocks_offset
// Align the texture offset down to the start of the 8-pixel block that
// left_offset and fb_ptr were computed against. The offset is in bytes
// (u was doubled above), so 8 pixels is 16 bytes and the low FOUR bits must
// be cleared. Clearing only three bits leaves bit 3 set whenever u & 4,
// which skews the fetch by four pixels relative to the masks and breaks the
// 16-byte alignment promised by the :128 qualifier on the vld1 loads below
// (provided texture_page_ptr itself is 16-byte aligned).
5127 bic texture_offset_base, texture_offset_base, #0xF
5130 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
// block += num_blocks * 64: first free slot in the block array
5131 add block, block, num_blocks, lsl #6
// --- Path emitting one block per row (appears to be the block_width == 1
// case) ---
// Swap the two words of block_masks and OR them in, so byte 0 holds
// left | right; replicate that byte across draw_mask_fb_ptr.
5135 vext.32 block_masks_shifted, block_masks, block_masks, #1
5136 vorr.u32 block_masks, block_masks, block_masks_shifted
5137 vdup.u8 draw_mask_fb_ptr, block_masks[0]
// Per-row body: reserve one block, flushing first if the buffer is full
5140 add num_blocks, num_blocks, #1
5141 cmp num_blocks, #MAX_BLOCKS
5142 blgt setup_sprites_16bpp_flush_single
5144 and texture_block_ptr, texture_offset_base, texture_mask
5145 subs height, height, #1
5147 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5148 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5150 vst1.u32 { texels }, [ block, :128 ]
5151 add block, block, #40
5153 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5156 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// 40 + 24 = 64: advance to the next block
5158 add block, block, #24
// Next row: 2048 bytes further in both the texture page and the framebuffer
5159 add texture_offset_base, texture_offset_base, #2048
5160 add fb_ptr, fb_ptr, #2048
5161 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5164 ldmia sp!, { r4 - r11, pc }
// --- Path emitting block_width blocks per row ---
// Reserve a whole row of blocks, flushing first if the buffer is full.
// texture_offset (r8) and blocks_remaining (r9) take over the registers of
// left_offset and width_rounded, which are no longer needed.
5167 add num_blocks, num_blocks, block_width
5168 mov texture_offset, texture_offset_base
5170 cmp num_blocks, #MAX_BLOCKS
5171 blgt setup_sprites_16bpp_flush_row
5173 add texture_offset_base, texture_offset_base, #2048
// Leftmost block of the row: uses the left mask (byte 0 of block_masks)
5174 and texture_block_ptr, texture_offset, texture_mask
5176 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5177 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5179 vst1.u32 { texels }, [ block, :128 ]
5180 add block, block, #40
5182 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5183 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5186 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// blocks_remaining = block_width - 2: blocks between the left and right
// edge blocks
5187 subs blocks_remaining, block_width, #2
5189 add texture_offset, texture_offset, #16
5190 add fb_ptr, fb_ptr, #16
// Interior blocks are stored with an all-zero mask
5192 vmov.u8 draw_mask_fb_ptr, #0
5194 add block, block, #24
// Interior block body: 16 bytes further along per block in both the texture
// row and the framebuffer row
5198 and texture_block_ptr, texture_offset, texture_mask
5199 subs blocks_remaining, blocks_remaining, #1
5201 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5202 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5204 vst1.u32 { texels }, [ block, :128 ]
5205 add block, block, #40
5207 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5210 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5212 add texture_offset, texture_offset, #16
5213 add fb_ptr, fb_ptr, #16
5215 add block, block, #24
// Rightmost block of the row: uses the right mask (byte 4 of block_masks,
// the low byte of its high word)
5219 and texture_block_ptr, texture_offset, texture_mask
5220 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5222 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5223 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5225 vst1.u32 { texels }, [ block, :128 ]
5226 add block, block, #40
5228 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5229 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5231 add block, block, #24
5232 subs height, height, #1
// Step fb_ptr from the last block of this row to the first of the next
5234 add fb_ptr, fb_ptr, fb_ptr_pitch
5235 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5239 ldmia sp!, { r4 - r11, pc }
// Register aliases for update_texture_4bpp_cache.
5242 #undef texture_page_ptr
5244 #undef dirty_textures_mask
5245 #undef current_texture_mask
5248 #define current_texture_page r1
5249 #define texture_page_ptr r2
5250 #define vram_ptr_a r3
5251 #define current_texture_page_x r12
5252 #define current_texture_page_y r4
5253 #define dirty_textures_mask r5
5257 #define current_texture_mask r9
5259 #define vram_ptr_b r11
// NEON aliases. texel_block_expanded_ab shares q2 with
// texel_block_expanded_b, and texel_block_expanded_cd shares q3 with
// texel_block_expanded_c, so each combining vorr overwrites one of its own
// inputs.
5261 #define texel_block_a d0
5262 #define texel_block_b d1
5263 #define texel_block_expanded_a q1
5264 #define texel_block_expanded_b q2
5265 #define texel_block_expanded_ab q2
5266 #define texel_block_expanded_c q3
5267 #define texel_block_expanded_d q4
5268 #define texel_block_expanded_cd q3
// update_texture_4bpp_cache
//
// Reads packed 4-bit texels of the current texture page from VRAM, expands
// every nibble to its own byte, and writes the result sequentially to the
// buffer at texture_page_ptr. Also clears the current_texture_mask bits in
// the dirty 4bpp texture mask held in psx_gpu.
//
// NOTE(review): sub_y, tile_x, tile_y and c_4096 are used below but their
// definitions and initial values, as well as the loop labels and branches,
// are not visible in this section - confirm against the full file.
5270 function(update_texture_4bpp_cache)
5271 stmdb sp!, { r4 - r11, r14 }
5274 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5276 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5277 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
// Split the page index: low 4 bits select x, the remaining bits select y
5279 and current_texture_page_x, current_texture_page, #0xF
5280 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
5282 mov current_texture_page_y, current_texture_page, lsr #4
5283 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
// vram_ptr_a = vram + page_y * 2^19 + page_x * 128 (byte offsets)
5285 add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5288 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
// Mark the current texture as no longer dirty and write the mask back
5289 bic dirty_textures_mask, current_texture_mask
5292 str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
// Second read pointer, 2048 bytes past the first
5297 add vram_ptr_b, vram_ptr_a, #2048
// Fetch 8 bytes through each pointer; both post-increment by c_4096
5300 vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
5301 vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096
// Widen each source byte 0xHL to 16 bits twice: once as 0x00HL and once
// shifted left by four as 0x0HL0
5303 vmovl.u8 texel_block_expanded_a, texel_block_a
5304 vshll.u8 texel_block_expanded_b, texel_block_a, #4
5305 vmovl.u8 texel_block_expanded_c, texel_block_b
5306 vshll.u8 texel_block_expanded_d, texel_block_b, #4
// Clearing bits 4-7 leaves 0x000L and 0x0H00 respectively
5308 vbic.u16 texel_block_expanded_a, #0x00F0
5309 vbic.u16 texel_block_expanded_b, #0x00F0
5310 vbic.u16 texel_block_expanded_c, #0x00F0
5311 vbic.u16 texel_block_expanded_d, #0x00F0
// OR the halves into 0x0H0L: one nibble per output byte
5313 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
5314 texel_block_expanded_b
5315 vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
5316 texel_block_expanded_d
// Store 32 expanded bytes and advance texture_page_ptr (writeback)
5318 vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
5319 [ texture_page_ptr, :256 ]!
5321 subs sub_y, sub_y, #1
// Move both read pointers 8 bytes to the right and back up by 16 * 2048
// bytes
5325 add vram_ptr_a, vram_ptr_a, #8
5326 add vram_ptr_b, vram_ptr_b, #8
5328 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5329 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5331 subs tile_x, tile_x, #1
// Move both read pointers down by 16 * 2048 bytes and back left by
// 8 * 16 = 128 bytes
5335 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5336 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5338 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5339 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5341 subs tile_y, tile_y, #1
5345 ldmia sp!, { r4 - r11, pc }
// Register aliases for update_texture_8bpp_cache_slice.
// current_texture_page is undefined and re-pointed from r1 to r5 so that r1
// can be named texture_page here.
5348 #undef current_texture_page
5351 #define texture_page r1
5352 #define texture_page_ptr r2
5353 #define vram_ptr_a r3
5354 #define texture_page_x r12
5355 #define texture_page_y r4
5356 #define current_texture_page r5
5361 #define vram_ptr_b r11
// update_texture_8bpp_cache_slice
//
// Copies texel data for the page given in texture_page (r1, read below
// without being loaded, so it arrives as an argument) from VRAM into the
// buffer at texture_page_ptr. When the low bit of texture_page differs from
// the low bit of the current texture page, the destination starts
// 8 * 16 * 16 = 2048 bytes further into that buffer.
//
// NOTE(review): sub_y, tile_x, tile_y, c_4096 and texels_a..texels_d are
// used below but their definitions and initial values, as well as the loop
// labels and branches, are not visible in this section - confirm against
// the full file.
5373 function(update_texture_8bpp_cache_slice)
5374 stmdb sp!, { r4 - r11, r14 }
5377 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5378 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5380 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
// Split the requested page index: low 4 bits select x, the rest select y
5383 and texture_page_x, texture_page, #0xF
5384 mov texture_page_y, texture_page, lsr #4
// vram_ptr_a = vram + page_x * 128 + page_y * 2^19 (byte offsets)
5386 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
5389 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
// Bit 0 of (current page XOR requested page) picks the destination half
5390 eor current_texture_page, current_texture_page, texture_page
5392 ands current_texture_page, current_texture_page, #0x1
5395 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
// Second read pointer, 2048 bytes past the first
5398 add vram_ptr_b, vram_ptr_a, #2048
// Four 128-bit aligned loads, alternating between the two pointers; each
// load post-increments its pointer by c_4096
5401 vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
5402 vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
5403 vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
5404 vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
// Store them back to back in a, b, c, d order, advancing texture_page_ptr
5406 vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
5407 vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
5409 subs sub_y, sub_y, #1
// Move both read pointers 16 bytes to the right and back up by 16 * 2048
// bytes
5414 add vram_ptr_a, vram_ptr_a, #16
5415 add vram_ptr_b, vram_ptr_b, #16
5417 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5418 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5420 subs tile_x, tile_x, #1
// Move both read pointers down by 16 * 2048 bytes and back left by
// 8 * 16 = 128 bytes
5425 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5426 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5428 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5429 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5431 subs tile_y, tile_y, #1
// Skip 8 * 16 * 16 = 2048 bytes in the destination buffer
5432 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
5437 ldmia sp!, { r4 - r11, pc }