add NEON GPU rasterizer
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of
7 * the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 */
14
// Capacity limits. None of them is referenced in the part of the file visible
// here; presumably they mirror the array sizes of the C-side psx_gpu
// structure - TODO confirm against the C header.
15#define MAX_SPANS 512
16#define MAX_BLOCKS 64
17#define MAX_BLOCKS_PER_ROW 128
18
// Byte offsets of fields inside the psx_gpu structure. They are used as
// immediate displacements from the psx_gpu pointer (r0) and therefore have to
// agree with the C-side structure layout, which is not visible in this file.
//
// The first group is written by compute_all_gradients: uvrg, uvrg_dx and
// uvrg_dy are 16-byte vectors, the *_block_span entries are 16-byte vectors
// holding { 0, d, 2d, 3d } for one interpolant each.
19#define psx_gpu_test_mask_offset 0
20#define psx_gpu_uvrg_offset 16
21#define psx_gpu_uvrg_dx_offset 32
22#define psx_gpu_uvrg_dy_offset 48
23#define psx_gpu_u_block_span_offset 64
24#define psx_gpu_v_block_span_offset 80
25#define psx_gpu_r_block_span_offset 96
26#define psx_gpu_g_block_span_offset 112
27#define psx_gpu_b_block_span_offset 128
28
// b_dx is the second word of the b block span (the "d" entry of
// { 0, d, 2d, 3d }).
29#define psx_gpu_b_dx_offset 132
30
// 32-bit fields (spaced 4 bytes apart; dither_table spans 16 bytes).
31#define psx_gpu_b_offset 144
32#define psx_gpu_b_dy_offset 148
33#define psx_gpu_triangle_area_offset 152
34#define psx_gpu_texture_window_settings_offset 156
35#define psx_gpu_current_texture_mask_offset 160
36#define psx_gpu_viewport_mask_offset 164
37#define psx_gpu_dirty_textures_4bpp_mask_offset 168
38#define psx_gpu_dirty_textures_8bpp_mask_offset 172
39#define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176
40#define psx_gpu_triangle_color_offset 180
41#define psx_gpu_primitive_color_offset 184
42#define psx_gpu_dither_table_offset 188
43#define psx_gpu_render_block_handler_offset 204
44#define psx_gpu_texture_page_ptr_offset 208
45#define psx_gpu_clut_ptr_offset 212
46#define psx_gpu_vram_ptr_offset 216
47
// 16-bit fields (spaced 2 bytes apart; the span setup code accesses the
// viewport limits and num_spans with ldrh / ldrsh / strh).
48#define psx_gpu_render_state_base_offset 220
49#define psx_gpu_render_state_offset 222
50#define psx_gpu_num_spans_offset 224
51#define psx_gpu_num_blocks_offset 226
52#define psx_gpu_offset_x_offset 228
53#define psx_gpu_offset_y_offset 230
54#define psx_gpu_clut_settings_offset 232
55#define psx_gpu_texture_settings_offset 234
56#define psx_gpu_viewport_start_x_offset 236
57#define psx_gpu_viewport_start_y_offset 238
58#define psx_gpu_viewport_end_x_offset 240
59#define psx_gpu_viewport_end_y_offset 242
60#define psx_gpu_mask_msb_offset 244
61
// 8-bit fields (spaced 1 byte apart; triangle_winding is read with ldrb).
62#define psx_gpu_triangle_winding_offset 246
63#define psx_gpu_display_area_draw_enable_offset 247
64#define psx_gpu_current_texture_page_offset 248
65#define psx_gpu_last_8bpp_texture_page_offset 249
66#define psx_gpu_texture_mask_width_offset 250
67#define psx_gpu_texture_mask_height_offset 251
68#define psx_gpu_texture_window_x_offset 252
69#define psx_gpu_texture_window_y_offset 253
70#define psx_gpu_primitive_type_offset 254
71
72#define psx_gpu_reserved_a_offset 255
73
// Arrays following the scalar fields. The three span_* arrays are the
// destinations written by the setup_spans_* routines (16 bytes of uvrg,
// 8 bytes of edge data and 4 bytes of b per span).
74#define psx_gpu_blocks_offset 0x0100
75#define psx_gpu_span_uvrg_offset_offset 0x2100
76#define psx_gpu_span_edge_data_offset 0x4100
77#define psx_gpu_span_b_offset_offset 0x5100
78
79#define psx_gpu__vram_offset 0x005900
80
// Byte offsets inside one 8-byte span edge record. The records are written
// four at a time by the vst4.u16 in setup_spans_set_x4_alternate_*, in the
// order left x, block count, right mask, y.
81#define edge_data_left_x_offset 0
82#define edge_data_num_blocks_offset 2
83#define edge_data_right_mask_offset 4
84#define edge_data_y_offset 6
85
86
// Core-register aliases used by compute_all_gradients.
//
// Many aliases deliberately name the same register (for example x1 and x0_x1
// are both r5, y2 and b0 are both r9, store_a is r0 like psx_gpu). The code
// relies on the earlier value being dead before the later alias is written,
// so instruction order in the function must not be changed casually.

// Incoming arguments.
87#define psx_gpu r0
88#define v_a r1
89#define v_b r2
90#define v_c r3
91
// Vertex coordinates / blue components and their packed 16:16 pairs
// (low halfword first, e.g. x0_x1 = { x0, x1 }).
92#define x0 r4
93#define x1 r5
94#define x2 r6
95#define x0_x1 r5
96#define x1_x2 r6
97#define y0 r7
98#define y1 r8
99#define y2 r9
100#define y0_y1 r7
101#define y1_y2 r8
102#define b0 r9
103#define b1 r10
104#define b2 r11
105#define b0_b1 r10
106#define b1_b2 r11
107
108
// Low 32 bits of the shifted triangle-area reciprocal (moved out of s0).
109#define area_r_s r5
110
// Blue gradient results; g_bx0..g_bx3, b_base and g_by are stored together
// with a single stmia at the end of the function.
111#define g_bx0 r2
112#define g_bx r3
113#define g_bx2 r4
114#define g_bx3 r5
115#define b_base r6
116#define g_by r8
117
// Sign masks of the blue gradients (0 or -1, from asr #31).
118#define gs_bx r7
119#define gs_by r10
120
// Absolute values of the blue gradient numerators (same registers as the
// final gradients).
121#define ga_bx g_bx
122#define ga_by g_by
123
// High / low words of the 64-bit umull products; the low word is discarded,
// so both products share r11 for it.
124#define gw_bx_h g_bx
125#define gw_by_h g_by
126
127#define gw_bx_l r11
128#define gw_by_l gw_bx_l
129
// Store pointers and post-increment used for the interleaved result stores.
// store_a reuses r0, so psx_gpu is no longer available once it is set.
130#define store_a r0
131#define store_b r1
132#define store_inc r5
133
134
// NEON register aliases used by compute_all_gradients. As with the core
// registers, several names map to the same q/d register at different points
// of the function (e.g. q1 is v1, then ga_uvrg_x / g_uvrg_x, then uvrg_dx1).

// Raw 16-byte vertex loads: low half = packed u, v, r, g, b bytes,
// high half = 16-bit x (lane 0) and y (lane 1).
135#define v0 q0
136#define uvrgb0 d0
137#define x0_y0 d1
138
139#define v1 q1
140#define uvrgb1 d2
141#define x1_y1 d3
142
143#define v2 q2
144#define uvrgb2 d4
145#define x2_y2 d5
146
// "First type" operands: { uvrg (widened to 16 bit), x x x x } per vertex.
147#define x0_ab q3
148#define uvrg_xxxx0 q3
149#define uvrg0 d6
150#define xxxx0 d7
151
152#define x1_ab q4
153#define uvrg_xxxx1 q4
154#define uvrg1 d8
155#define xxxx1 d9
156
157#define x2_ab q5
158#define uvrg_xxxx2 q5
159#define uvrg2 d10
160#define xxxx2 d11
161
// "Second type" operands: { y y y y, uvrg } per vertex.
162#define y0_ab q6
163#define yyyy_uvrg0 q6
164#define yyyy0 d12
165#define uvrg0b d13
166
167#define y1_ab q7
168#define yyyy_uvrg1 q7
169#define yyyy1 d14
170#define uvrg1b d15
171
172#define y2_ab q8
173#define yyyy_uvrg2 q8
174#define yyyy2 d16
175#define uvrg2b d17
176
// The four 16-bit difference vectors d0..d3 of the gradient formula; the _a
// half feeds the x gradients, the _b half the y gradients.
177#define d0_ab q9
178#define d0_a d18
179#define d0_b d19
180
181#define d1_ab q10
182#define d1_a d20
183#define d1_b d21
184
185#define d2_ab q11
186#define d2_a d22
187#define d2_b d23
188
189#define d3_ab q12
190#define d3_a d24
191#define d3_b d25
192
// 32-bit gradient numerators (later their absolute values).
193#define ga_uvrg_x q1
194#define ga_uvrg_y q4
195
// Packed 16:16 differences in core registers (aliases of the pair registers
// defined with the core-register aliases).
196#define dx x0_x1
197#define dy y0_y1
198#define db b0_b1
199
// Interpolant base values (uvrg0 << 16, plus 0x8000, minus gradient * x0).
200#define uvrg_base q11
201
// Sign masks of the numerators (all ones where negative).
202#define gs_uvrg_x q5
203#define gs_uvrg_y q6
204
// Final gradients and their d-register halves.
205#define g_uvrg_x q1
206#define ga_uv_x d2
207#define g_uv_x d2
208#define ga_rg_x d3
209#define g_rg_x d3
210
211#define g_uvrg_y q4
212#define ga_uv_y d8
213#define g_uv_y d8
214#define ga_rg_y d9
215#define g_rg_y d9
216
// 64-bit products of |numerator| * area reciprocal.
217#define gw_uv_x q1
218#define gw_rg_x q2
219#define gw_uv_y q4
220#define gw_rg_y q3
221
// -(triangle_winding) replicated to every 32-bit lane.
222#define w_mask q9
223#define w_mask_l d18
224
// Shift count applied to the 64-bit products (replicated to every lane).
225#define r_shift q10
226
// Multiples 0..3 of the x gradient, stored interleaved as the block spans.
227#define uvrg_dx0 q0
228#define uvrg_dx0l d0
229#define uvrg_dx0h d1
230
231#define uvrg_dx1 q1
232#define uvrg_dx1l d2
233#define uvrg_dx1h d3
234
235#define uvrg_dx2 q2
236#define uvrg_dx2l d4
237#define uvrg_dx2h d5
238
239#define uvrg_dx3 q3
240#define uvrg_dx3l d6
241#define uvrg_dx3h d7
242
243
// On ARM targets the GNU assembler interprets the .align argument as a power
// of two, so this requests 16-byte alignment for what follows.
244.align 4
245
// function(name): exports `name` as a global symbol and defines the label at
// the current location. The routine body follows the macro invocation.
246#define function(name) \
247 .global name; \
248 name: \
249
// compute_all_gradients
//
// Computes the x and y gradients of the u, v, r, g and b vertex attributes
// over a triangle and stores them, together with the attribute base values,
// in the psx_gpu structure (uvrg, uvrg_dx, uvrg_dy, the u/v/r/g/b block-span
// vectors, b and b_dy).
//
// In:       r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c. Each vertex is read as
//           16 bytes: u, v, r, g at bytes 0-3, b at byte 4, 16-bit x at
//           byte 8 and 16-bit y at byte 10. The vld1 loads carry a 128-bit
//           alignment hint, so the vertices have to be 16-byte aligned.
//           psx_gpu->triangle_area and psx_gpu->triangle_winding must already
//           be set.
// Out:      nothing in registers; results are written through psx_gpu.
// Clobbers: r0-r3, r12, flags, q0-q3, q8-q12, d30, d31.
//           r4-r11 and q4-q7 (d8-d15) are saved and restored; the procedure
//           call standard makes d8-d15 callee-saved and this routine uses
//           all of them as scratch.
//
// ARM and NEON instructions are interleaved on purpose (the comments below
// mention work overlapping with the vdiv); keep the order when editing.
250@ r0: psx_gpu
251@ r1: v_a
252@ r2: v_b
253@ r3: v_c
254
255function(compute_all_gradients)
256 // First compute the triangle area reciprocal and shift. The division will
257 // happen concurrently with much of the work which follows.
258 @ r12 = psx_gpu->triangle_area
259 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
260 stmdb sp!, { r4 - r11, lr }
 vpush { q4 - q7 } @ d8-d15 are callee-saved and used below
261
262 @ load exponent of 62 into upper half of double
263 movw r4, #0
264 clz r14, r12 @ r14 = shift
265
266 movt r4, #((62 + 1023) << 4)
267 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
268
269 @ load area normalized into lower half of double
270 mov r5, r12, lsr #10
271 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
272
 @ Build ta_n as a double in d31: the top bit of the normalized area carries
 @ into the exponent field, bits 30..11 become the high mantissa bits.
 @ NOTE(review): with the high word taking bits 31..11, the low word would
 @ be expected to be r12 << 21; lsl #20 places bit 11 in both words. Only
 @ the low mantissa bits are affected - confirm whether this is intended
 @ before changing it.
273 movt r4, #((1022 + 31) << 4)
274 mov r5, r12, lsl #20
275
276 add r4, r4, r12, lsr #11
277 vmov.f64 d31, r5, r4
278
279 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
280
281 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
282 // ( d0 * d1 ) - ( d2 * d3 ) =
283 // ( m0 ) - ( m1 ) = gradient
284
285 // This is split to do 12 elements at a time over three sets: a, b, and c.
286 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
287 // two of the slots are unused.
288
289 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
290 // is g.
291
292 // First type is: uvrg bxxx xxxx
293 // Second type is: yyyy ybyy uvrg
294 // Since x_a and y_c are the same the same variable is used for both.
295
 @ x0 is loaded sign-extended because it is also used as a multiplier further
 @ down; the other coordinates are loaded zero-extended so that they can be
 @ packed into 16:16 pairs with a plain orr.
296 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
297 ldrsh x0, [ v_a, #8 ] @ load x0
298
299 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
300 ldrh x1, [ v_b, #8 ] @ load x1
301
302 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
303 ldrh x2, [ v_c, #8 ] @ load x2
304
305 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
306 ldrh y0, [ v_a, #10 ] @ load y0
307
308 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
309 ldrh y1, [ v_b, #10 ] @ load y1
310
311 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
312 ldrh y2, [ v_c, #10 ] @ load y2
313
314 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
315 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
316
317 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
318 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
319
320 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
321 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
322
323 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
324 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
325
326 ldrb b2, [ v_c, #4 ] @ load b2
327 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
328
329 ldrb b1, [ v_b, #4 ] @ load b1
330 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
331
332 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
333 vsub.s16 d0_ab, x1_ab, x0_ab
334
335 ldrb b0, [ v_a, #4 ] @ load b0
336 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
337
338 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
339 vsub.s16 d2_ab, x2_ab, x1_ab
340
341 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
342 vsub.s16 d1_ab, y2_ab, y1_ab
343
344 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
345 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
346
347 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
348 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
349
350 vsub.s16 d3_ab, y1_ab, y0_ab
351 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
352 @ ((x2 - x1) * (b1 - b0))
353 vmull.s16 ga_uvrg_x, d0_a, d1_a
354 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
355 @ ((b2 - b1) * (y1 - y0))
356 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
 @ From here on the numerators are split into a sign mask (gs_*) and an
 @ absolute value (ga_*), so that the reciprocal multiply can be unsigned.
357 movs gs_bx, ga_bx, asr #31
358
359 vmull.s16 ga_uvrg_y, d0_b, d1_b
360 rsbmi ga_bx, ga_bx, #0
361
362 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
363 movs gs_by, ga_by, asr #31
364
 @ Take the quotient's bit pattern shifted right by 22; only its low 32 bits
 @ (d0[0] / s0) are used as the area reciprocal below.
365 vshr.u64 d0, d30, #22
366 mov b_base, b0, lsl #16
367
368 rsbmi ga_by, ga_by, #0
369 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
370
371 @ r12 = psx_gpu->triangle_winding
372 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
373 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
374
375 add b_base, b_base, #0x8000
376 rsb r12, r12, #0 @ r12 = -(triangle->winding)
377
378 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
379 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
380
381 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
382 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
383
384 vorr.u32 uvrg_base, #0x8000
385 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
386
387 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
388 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
389
 @ |numerator| * reciprocal as 64-bit products, then shifted by r_shift
 @ (vshl with a negative per-lane count shifts right) and narrowed to 32 bit.
390 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
391 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
392 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
393 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
394
395 vshl.u64 gw_rg_x, gw_rg_x, r_shift
396 vshl.u64 gw_uv_x, gw_uv_x, r_shift
397 vshl.u64 gw_rg_y, gw_rg_y, r_shift
398 vshl.u64 gw_uv_y, gw_uv_y, r_shift
399
 @ Combine the numerator sign with the winding mask, then apply it with the
 @ (value ^ mask) - mask idiom, which negates a lane when its mask is all
 @ ones.
400 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
401 vmovn.u64 g_uv_x, gw_uv_x
402
403 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
404 vmovn.u64 g_rg_x, gw_rg_x
405
406 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
407 vmovn.u64 g_uv_y, gw_uv_y
408
409 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
410 vmovn.u64 g_rg_y, gw_rg_y
411
412 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
413 mov ga_bx, ga_bx, lsl #13
414
415 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
416 mov ga_by, ga_by, lsl #13
417
418 vdup.u32 x0_y0, x0
419 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
420
421 vshl.u32 g_uvrg_x, g_uvrg_x, #4
422 vshl.u32 g_uvrg_y, g_uvrg_y, #4
423
424 umull gw_by_l, gw_by_h, ga_by, area_r_s
 @ uvrg_base -= x gradient * x0
425 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
426
427 eor gs_bx, gs_bx, r12
428 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
429
430 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
431 eor gs_by, gs_by, r12
432
433 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
 @ store_a replaces psx_gpu in r0 from here on.
434 add store_a, psx_gpu, #psx_gpu_uvrg_offset
435
436 sub r11, r11, #(32 - 13)
437
438 add store_b, store_a, #16
439 mov store_inc, #32
440
 @ Stores, in address order relative to psx_gpu: uvrg (16), uvrg_dx (32),
 @ uvrg_dy (48), then the interleaved { 0, d, 2d, 3d } block spans for
 @ u (64), v (80), r (96) and g (112).
441 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
442 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
443
444 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
445 mov g_bx, gw_bx_h, lsr r11
446
447 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
448 mov g_by, gw_by_h, lsr r11
449
450 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
451 [ store_b, : 128 ], store_inc
452 eor g_bx, g_bx, gs_bx
453
454 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
455 [ store_b, : 128 ], store_inc
456 sub g_bx, g_bx, gs_bx
457
458 lsl g_bx, g_bx, #4
459 eor g_by, g_by, gs_by
460
 @ b_base -= b x gradient * x0
461 mls b_base, g_bx, x0, b_base
462 sub g_by, g_by, gs_by
463
464 lsl g_by, g_by, #4
465 mov g_bx0, #0
466
467 add g_bx2, g_bx, g_bx
468 add g_bx3, g_bx, g_bx2
469
 @ store_b now points at the b block span (offset 128): writes
 @ { 0, g_bx, 2 * g_bx, 3 * g_bx }, then b (144) and b_dy (148).
470 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
471
 vpop { q4 - q7 }
472 ldmia sp!, { r4 - r11, pc }
473
474
// Register aliases for the setup_spans_* routines and their helper macros.
//
// As in compute_all_gradients, many aliases share a register and are only
// valid during part of a routine. In particular y_a / y_b / y_c reuse r1-r3,
// so the vertex pointers v_a / v_b / v_c are gone once the y coordinates have
// been loaded, and the span output pointers (r4-r6) replace the x
// coordinates / alternate edge values.

// Incoming arguments (identical to the definitions further up).
475#define psx_gpu r0
476#define v_a r1
477#define v_b r2
478#define v_c r3
479
// Scratch register; this is lr, which the prologue saves on the stack.
480#define temp r14
481
482#define x_a r4
483#define x_b r5
484#define x_c r6
485#define y_a r1
486#define y_b r2
487#define y_c r3
488
489#define height_minor_a r7
490#define height_minor_b r8
491#define height_major r9
492#define height r9
493
// Holds the address of reciprocal_table (defined outside the visible part of
// the file); shares r10 with edge_shift_alt.
494#define reciprocal_table_ptr r10
495
// Scalar copy of the third ("alternate") edge: 64-bit position in
// edge_alt_low / edge_alt_high, 32-bit slope and shift count.
496#define edge_alt_low r4
497#define edge_alt_high r5
498#define edge_dx_dy_alt r6
499#define edge_shift_alt r10
500
501#define edge_dx_dy_alt_low r4
502#define edge_dx_dy_alt_high r5
503
// Output pointers into the psx_gpu span arrays.
504#define span_edge_data r4
505#define span_uvrg_offset r5
506#define span_b_offset r6
507
// Number of rows skipped by viewport clipping (same register as temp).
508#define clip r14
509
// Blue interpolant and its per-row step.
510#define b r11
511#define b_dy r12
512
513
// 64-bit position / slope of the alternate edge for two rows, and the
// narrowed 32-bit x values of four rows.
514#define alternate_x q0
515#define alternate_dx_dy q1
516#define alternate_x_32 q2
517
518#define alternate_x_low d0
519#define alternate_x_high d1
520#define alternate_dx_dy_low d2
521#define alternate_dx_dy_high d3
522#define alternate_x_32_low d4
523#define alternate_x_32_high d5
524
// 64-bit positions / slopes of the left and right edge (two rows per
// q register) and the viewport x limits replicated to 16-bit lanes.
525#define left_x q3
526#define right_x q4
527#define left_dx_dy q5
528#define right_dx_dy q6
529#define left_edge q7
530#define right_edge q8
531
532#define left_x_low d6
533#define left_x_high d7
534#define right_x_low d8
535#define right_x_high d9
536#define left_dx_dy_low d10
537#define left_dx_dy_high d11
538#define right_dx_dy_low d12
539#define right_dx_dy_high d13
540#define left_edge_low d14
541#define left_edge_high d15
542#define right_edge_low d16
543#define right_edge_high d17
544
545#define y_mid_point d18
546#define c_0x0004 d19
547
// q11 = { left x, right x / block count } and q12 = { span_shifts, y_x4 } are
// stored together as one group of four span edge records.
// c_0x0001 (q13) overlaps c_0xFFFE (d26) and c_0x0007 (d27); it is only
// valid until those two constants are loaded.
548#define left_right_x_16 q11
549#define span_shifts_y q12
550#define c_0x0001 q13
551
552#define span_shifts d24
553#define y_x4 d25
554#define c_0xFFFE d26
555#define c_0x0007 d27
556
557#define left_right_x_16_low d22
558#define left_right_x_16_high d23
559
// Current u/v/r/g interpolants and their per-row step.
560#define uvrg q14
561#define uvrg_dy q15
562
563#define alternate_x_16 d4
564
// Clip row count replicated to 32-bit lanes (reuses q3 before it becomes
// left_x).
565#define v_clip q3
566#define v_clip_low d6
567
568#define right_x_32 q10
569#define left_x_32 q11
570#define alternate_select d24
571
572#define right_x_32_low d20
573#define right_x_32_high d21
574#define left_x_32_low d22
575#define left_x_32_high d23
576
// Edge set produced by compute_edge_delta_x2 / _x3: lane 0 ("left" index) and
// lane 1 ("right" index) each describe one edge.
577#define edges_xy q0
578#define edges_dx_dy d2
579#define edge_shifts d3
580#define edge_shifts_64 q2
581
582#define edges_xy_left d0
583#define edges_xy_right d1
584
585#define height_reciprocals d6
586#define heights d7
587
588#define widths d8
589#define c_0x01 d9
590#define x_starts d10
591#define x_ends d11
592
593#define heights_b d12
594#define edges_dx_dy_64 q10
595
596#define edges_dx_dy_64_left d20
597#define edges_dx_dy_64_right d21
598
599
// setup_spans_prologue: common entry sequence of the setup_spans_* functions.
//
// Saves r4-r11 and lr, loads the signed 16-bit x (offset 8) and y (offset 10)
// of the three vertices, loads uvrg and uvrg_dy from psx_gpu, points
// reciprocal_table_ptr at reciprocal_table (a symbol defined outside the
// visible part of this file) and sets both 32-bit lanes of c_0x01 to 1.
//
// y_a / y_b / y_c are the same registers as v_a / v_b / v_c, so every x load
// has to come before the corresponding y load.
//
// NOTE(review): the span setup code uses q3-q8 as scratch, which includes
// d8-d15, but only core registers are saved here. The procedure call standard
// treats d8-d15 as callee-saved - confirm that callers tolerate this. Any fix
// has to be made together with setup_spans_epilogue and every user of this
// macro.
600#define setup_spans_prologue() \
601 stmdb sp!, { r4 - r11, lr }; \
602 \
603 ldrsh x_a, [ v_a, #8 ]; \
604 ldrsh x_b, [ v_b, #8 ]; \
605 ldrsh x_c, [ v_c, #8 ]; \
606 ldrsh y_a, [ v_a, #10 ]; \
607 ldrsh y_b, [ v_b, #10 ]; \
608 ldrsh y_c, [ v_c, #10 ]; \
609 \
610 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
611 vld1.32 { uvrg }, [ temp ]; \
612 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
613 vld1.32 { uvrg_dy }, [ temp ]; \
614 movw reciprocal_table_ptr, :lower16:reciprocal_table; \
615 movt reciprocal_table_ptr, :upper16:reciprocal_table; \
616 \
617 vmov.u32 c_0x01, #0x01 \
618
// setup_spans_load_b: loads the blue interpolant base (b) and its per-row
// step (b_dy) from psx_gpu. b / b_dy live in r11 / r12, which other macros
// use as scratch before this point.
619#define setup_spans_load_b() \
620 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
621 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
622
// setup_spans_prologue_b: prepares pointers and constants for the span loop.
//
// Points span_uvrg_offset, span_edge_data and span_b_offset at the matching
// arrays inside psx_gpu, replicates viewport_start_x into every 16-bit lane
// of left_edge and viewport_end_x + 1 into every lane of right_edge, and
// loads the lane constants 4, 7 and 0xFFFE.
//
// span_edge_data / span_uvrg_offset / span_b_offset reuse r4-r6, so the x
// coordinates and scalar alternate-edge values are dead after this macro.
// c_0x0001 (q13) is only a temporary for the right_edge increment: c_0x0007
// and c_0xFFFE are its two halves and are written afterwards.
623#define setup_spans_prologue_b() \
624 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
625 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
626 \
627 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
628 vmov.u16 c_0x0004, #0x0004; \
629 \
630 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
631 vmov.u16 c_0x0001, #0x0001; \
632 \
633 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
634 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
635 \
636 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
637 vadd.u16 right_edge, right_edge, c_0x0001; \
638 \
639 vmov.u16 c_0x0007, #0x0007; \
640 vmvn.u16 c_0xFFFE, #0x0001 \
641
642
// compute_edge_delta_x2: sets up two edges that share one height.
//
// Expects x_starts / x_ends (one edge per 32-bit lane), height and c_0x01.
// Looks up reciprocal_table[height]; the code treats bits 31..12 of the entry
// as the height reciprocal and the low bits as a shift count. Produces, per
// lane:
//   edges_dx_dy = (x_end - x_start) * reciprocal            (32 bit)
//   edges_xy    = (x_start * height + height - 1) * reciprocal  (64 bit)
//   edge_shifts = table entry with bits 5-7 of each halfword cleared, so
//                 that the low byte, which the later vshl uses as the shift
//                 count, is limited to 0..31.
// Clobbers temp, heights, widths, heights_b and height_reciprocals.
643#define compute_edge_delta_x2() \
644 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
645 \
646 vdup.u32 heights, height; \
647 vsub.u32 widths, x_ends, x_starts; \
648 \
649 vdup.u32 edge_shifts, temp; \
650 vsub.u32 heights_b, heights, c_0x01; \
651 vshr.u32 height_reciprocals, edge_shifts, #12; \
652 \
653 vmla.s32 heights_b, x_starts, heights; \
654 vbic.u16 edge_shifts, #0xE0; \
655 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
656 vmull.s32 edges_xy, heights_b, height_reciprocals \
657
// Scratch registers of the scalar ("alternate") edge computation below.
// width_alt reuses r6 (x_c); the other two reuse r11 / r12 (later b / b_dy).
658#define width_alt r6
659#define height_reciprocal_alt r11
660#define height_b_alt r12
661
// compute_edge_delta_x3(start_c, height_a, height_b): sets up three edges.
//
// Two edges are computed in NEON lanes exactly as in compute_edge_delta_x2,
// with height_a used for lane 0 and height_b for lane 1. The third,
// "alternate" edge runs from x = start_c to x_c over height_minor_b rows and
// is computed with core-register instructions interleaved with the NEON ones:
//   edge_dx_dy_alt               = (x_c - start_c) * reciprocal
//   edge_alt_high:edge_alt_low   = (start_c * height_minor_b +
//                                   height_minor_b - 1) * reciprocal
//   edge_shift_alt               = table entry & 0x1F
//
// The last table load overwrites reciprocal_table_ptr (same register as
// edge_shift_alt), and the final smull overwrites x_a / x_b (same registers
// as edge_alt_low / edge_alt_high), so start_c is consumed before that point.
662#define compute_edge_delta_x3(start_c, height_a, height_b) \
663 vmov.u32 heights, height_a, height_b; \
664 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
665 vmov.u32 edge_shifts[0], temp; \
666 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
667 vmov.u32 edge_shifts[1], temp; \
668 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
669 \
670 vsub.u32 widths, x_ends, x_starts; \
671 sub width_alt, x_c, start_c; \
672 \
673 vsub.u32 heights_b, heights, c_0x01; \
674 sub height_b_alt, height_minor_b, #1; \
675 \
676 vshr.u32 height_reciprocals, edge_shifts, #12; \
677 lsr height_reciprocal_alt, edge_shift_alt, #12; \
678 \
679 vmla.s32 heights_b, x_starts, heights; \
680 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
681 \
682 vbic.u16 edge_shifts, #0xE0; \
683 and edge_shift_alt, edge_shift_alt, #0x1F; \
684 \
685 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
686 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
687 \
688 vmull.s32 edges_xy, heights_b, height_reciprocals; \
689 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
690
691
// setup_spans_adjust_y_up / _down: move the four row numbers held in y_x4 by
// four rows for the next loop iteration.
// NOTE(review): y_x4 and c_0x0004 hold 16-bit lanes but the add / subtract is
// done on 32-bit lanes, so a carry or borrow out of a low halfword reaches
// the halfword above it. Presumably this only happens for rows that are never
// emitted - confirm.
692#define setup_spans_adjust_y_up() \
693 vsub.u32 y_x4, y_x4, c_0x0004 \
694
695#define setup_spans_adjust_y_down() \
696 vadd.u32 y_x4, y_x4, c_0x0004 \
697
// setup_spans_adjust_interpolants_up / _down: step the uvrg and b
// interpolants by one row (subtracting the per-row step when moving up,
// adding it when moving down).
698#define setup_spans_adjust_interpolants_up() \
699 vsub.u32 uvrg, uvrg, uvrg_dy; \
700 sub b, b, b_dy \
701
702#define setup_spans_adjust_interpolants_down() \
703 vadd.u32 uvrg, uvrg, uvrg_dy; \
704 add b, b, b_dy \
705
706
// Helpers for setup_spans_clip: advance the interpolants by `clip` rows in
// one step (increment adds clip * step, decrement subtracts it).
707#define setup_spans_clip_interpolants_increment() \
708 mla b, b_dy, clip, b; \
709 vmla.s32 uvrg, uvrg_dy, v_clip \
710
711#define setup_spans_clip_interpolants_decrement() \
712 mls b, b_dy, clip, b; \
713 vmls.s32 uvrg, uvrg_dy, v_clip \
714
// Advance the scalar alternate edge by `clip` rows (64-bit accumulate); the
// _no variant expands to nothing for triangles without an alternate edge.
715#define setup_spans_clip_alternate_yes() \
716 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
717
718#define setup_spans_clip_alternate_no() \
719
// setup_spans_clip(direction, alternate_active): skips `clip` rows that lie
// outside the viewport.
//   direction        - increment or decrement; selects how the interpolants
//                      are moved.
//   alternate_active - yes or no; whether the alternate edge is advanced too.
// The two NEON edges are always advanced by clip * edges_dx_dy. v_clip reuses
// q3, so this has to run before q3 is loaded with left_x.
720#define setup_spans_clip(direction, alternate_active) \
721 vdup.u32 v_clip, clip; \
722 setup_spans_clip_alternate_##alternate_active(); \
723 setup_spans_clip_interpolants_##direction(); \
724 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
725
726
// setup_spans_adjust_edges_alternate_no(left_index, right_index): converts
// the edge set into the form used by the span loop.
//   left_index / right_index - `left` or `right`; select which lane of the
//                              edge set becomes the left and the right edge.
// Both the 64-bit positions and the sign-extended slopes are shifted left by
// the per-edge shift counts. Afterwards each of left_x / right_x holds the
// positions of two consecutive rows { x, x + dx_dy } and left_dx_dy /
// right_dx_dy hold 2 * dx_dy in both halves, i.e. one vadd.u64 advances a
// register by two rows. The span loop takes the upper 32 bits of each
// position as the x coordinate.
727#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
728 vmovl.s32 edge_shifts_64, edge_shifts; \
729 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
730 \
731 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
732 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
733 \
734 vmov left_x_low, edges_xy_##left_index; \
735 vmov right_x_low, edges_xy_##right_index; \
736 \
737 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
738 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
739 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
740 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
741 \
742 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
743 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
744 \
745 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
746 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
747
748
// setup_spans_adjust_edges_alternate_yes(left_index, right_index): same as
// the _no variant, and additionally prepares the alternate edge.
//
// y_mid_point receives y_b in every 16-bit lane. The scalar 64-bit alternate
// position and the 32-bit slope are shifted left by edge_shift_alt using
// core-register instructions (temp = 32 - shift supplies the bits that cross
// the word boundary) and moved into alternate_x / alternate_dx_dy, which end
// up in the same two-rows-per-register form as left_x / left_dx_dy.
//
// alternate_x / alternate_dx_dy occupy q0 / q1, i.e. the registers of the
// NEON edge set, so the _no part has to run first. edge_dx_dy_alt_high shares
// r5 with edge_alt_high, which has already been moved to alternate_x_low by
// then.
749#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
750 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
751 \
752 vdup.u16 y_mid_point, y_b; \
753 rsb temp, edge_shift_alt, #32; \
754 \
755 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
756 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
757 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
758 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
759 \
760 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
761 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
762 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
763 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
764 \
765 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
766 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
767
768
// setup_spans_y_select_up / _down: build a per-row mask (all ones / all
// zeros per 16-bit lane) that is set for rows beyond y_mid_point in the
// direction of travel: y < y_mid_point when moving up, y > y_mid_point when
// moving down.
769#define setup_spans_y_select_up() \
770 vclt.s16 alternate_select, y_x4, y_mid_point \
771
772#define setup_spans_y_select_down() \
773 vcgt.s16 alternate_select, y_x4, y_mid_point \
774
775
// setup_spans_alternate_select_left / _right: for the rows selected by the
// mask, replace the left (low half) or right (high half) x coordinate with
// the alternate edge's x coordinate.
776#define setup_spans_alternate_select_left() \
777 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
778
779#define setup_spans_alternate_select_right() \
780 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
781
782
// setup_spans_set_x4_alternate_yes(alternate, direction): emits four spans
// (rows) for a triangle that has an alternate edge.
//   alternate - left or right; which side the alternate edge replaces.
//   direction - up or down; direction of travel in y.
//
// Per invocation:
//  - takes the upper 32 bits of the left, right and alternate edge positions
//    for four rows (two rows, advance, two rows, advance) and narrows them to
//    16 bit;
//  - substitutes the alternate x on the chosen side for the rows selected by
//    setup_spans_y_select_<direction>;
//  - stores uvrg (16 bytes) and b (4 bytes) for each of the four rows,
//    stepping the interpolants after every store;
//  - clamps the x coordinates to [left_edge, right_edge], then derives from
//    the width (right - left): the block count (width + 7) >> 3 and the mask
//    0xFFFE << ((width + 7) & 7);
//  - stores four 8-byte edge records { left x, block count, mask, y } with
//    one interleaving vst4.u16 and moves y_x4 on by four rows.
//
// alternate_select shares d24 with span_shifts and left_x_32 shares q11 with
// left_right_x_16; the instruction order keeps those uses apart.
783#define setup_spans_set_x4_alternate_yes(alternate, direction) \
784 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
785 vshrn.s64 left_x_32_low, left_x, #32; \
786 vshrn.s64 right_x_32_low, right_x, #32; \
787 \
788 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
789 vadd.u64 left_x, left_x, left_dx_dy; \
790 vadd.u64 right_x, right_x, right_dx_dy; \
791 \
792 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
793 vshrn.s64 left_x_32_high, left_x, #32; \
794 vshrn.s64 right_x_32_high, right_x, #32; \
795 \
796 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
797 vadd.u64 left_x, left_x, left_dx_dy; \
798 vadd.u64 right_x, right_x, right_dx_dy; \
799 \
800 vmovn.u32 alternate_x_16, alternate_x_32; \
801 setup_spans_y_select_##direction(); \
802 vmovn.u32 left_right_x_16_low, left_x_32; \
803 \
804 vmovn.u32 left_right_x_16_high, right_x_32; \
805 setup_spans_alternate_select_##alternate(); \
806 \
807 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
808 str b, [ span_b_offset ], #4; \
809 setup_spans_adjust_interpolants_##direction(); \
810 \
811 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
812 \
813 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
814 str b, [ span_b_offset ], #4; \
815 setup_spans_adjust_interpolants_##direction(); \
816 \
817 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
818 \
819 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
820 str b, [ span_b_offset ], #4; \
821 setup_spans_adjust_interpolants_##direction(); \
822 \
823 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
824 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
825 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
826 \
827 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
828 str b, [ span_b_offset ], #4; \
829 setup_spans_adjust_interpolants_##direction(); \
830 \
831 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
832 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
833 \
834 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
835 \
836 setup_spans_adjust_y_##direction() \
837
838
// setup_spans_set_x4_alternate_no(alternate, direction): emits four spans
// (rows) for a triangle without an alternate edge.
//   alternate - unused in this variant (callers pass `none`).
//   direction - up or down; direction of travel in y.
//
// Per invocation:
//  - takes the upper 32 bits of the left and right edge positions for four
//    rows (two rows, advance, two rows, advance) and narrows them to 16 bit;
//  - stores uvrg (16 bytes) and b (4 bytes) for each of the four rows,
//    stepping the interpolants after every store;
//  - clamps the x coordinates to [left_edge, right_edge], then derives from
//    the width (right - left): the block count (width + 7) >> 3 and the mask
//    0xFFFE << ((width + 7) & 7);
//  - stores four 8-byte edge records { left x, block count, mask, y } with
//    one interleaving vst4.u16 and moves y_x4 on by four rows.
839#define setup_spans_set_x4_alternate_no(alternate, direction) \
840 vshrn.s64 left_x_32_low, left_x, #32; \
841 vshrn.s64 right_x_32_low, right_x, #32; \
842 \
843 vadd.u64 left_x, left_x, left_dx_dy; \
844 vadd.u64 right_x, right_x, right_dx_dy; \
845 \
846 vshrn.s64 left_x_32_high, left_x, #32; \
847 vshrn.s64 right_x_32_high, right_x, #32; \
848 \
849 vadd.u64 left_x, left_x, left_dx_dy; \
850 vadd.u64 right_x, right_x, right_dx_dy; \
851 \
852 vmovn.u32 left_right_x_16_low, left_x_32; \
853 vmovn.u32 left_right_x_16_high, right_x_32; \
854 \
855 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
856 str b, [ span_b_offset ], #4; \
857 setup_spans_adjust_interpolants_##direction(); \
858 \
859 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
860 \
861 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
862 str b, [ span_b_offset ], #4; \
863 setup_spans_adjust_interpolants_##direction(); \
864 \
865 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
866 \
867 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
868 str b, [ span_b_offset ], #4; \
869 setup_spans_adjust_interpolants_##direction(); \
870 \
871 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
872 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
873 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
874 \
875 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
876 str b, [ span_b_offset ], #4; \
877 setup_spans_adjust_interpolants_##direction(); \
878 \
879 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
880 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
881 \
882 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
883 \
884 setup_spans_adjust_y_##direction() \
885
886
// Scratch pair for the 64-bit product below; r11 / r12 are reloaded with
// b / b_dy by setup_spans_load_b right afterwards.
887#define edge_adjust_low r11
888#define edge_adjust_high r12
889
// setup_spans_alternate_adjust_yes: subtracts edge_dx_dy_alt * height_minor_a
// (64-bit) from the alternate edge position, i.e. moves the alternate edge
// back by height_minor_a rows so that it can be stepped in lockstep with the
// other edges from the first row on. The _no variant expands to nothing.
890#define setup_spans_alternate_adjust_yes() \
891 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
892 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
893 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
894
895#define setup_spans_alternate_adjust_no() \
896
897
// setup_spans_down(left_index, right_index, alternate, alternate_active):
// generates the spans of a triangle (part) walked from y_a downwards.
//   left_index / right_index - left or right; which lane of the edge set is
//                              the left / right edge.
//   alternate                - left, right or none; side replaced by the
//                              alternate edge.
//   alternate_active         - yes or no; whether an alternate edge exists.
//
// Steps:
//  - if y_c lies below viewport_end_y, height is reduced by
//    (y_c - viewport_end_y) - 1;
//  - if y_a lies above viewport_start_y, the first `clip` rows are skipped:
//    height and y_a are adjusted and edges / interpolants advanced;
//  - nothing is emitted when height <= 0 (branch to label 1);
//  - y_x4 is built as the four 16-bit row numbers { y, y + 1, y + 2, y + 3 };
//  - height is stored to psx_gpu->num_spans and the loop at label 2 emits
//    four rows per iteration until height is used up (so up to three rows
//    more than height may be written to the span arrays).
898#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
899 setup_spans_alternate_adjust_##alternate_active(); \
900 setup_spans_load_b(); \
901 \
902 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
903 subs y_c, y_c, temp; \
904 subgt height, height, y_c; \
905 addgt height, height, #1; \
906 \
907 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
908 subs clip, temp, y_a; \
909 ble 0f; \
910 \
911 sub height, height, clip; \
912 add y_a, y_a, clip; \
913 setup_spans_clip(increment, alternate_active); \
914 \
915 0: \
916 cmp height, #0; \
917 ble 1f; \
918 \
919 orr temp, y_a, y_a, lsl #16; \
920 add temp, temp, #(1 << 16); \
921 add y_a, temp, #2; \
922 add y_a, y_a, #(2 << 16); \
923 vmov.u32 y_x4, temp, y_a; \
924 \
925 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
926 right_index); \
927 setup_spans_prologue_b(); \
928 \
929 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
930 \
931 2: \
932 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
933 subs height, height, #4; \
934 bhi 2b; \
935 \
936 1: \
937
938
// setup_spans_alternate_pre_increment_yes: advances the scalar 64-bit
// alternate edge position by one slope step (the slope is sign-extended to
// 64 bit via asr #31). The _no variant expands to nothing.
939#define setup_spans_alternate_pre_increment_yes() \
940 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
941 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
942
943#define setup_spans_alternate_pre_increment_no() \
944
945
// setup_spans_up_decrement_yes: reduces height by one when the flags from the
// preceding top-clip comparison in setup_spans_up say "less or equal", i.e.
// when no rows were clipped at the top. The _no variant expands to nothing.
946#define setup_spans_up_decrement_yes() \
947 suble height, height, #1 \
948
949#define setup_spans_up_decrement_no() \
950
951
// setup_spans_up(left_index, right_index, alternate, alternate_active):
// generates the spans of a triangle (part) walked from y_a - 1 upwards.
//   left_index / right_index - left or right; which lane of the edge set is
//                              the left / right edge.
//   alternate                - left, right or none; side replaced by the
//                              alternate edge.
//   alternate_active         - yes or no; whether an alternate edge exists.
//
// Steps:
//  - y_a is decremented so that it names the first row to emit;
//  - if y_c lies above viewport_start_y, height is reduced by the difference;
//    otherwise, with an alternate edge, height is reduced by one
//    (setup_spans_up_decrement_yes relies on the flags of the subs);
//  - if y_a lies below viewport_end_y, the first `clip` rows are skipped:
//    height and y_a are adjusted and edges / interpolants moved;
//  - nothing is emitted when height <= 0 (branch to label 1);
//  - y_x4 is built as the four 16-bit row numbers { y, y - 1, y - 2, y - 3 };
//  - edges (including the alternate edge) and interpolants are stepped once
//    before the first row is emitted;
//  - height is stored to psx_gpu->num_spans and the loop at label 2 emits
//    four rows per iteration until height is used up.
// The viewport limits are loaded zero-extended here (ldrh), whereas
// setup_spans_down sign-extends them.
952#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
953 setup_spans_alternate_adjust_##alternate_active(); \
954 setup_spans_load_b(); \
955 sub y_a, y_a, #1; \
956 \
957 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
958 subs temp, temp, y_c; \
959 subgt height, height, temp; \
960 setup_spans_up_decrement_##alternate_active(); \
961 \
962 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
963 subs clip, y_a, temp; \
964 ble 0f; \
965 \
966 sub height, height, clip; \
967 sub y_a, y_a, clip; \
968 setup_spans_clip(decrement, alternate_active); \
969 \
970 0: \
971 cmp height, #0; \
972 ble 1f; \
973 \
974 orr temp, y_a, y_a, lsl #16; \
975 sub temp, temp, #(1 << 16); \
976 sub y_a, temp, #2; \
977 sub y_a, y_a, #(2 << 16); \
978 vmov.u32 y_x4, temp, y_a; \
979 \
980 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
981 \
982 setup_spans_alternate_pre_increment_##alternate_active(); \
983 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
984 right_index); \
985 setup_spans_adjust_interpolants_up(); \
986 setup_spans_prologue_b(); \
987 \
988 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
989 \
990 2: \
991 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
992 subs height, height, #4; \
993 bhi 2b; \
994 \
995 1: \
996
997
// setup_spans_epilogue: restores the registers saved by setup_spans_prologue
// and returns by loading the saved lr into pc.
998#define setup_spans_epilogue() \
999 ldmia sp!, { r4 - r11, pc } \
1000
1001
// setup_spans_up_up(minor, major): complete body of a span setup routine for
// a triangle walked upwards from vertex a through vertex b to vertex c.
//   minor - left or right; side of the two short edges (a-b, then b-c).
//   major - the opposite side, taken by the long edge a-c.
// Lane 0 of the edge set is the long edge a-c over height_major, lane 1 the
// edge a-b over height_minor_a; the alternate edge b-c over height_minor_b
// takes over on the minor side for rows above y_b.
1002#define setup_spans_up_up(minor, major) \
1003 setup_spans_prologue(); \
1004 sub height_minor_a, y_a, y_b; \
1005 sub height_minor_b, y_b, y_c; \
1006 sub height, y_a, y_c; \
1007 \
1008 vdup.u32 x_starts, x_a; \
1009 vmov.u32 x_ends, x_c, x_b; \
1010 \
1011 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1012 setup_spans_up(major, minor, minor, yes); \
1013 setup_spans_epilogue() \
1014
// void setup_spans_up_left(psx_gpu, v_a, v_b, v_c) - presumably; arguments
// arrive in r0-r3. Upward triangle with the short edges on the left.
1015function(setup_spans_up_left)
1016 setup_spans_up_up(left, right)
1017
// Upward triangle with the short edges on the right.
1018function(setup_spans_up_right)
1019 setup_spans_up_up(right, left)
1020
1021
// setup_spans_down_down(minor, major): complete body of a span setup routine
// for a triangle walked downwards from vertex a through vertex b to vertex c.
//   minor - left or right; side of the two short edges (a-b, then b-c).
//   major - the opposite side, taken by the long edge a-c.
// Lane 0 of the edge set is the long edge a-c over height_major, lane 1 the
// edge a-b over height_minor_a; the alternate edge b-c over height_minor_b
// takes over on the minor side for rows below y_b.
1022#define setup_spans_down_down(minor, major) \
1023 setup_spans_prologue(); \
1024 sub height_minor_a, y_b, y_a; \
1025 sub height_minor_b, y_c, y_b; \
1026 sub height, y_c, y_a; \
1027 \
1028 vdup.u32 x_starts, x_a; \
1029 vmov.u32 x_ends, x_c, x_b; \
1030 \
1031 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1032 setup_spans_down(major, minor, minor, yes); \
1033 setup_spans_epilogue() \
1034
// Downward triangle with the short edges on the left. Arguments arrive in
// r0-r3 (psx_gpu, v_a, v_b, v_c).
1035function(setup_spans_down_left)
1036 setup_spans_down_down(left, right)
1037
// Downward triangle with the short edges on the right.
1038function(setup_spans_down_right)
1039 setup_spans_down_down(right, left)
1040
1041
// setup_spans_up_flat: shared tail of setup_spans_up_a / _b. Expects
// x_starts / x_ends to be set by the caller, uses y_a - y_c as the height of
// both edges, walks upwards without an alternate edge and returns.
1042#define setup_spans_up_flat() \
1043 sub height, y_a, y_c; \
1044 \
1045 compute_edge_delta_x2(); \
1046 setup_spans_up(left, right, none, no); \
1047 setup_spans_epilogue() \
1048
// setup_spans_up_a: two-edge triangle walked upwards whose edges start at
// x_a (left) and x_b (right) and both end at x_c. Presumably used when
// vertices a and b lie on the same row - confirm against the caller.
// Arguments arrive in r0-r3 (psx_gpu, v_a, v_b, v_c).
1049function(setup_spans_up_a)
1050 setup_spans_prologue()
1051
1052 vmov.u32 x_starts, x_a, x_b
1053 vdup.u32 x_ends, x_c
1054
1055 setup_spans_up_flat()
1056
// setup_spans_up_b: two-edge triangle walked upwards whose edges both start
// at x_a and end at x_b (left) and x_c (right). Presumably used when vertices
// b and c lie on the same row - confirm against the caller.
1057function(setup_spans_up_b)
1058 setup_spans_prologue()
1059
1060 vdup.u32 x_starts, x_a
1061 vmov.u32 x_ends, x_b, x_c
1062
1063 setup_spans_up_flat()
1064
// setup_spans_down_flat: shared tail of setup_spans_down_a / _b. Expects
// x_starts / x_ends to be set by the caller, uses y_c - y_a as the height of
// both edges, walks downwards without an alternate edge and returns.
1065#define setup_spans_down_flat() \
1066 sub height, y_c, y_a; \
1067 \
1068 compute_edge_delta_x2(); \
1069 setup_spans_down(left, right, none, no); \
1070 setup_spans_epilogue() \
1071
// setup_spans_down_a: two-edge triangle walked downwards whose edges start at
// x_a (left) and x_b (right) and both end at x_c. Presumably used when
// vertices a and b lie on the same row - confirm against the caller.
// Arguments arrive in r0-r3 (psx_gpu, v_a, v_b, v_c).
1072function(setup_spans_down_a)
1073 setup_spans_prologue()
1074
1075 vmov.u32 x_starts, x_a, x_b
1076 vdup.u32 x_ends, x_c
1077
1078 setup_spans_down_flat()
1079
// setup_spans_down_b: two-edge triangle walked downwards whose edges both
// start at x_a and end at x_b (left) and x_c (right). Presumably used when
// vertices b and c lie on the same row - confirm against the caller.
1080function(setup_spans_down_b)
1081 setup_spans_prologue()
1082
1083 vdup.u32 x_starts, x_a
1084 vmov.u32 x_ends, x_b, x_c
1085
1086 setup_spans_down_flat()
1087
1088
// Additional aliases for setup_spans_up_down, which processes two edge sets:
// set a for the part above vertex a and set b for the part below it.

// Copy of the original y_a; shares r9 with height_major / height.
1089#define middle_y r9
1090
// Parking registers for edge set b while set a is being processed.
// edges_dx_dy_and_shifts_b (q13) overlaps c_0x0001 / c_0xFFFE / c_0x0007 and
// edges_xy_b (q11) overlaps left_right_x_16, so set b has to be moved out of
// them before setup_spans_prologue_b and the span loop run.
// height_increment (d20) is a short-lived temporary.
1091#define edges_xy_b q11
1092#define edges_dx_dy_b d26
1093#define edge_shifts_b d27
1094#define edges_dx_dy_and_shifts_b q13
1095#define height_increment d20
1096
// q1 viewed as the pair { edges_dx_dy (d2), edge_shifts (d3) }.
1097#define edges_dx_dy_and_shifts q1
1098
1099#define edges_xy_b_left d22
1100#define edges_xy_b_right d23
1101
// setup_spans_up_down_load_edge_set_b: makes edge set b the current edge set
// by copying its positions, slopes and shift counts into edges_xy,
// edges_dx_dy and edge_shifts.
1102#define setup_spans_up_down_load_edge_set_b() \
1103 vmov edges_xy, edges_xy_b; \
1104 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1105
1106
/*
 * setup_spans_up_down
 *
 * Span setup for the case that is processed in two passes starting from
 * y_a: first height_minor_a rows with decreasing y (the "up" pass, spans
 * emitted with setup_spans_set_x4_alternate_no(none, up)), then
 * height_minor_b rows with increasing y (the "down" pass).  Both passes
 * are clipped against psx_gpu viewport_start_y / viewport_end_y, and
 * psx_gpu num_spans ends up holding the sum of the rows emitted by both.
 *
 * Local labels:
 *   2: four-spans-per-iteration emit loops (one per pass)
 *   4: start of the down pass
 *   1: common exit (setup_spans_epilogue)
 *   3: taken when the up pass is empty; loads edge set b and joins at 4
 */
1107function(setup_spans_up_down)
1108 setup_spans_prologue()
1109
1110 // s32 middle_y = y_a;
/* Row counts of the two minor edges and of the major edge. */
1111 sub height_minor_a, y_a, y_b
1112 sub height_minor_b, y_c, y_a
1113 sub height_major, y_c, y_b
1114
/* x_starts = { x_a, x_c }; x_ends = { x_b, x_b } */
1115 vmov.u32 x_starts, x_a, x_c
1116 vdup.u32 x_ends, x_b
1117
1118 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1119
/*
 * height_increment = { 0, height_minor_b }, so the widening
 * multiply-accumulate below only moves the second lane of edges_xy, by
 * height_minor_b steps of its dx/dy.
 */
1120 mov temp, #0
1121 vmov.u32 height_increment, temp, height_minor_b
1122 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1123
/*
 * Build edge set "b": left edge from the alternate edge values, right
 * edge from the current right edge; shifts copied with lane 0 replaced by
 * the alternate shift; dx/dy negated with lane 0 replaced by the
 * alternate dx/dy.
 */
1124 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1125 vmov edges_xy_b_right, edges_xy_right
1126
1127 vmov edge_shifts_b, edge_shifts
1128 vmov.u32 edge_shifts_b[0], edge_shift_alt
1129
1130 vneg.s32 edges_dx_dy_b, edges_dx_dy
1131 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1132
/* Keep the unmodified y_a for the down pass (restored at label 4). */
1133 mov middle_y, y_a
1134
1135 setup_spans_load_b()
1136 sub y_a, y_a, #1
1137
/* Top clip: if viewport_start_y > y_b, drop the rows above the viewport. */
1138 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1139 subs temp, temp, y_b
1140 subgt height_minor_a, height_minor_a, temp
1141
/*
 * Bottom clip for the up pass: clip = y_a - viewport_end_y; when positive,
 * skip that many rows and step the edges with
 * setup_spans_clip(decrement, no).
 */
1142 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1143 subs clip, y_a, temp
1144 ble 0f
1145
1146 sub height_minor_a, height_minor_a, clip
1147 sub y_a, y_a, clip
1148 setup_spans_clip(decrement, no)
1149
1150 0:
/* Nothing left for the up pass: go straight to the down pass via 3. */
1151 cmp height_minor_a, #0
1152 ble 3f
1153
/*
 * y_x4 = four 16-bit y values packed in two words:
 * temp = (y_a - 1) << 16 | y_a, second word = temp minus 2 in each half,
 * i.e. y_a, y_a - 1, y_a - 2, y_a - 3 when no half borrows.
 */
1154 orr temp, y_a, y_a, lsl #16
1155 sub temp, temp, #(1 << 16)
1156 sub y_a, temp, #2
1157 sub y_a, y_a, #(2 << 16)
1158 vmov.u32 y_x4, temp, y_a
1159
1160 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1161
/* The up pass contributes height_minor_a spans. */
1162 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1163
1164 setup_spans_adjust_edges_alternate_no(left, right);
1165 setup_spans_adjust_interpolants_up()
1166 setup_spans_up_down_load_edge_set_b()
1167
1168 setup_spans_prologue_b()
1169
1170
1171 2:
/* Emit spans four at a time; loop while more than four rows remained. */
1172 setup_spans_set_x4_alternate_no(none, up)
1173 subs height_minor_a, height_minor_a, #4
1174 bhi 2b
1175
/*
 * height_minor_a is now zero or negative.  Adding it scaled by 8 / 16 / 4
 * moves the three span write pointers back by that many entries
 * (presumably undoing the overshoot of the four-at-a-time loop; the
 * pointer advance itself happens inside the macro above).
 */
1176 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1177 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1178 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1179
1180 4:
/* Down pass: reload uvrg from psx_gpu and restart from the saved middle y. */
1181 add temp, psx_gpu, #psx_gpu_uvrg_offset
1182 vld1.32 { uvrg }, [ temp ]
1183 mov y_a, middle_y
1184
1185 setup_spans_load_b()
1186
/*
 * Bottom clip: if y_c > viewport_end_y, shorten the pass by
 * (y_c - viewport_end_y) - 1 rows.
 */
1187 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1188 subs y_c, y_c, temp
1189 subgt height_minor_b, height_minor_b, y_c
1190 addgt height_minor_b, height_minor_b, #1
1191
/*
 * Top clip for the down pass: clip = viewport_start_y - y_a; when
 * positive, skip that many rows and step the edges with
 * setup_spans_clip(increment, no).
 */
1192 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1193 subs clip, temp, y_a
1194 ble 0f
1195
1196 sub height_minor_b, height_minor_b, clip
1197 add y_a, y_a, clip
1198 setup_spans_clip(increment, no)
1199
1200 0:
1201 cmp height_minor_b, #0
1202 ble 1f
1203
/* y_x4 = y_a, y_a + 1, y_a + 2, y_a + 3 packed as four 16-bit values. */
1204 orr temp, y_a, y_a, lsl #16
1205 add temp, temp, #(1 << 16)
1206 add y_a, temp, #2
1207 add y_a, y_a, #(2 << 16)
1208 vmov.u32 y_x4, temp, y_a
1209
1210 setup_spans_adjust_edges_alternate_no(left, right)
1211
/* num_spans += height_minor_b (read-modify-write of the stored count). */
1212 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1213 add temp, temp, height_minor_b
1214 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1215
1216 2:
1217 setup_spans_set_x4_alternate_no(none, down)
1218 subs height_minor_b, height_minor_b, #4
1219 bhi 2b
1220
1221 1:
1222 setup_spans_epilogue()
1223
1224 3:
/*
 * Empty up pass: still switch to edge set b and run the b prologue, then
 * join the down pass.  Note that num_spans is not stored on this path
 * before label 4 reads it back.
 */
1225 setup_spans_up_down_load_edge_set_b()
1226 setup_spans_prologue_b()
1227 bal 4b
1228
1229
/*
 * Register aliases for the setup_blocks routines that follow.
 *
 * Many names share one physical register, so only one name of each group
 * can be live at a time.  Examples taken from the table below:
 *   r7  : y, c_32
 *   r8  : left_x, dither_shift, b_dx4, right_mask
 *   r9  : b, b_dx8
 *   r10 : dither_offset_ptr, dither_row, block_ptr_b, block_span_ptr
 *   q13 : dx4, dx8, uv_whole_8 (d26 / d27), draw_mask_edge
 *   q12 : b_whole, and its halves double as u_whole_8b / r_whole_8 (d24)
 *         and g_whole_8 (d25)
 * uv_dx8 and rg_dx8 are defined twice with identical values.
 */
1230#undef span_uvrg_offset
1231#undef span_edge_data
1232#undef span_b_offset
1233#undef left_x
1234#undef b
1235
1236#define psx_gpu r0
1237#define num_spans r1
1238#define span_uvrg_offset r2
1239#define span_edge_data r3
1240#define span_b_offset r4
1241#define b_dx r5
1242#define span_num_blocks r6
1243#define y r7
1244#define left_x r8
1245#define b r9
1246#define dither_offset_ptr r10
1247#define block_ptr_a r11
1248#define fb_ptr r12
1249#define num_blocks r14
1250
1251#define uvrg_dx_ptr r2
1252#define texture_mask_ptr r3
1253#define dither_shift r8
1254#define dither_row r10
1255
1256#define c_32 r7
1257#define b_dx4 r8
1258#define b_dx8 r9
1259#define block_ptr_b r10
1260
1261#define block_span_ptr r10
1262#define right_mask r8
1263
1264#define color r2
1265#define color_r r3
1266#define color_g r4
1267#define color_b r5
1268
1269#undef uvrg
1270
1271#define u_block q0
1272#define v_block q1
1273#define r_block q2
1274#define g_block q3
1275#define b_block q4
1276
1277#define uv_dx4 d10
1278#define rg_dx4 d11
1279#define uv_dx8 d12
1280#define rg_dx8 d13
1281#define b_whole_8 d14
1282#define fb_mask_ptrs d15
1283
1284#define uvrg_dx4 q5
1285#define uvrg_dx8 q6
1286#define uv_dx8 d12
1287#define rg_dx8 d13
1288
1289#define u_whole q8
1290#define v_whole q9
1291#define r_whole q10
1292#define g_whole q11
1293#define b_whole q12
1294
1295#define u_whole_low d16
1296#define u_whole_high d17
1297#define v_whole_low d18
1298#define v_whole_high d19
1299#define r_whole_low d20
1300#define r_whole_high d21
1301#define g_whole_low d22
1302#define g_whole_high d23
1303#define b_whole_low d24
1304#define b_whole_high d25
1305
1306#define dx4 q13
1307#define dx8 q13
1308
1309#define u_whole_8 d26
1310#define v_whole_8 d27
1311#define u_whole_8b d24
1312#define r_whole_8 d24
1313#define g_whole_8 d25
1314
1315#define uv_whole_8 q13
1316#define uv_whole_8b q14
1317
1318#define dither_offsets q14
1319#define texture_mask q15
1320#define texture_mask_u d30
1321#define texture_mask_v d31
1322
1323#define dither_offsets_short d28
1324
1325#define v_left_x q8
1326#define uvrg q9
1327#define block_span q10
1328
1329#define uv d18
1330#define rg d19
1331
1332#define draw_mask q1
1333#define draw_mask_edge q13
1334#define test_mask q0
1335
1336#define uvrg_dx q3
1337
1338#define colors q2
1339
/*
 * Texture coordinate swizzle applied to the eight u / v bytes of a block:
 *   u_whole_8b = u & texture_mask_u
 *   u          = (v << 4) | (u & 0x0F)          (vsli keeps the low 4 bits)
 *   v          = (v & 0xF0) | (u_whole_8b >> 4) (vsri keeps the high 4 bits)
 * i.e. the high nibble of the masked u is exchanged with the low nibble
 * of v.
 */
1340#define setup_blocks_texture_swizzled() \
1341 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1342 vsli.u8 u_whole_8, v_whole_8, #4; \
1343 vsri.u8 v_whole_8, u_whole_8b, #4 \
1344
/* Unswizzled variant: expands to nothing, u / v are used as computed. */
1345#define setup_blocks_texture_unswizzled() \
1346
1347
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Generates setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 * In: r0 = psx_gpu.  Returns immediately when psx_gpu num_spans is 0.
 *
 * For every span (entries of 8 bytes in span_edge_data, 16 bytes in
 * span_uvrg_offset, 4 bytes in span_b_offset) the routine:
 *   - skips the span when its block count is 0;
 *   - adds the block count to num_blocks and, when the sum exceeds
 *     MAX_BLOCKS, branches to label 2, which calls
 *     flush_render_block_buffer and restarts at the first block slot with
 *     num_blocks = this span's block count;
 *   - computes fb_ptr = vram_ptr + y * 2048 + left_x * 2;
 *   - evaluates u, v, r, g at left_x (uvrg + uvrg_dx * left_x) and
 *     b + b_dx * left_x, and adds the per-pixel block span tables read
 *     from psx_gpu starting at psx_gpu_u_block_span_offset;
 *   - picks the dither row for (y & 3), rotates it by (left_x & 3) bytes
 *     and widens it with a left shift of 4;
 *   - writes one 64-byte block per 8 pixels:
 *       +0  interleaved u / v bytes (vst2), u / v already ANDed with
 *           texture_mask and passed through setup_blocks_texture_<swizzling>
 *       +16 r bytes, g bytes
 *       +32 b bytes, then fb_mask_ptrs = { right mask word, fb_ptr }
 *       +48 dither offsets
 *     stepping the interpolants by the x8 deltas and fb_ptr by 16 bytes.
 *
 * Label 5 writes the last block of a span: the right mask byte is
 * replicated, tested against test_mask (first 16 bytes of psx_gpu), the
 * u / v halfwords selected by that draw mask are cleared, and the right
 * mask is stored in fb_mask_ptrs[0] (it is 0 for all earlier blocks).
 *
 * num_blocks is written back to psx_gpu after every span.
 *
 * NOTE(review): the body writes q4-q6 and d14 / d15 (d8-d15), which AAPCS
 * treats as callee-saved, while only r4-r11 and lr are saved on entry -
 * confirm that callers do not rely on those registers.
 */
1348#define setup_blocks_shaded_textured_builder(swizzling) \
1349.align 3; \
1350 \
1351function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1352 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1353 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1354 \
1355 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1356 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1357 \
1358 cmp num_spans, #0; \
1359 bxeq lr; \
1360 \
1361 stmdb sp!, { r4 - r11, r14 }; \
1362 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1363 \
1364 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1365 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1366 \
1367 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1368 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1369 \
1370 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1371 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1372 \
1373 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1374 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1375 \
1376 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1377 \
1378 0: \
1379 vmov.u8 fb_mask_ptrs, #0; \
1380 \
1381 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1382 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1383 \
1384 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
1385 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
1386 \
1387 cmp span_num_blocks, #0; \
1388 beq 1f; \
1389 \
1390 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1391 add num_blocks, span_num_blocks, num_blocks; \
1392 \
1393 cmp num_blocks, #MAX_BLOCKS; \
1394 bgt 2f; \
1395 \
1396 3: \
1397 ldr b, [ span_b_offset ]; \
1398 add fb_ptr, fb_ptr, y, lsl #11; \
1399 \
1400 vdup.u32 v_left_x, left_x; \
1401 and y, y, #0x3; \
1402 \
1403 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1404 add fb_ptr, fb_ptr, left_x, lsl #1; \
1405 \
1406 mla b, b_dx, left_x, b; \
1407 and dither_shift, left_x, #0x03; \
1408 \
1409 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1410 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1411 \
1412 mov dither_shift, dither_shift, lsl #3; \
1413 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1414 \
1415 mov c_32, #32; \
1416 subs span_num_blocks, span_num_blocks, #1; \
1417 \
1418 mov dither_row, dither_row, ror dither_shift; \
1419 mov b_dx4, b_dx, lsl #2; \
1420 \
1421 vdup.u32 dither_offsets_short, dither_row; \
1422 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1423 \
1424 vdup.u32 b_block, b; \
1425 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1426 \
1427 vdup.u32 u_block, uv[0]; \
1428 mov b_dx8, b_dx, lsl #3; \
1429 \
1430 vdup.u32 v_block, uv[1]; \
1431 vdup.u32 r_block, rg[0]; \
1432 vdup.u32 g_block, rg[1]; \
1433 \
1434 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1435 \
1436 vadd.u32 u_block, u_block, block_span; \
1437 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1438 \
1439 vadd.u32 v_block, v_block, block_span; \
1440 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1441 \
1442 vadd.u32 r_block, r_block, block_span; \
1443 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1444 \
1445 vadd.u32 g_block, g_block, block_span; \
1446 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1447 \
1448 vadd.u32 b_block, b_block, block_span; \
1449 add block_ptr_b, block_ptr_a, #16; \
1450 \
1451 vshrn.u32 u_whole_low, u_block, #16; \
1452 vshrn.u32 v_whole_low, v_block, #16; \
1453 vshrn.u32 r_whole_low, r_block, #16; \
1454 vshrn.u32 g_whole_low, g_block, #16; \
1455 \
1456 vdup.u32 dx4, uv_dx4[0]; \
1457 vshrn.u32 b_whole_low, b_block, #16; \
1458 \
1459 vaddhn.u32 u_whole_high, u_block, dx4; \
1460 vdup.u32 dx4, uv_dx4[1]; \
1461 \
1462 vaddhn.u32 v_whole_high, v_block, dx4; \
1463 vdup.u32 dx4, rg_dx4[0]; \
1464 \
1465 vaddhn.u32 r_whole_high, r_block, dx4; \
1466 vdup.u32 dx4, rg_dx4[1]; \
1467 \
1468 vaddhn.u32 g_whole_high, g_block, dx4; \
1469 vdup.u32 dx4, b_dx4; \
1470 \
1471 vaddhn.u32 b_whole_high, b_block, dx4; \
1472 vdup.u32 dx8, uv_dx8[0]; \
1473 \
1474 vadd.u32 u_block, u_block, dx8; \
1475 vdup.u32 dx8, uv_dx8[1]; \
1476 \
1477 vadd.u32 v_block, v_block, dx8; \
1478 vdup.u32 dx8, rg_dx8[0]; \
1479 \
1480 vadd.u32 r_block, r_block, dx8; \
1481 vdup.u32 dx8, rg_dx8[1]; \
1482 \
1483 vadd.u32 g_block, g_block, dx8; \
1484 vdup.u32 dx8, b_dx8; \
1485 \
1486 vadd.u32 b_block, b_block, dx8; \
1487 vmovn.u16 u_whole_8, u_whole; \
1488 \
1489 vmovn.u16 v_whole_8, v_whole; \
1490 \
1491 vmovn.u16 b_whole_8, b_whole; \
1492 pld [ fb_ptr ]; \
1493 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1494 \
1495 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1496 setup_blocks_texture_##swizzling(); \
1497 \
1498 vmovn.u16 r_whole_8, r_whole; \
1499 beq 5f; \
1500 \
1501 4: \
1502 vmovn.u16 g_whole_8, g_whole; \
1503 vshrn.u32 u_whole_low, u_block, #16; \
1504 \
1505 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1506 vshrn.u32 v_whole_low, v_block, #16; \
1507 \
1508 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1509 vshrn.u32 r_whole_low, r_block, #16; \
1510 \
1511 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1512 vshrn.u32 g_whole_low, g_block, #16; \
1513 \
1514 vdup.u32 dx4, uv_dx4[0]; \
1515 vshrn.u32 b_whole_low, b_block, #16; \
1516 \
1517 vaddhn.u32 u_whole_high, u_block, dx4; \
1518 vdup.u32 dx4, uv_dx4[1]; \
1519 \
1520 vaddhn.u32 v_whole_high, v_block, dx4; \
1521 vdup.u32 dx4, rg_dx4[0]; \
1522 \
1523 vaddhn.u32 r_whole_high, r_block, dx4; \
1524 vdup.u32 dx4, rg_dx4[1]; \
1525 \
1526 vaddhn.u32 g_whole_high, g_block, dx4; \
1527 vdup.u32 dx4, b_dx4; \
1528 \
1529 vaddhn.u32 b_whole_high, b_block, dx4; \
1530 vdup.u32 dx8, uv_dx8[0]; \
1531 \
1532 vadd.u32 u_block, u_block, dx8; \
1533 vdup.u32 dx8, uv_dx8[1]; \
1534 \
1535 vadd.u32 v_block, v_block, dx8; \
1536 vdup.u32 dx8, rg_dx8[0]; \
1537 \
1538 vadd.u32 r_block, r_block, dx8; \
1539 vdup.u32 dx8, rg_dx8[1]; \
1540 \
1541 vadd.u32 g_block, g_block, dx8; \
1542 vdup.u32 dx8, b_dx8; \
1543 \
1544 vadd.u32 b_block, b_block, dx8; \
1545 vmovn.u16 u_whole_8, u_whole; \
1546 \
1547 add fb_ptr, fb_ptr, #16; \
1548 vmovn.u16 v_whole_8, v_whole; \
1549 \
1550 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1551 vmovn.u16 b_whole_8, b_whole; \
1552 \
1553 pld [ fb_ptr ]; \
1554 \
1555 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1556 subs span_num_blocks, span_num_blocks, #1; \
1557 \
1558 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1559 setup_blocks_texture_##swizzling(); \
1560 \
1561 vmovn.u16 r_whole_8, r_whole; \
1562 bne 4b; \
1563 \
1564 5: \
1565 vmovn.u16 g_whole_8, g_whole; \
1566 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1567 \
1568 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1569 vdup.u8 draw_mask, right_mask; \
1570 \
1571 vmov.u32 fb_mask_ptrs[0], right_mask; \
1572 vtst.u16 draw_mask, draw_mask, test_mask; \
1573 vzip.u8 u_whole_8, v_whole_8; \
1574 \
1575 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1576 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1577 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1578 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1579 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1580 \
1581 1: \
1582 add span_uvrg_offset, span_uvrg_offset, #16; \
1583 add span_b_offset, span_b_offset, #4; \
1584 \
1585 add span_edge_data, span_edge_data, #8; \
1586 subs num_spans, num_spans, #1; \
1587 \
1588 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1589 bne 0b; \
1590 \
1591 ldmia sp!, { r4 - r11, pc }; \
1592 \
1593 2: \
1594 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1595 vpush { texture_mask }; \
1596 vpush { uvrg_dx4 }; \
1597 \
1598 stmdb sp!, { r0 - r3, r12, r14 }; \
1599 bl flush_render_block_buffer; \
1600 ldmia sp!, { r0 - r3, r12, r14 }; \
1601 \
1602 vpop { uvrg_dx4 }; \
1603 vpop { texture_mask }; \
1604 \
1605 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1606 vmov.u8 fb_mask_ptrs, #0; \
1607 \
1608 mov num_blocks, span_num_blocks; \
1609 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1610 bal 3b \
1611
1612
/* Instantiate the swizzled and unswizzled shaded + textured routines. */
1613setup_blocks_shaded_textured_builder(swizzled)
1614setup_blocks_shaded_textured_builder(unswizzled)
1615
1616
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Generates setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 * In: r0 = psx_gpu.  Returns immediately when psx_gpu num_spans is 0.
 *
 * Same per-span structure as the shaded + textured builder, but only u
 * and v are interpolated (uvrg + uvrg_dx * left_x plus the two block span
 * tables at psx_gpu_u_block_span_offset); no r / g / b values are
 * computed.  Per 64-byte block it writes:
 *   +0  interleaved u / v bytes after texture_mask and
 *       setup_blocks_texture_<swizzling>
 *   +32 b_whole_8, then fb_mask_ptrs = { right mask word, fb_ptr }
 *   +48 dither offsets
 * Bytes +16..+31 are skipped (block_ptr_b is advanced by 32 without a
 * store).
 *
 * Label 5 handles the last block of a span: u / v halfwords selected by
 * (right_mask tested against test_mask) are cleared and right_mask is
 * stored in fb_mask_ptrs[0].  Label 2 flushes the block buffer through
 * flush_render_block_buffer when num_blocks would exceed MAX_BLOCKS.
 *
 * NOTE(review): b_whole_8 (d14) is stored into every block but is never
 * written by this routine - presumably the consumer ignores it; confirm.
 * NOTE(review): q5 / q6 and d15 (within d8-d15, callee-saved under AAPCS)
 * are written while only r4-r11 and lr are saved - confirm callers
 * tolerate this.
 */
1617#define setup_blocks_unshaded_textured_builder(swizzling) \
1618.align 3; \
1619 \
1620function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1621 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1622 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1623 \
1624 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1625 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1626 \
1627 cmp num_spans, #0; \
1628 bxeq lr; \
1629 \
1630 stmdb sp!, { r4 - r11, r14 }; \
1631 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1632 \
1633 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1634 \
1635 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1636 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1637 \
1638 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1639 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1640 \
1641 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1642 \
1643 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1644 \
1645 0: \
1646 vmov.u8 fb_mask_ptrs, #0; \
1647 \
1648 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1649 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1650 \
1651 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
1652 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
1653 \
1654 cmp span_num_blocks, #0; \
1655 beq 1f; \
1656 \
1657 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1658 add num_blocks, span_num_blocks, num_blocks; \
1659 \
1660 cmp num_blocks, #MAX_BLOCKS; \
1661 bgt 2f; \
1662 \
1663 3: \
1664 add fb_ptr, fb_ptr, y, lsl #11; \
1665 \
1666 vdup.u32 v_left_x, left_x; \
1667 and y, y, #0x3; \
1668 \
1669 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1670 add fb_ptr, fb_ptr, left_x, lsl #1; \
1671 \
1672 and dither_shift, left_x, #0x03; \
1673 \
1674 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1675 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1676 \
1677 mov dither_shift, dither_shift, lsl #3; \
1678 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1679 \
1680 mov c_32, #32; \
1681 subs span_num_blocks, span_num_blocks, #1; \
1682 \
1683 mov dither_row, dither_row, ror dither_shift; \
1684 \
1685 vdup.u32 dither_offsets_short, dither_row; \
1686 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1687 \
1688 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1689 \
1690 vdup.u32 u_block, uv[0]; \
1691 \
1692 vdup.u32 v_block, uv[1]; \
1693 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1694 \
1695 vadd.u32 u_block, u_block, block_span; \
1696 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1697 \
1698 vadd.u32 v_block, v_block, block_span; \
1699 add block_ptr_b, block_ptr_a, #16; \
1700 \
1701 vshrn.u32 u_whole_low, u_block, #16; \
1702 vshrn.u32 v_whole_low, v_block, #16; \
1703 \
1704 vdup.u32 dx4, uv_dx4[0]; \
1705 \
1706 vaddhn.u32 u_whole_high, u_block, dx4; \
1707 vdup.u32 dx4, uv_dx4[1]; \
1708 \
1709 vaddhn.u32 v_whole_high, v_block, dx4; \
1710 vdup.u32 dx8, uv_dx8[0]; \
1711 \
1712 vadd.u32 u_block, u_block, dx8; \
1713 vdup.u32 dx8, uv_dx8[1]; \
1714 \
1715 vadd.u32 v_block, v_block, dx8; \
1716 vmovn.u16 u_whole_8, u_whole; \
1717 \
1718 vmovn.u16 v_whole_8, v_whole; \
1719 \
1720 pld [ fb_ptr ]; \
1721 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1722 \
1723 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1724 setup_blocks_texture_##swizzling(); \
1725 \
1726 beq 5f; \
1727 \
1728 4: \
1729 vshrn.u32 u_whole_low, u_block, #16; \
1730 \
1731 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1732 vshrn.u32 v_whole_low, v_block, #16; \
1733 \
1734 add block_ptr_b, block_ptr_b, #32; \
1735 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1736 \
1737 vdup.u32 dx4, uv_dx4[0]; \
1738 vaddhn.u32 u_whole_high, u_block, dx4; \
1739 vdup.u32 dx4, uv_dx4[1]; \
1740 \
1741 vaddhn.u32 v_whole_high, v_block, dx4; \
1742 vdup.u32 dx8, uv_dx8[0]; \
1743 \
1744 vadd.u32 u_block, u_block, dx8; \
1745 vdup.u32 dx8, uv_dx8[1]; \
1746 \
1747 vadd.u32 v_block, v_block, dx8; \
1748 vmovn.u16 u_whole_8, u_whole; \
1749 \
1750 add fb_ptr, fb_ptr, #16; \
1751 vmovn.u16 v_whole_8, v_whole; \
1752 \
1753 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1754 pld [ fb_ptr ]; \
1755 \
1756 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1757 subs span_num_blocks, span_num_blocks, #1; \
1758 \
1759 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1760 setup_blocks_texture_##swizzling(); \
1761 \
1762 bne 4b; \
1763 \
1764 5: \
1765 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1766 \
1767 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1768 vdup.u8 draw_mask, right_mask; \
1769 \
1770 vmov.u32 fb_mask_ptrs[0], right_mask; \
1771 vtst.u16 draw_mask, draw_mask, test_mask; \
1772 vzip.u8 u_whole_8, v_whole_8; \
1773 \
1774 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1775 add block_ptr_b, block_ptr_b, #32; \
1776 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1777 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1778 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1779 \
1780 1: \
1781 add span_uvrg_offset, span_uvrg_offset, #16; \
1782 add span_edge_data, span_edge_data, #8; \
1783 subs num_spans, num_spans, #1; \
1784 \
1785 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1786 bne 0b; \
1787 \
1788 ldmia sp!, { r4 - r11, pc }; \
1789 \
1790 2: \
1791 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1792 vpush { texture_mask }; \
1793 vpush { uvrg_dx4 }; \
1794 \
1795 stmdb sp!, { r0 - r3, r12, r14 }; \
1796 bl flush_render_block_buffer; \
1797 ldmia sp!, { r0 - r3, r12, r14 }; \
1798 \
1799 vpop { uvrg_dx4 }; \
1800 vpop { texture_mask }; \
1801 \
1802 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1803 vmov.u8 fb_mask_ptrs, #0; \
1804 \
1805 mov num_blocks, span_num_blocks; \
1806 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1807 bal 3b \
1808
1809
/* Instantiate the swizzled and unswizzled unshaded + textured routines. */
1810setup_blocks_unshaded_textured_builder(swizzled)
1811setup_blocks_unshaded_textured_builder(unswizzled)
1812
1813
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * In: r0 = psx_gpu.  Returns immediately when psx_gpu num_spans is 0.
 *
 * Queues one 64-byte render block per 8 pixels of every span, all with
 * the same flat color.  The color is built from psx_gpu triangle_color by
 * taking bits 3-7, 11-15 and 19-23 and packing them as
 * r | (g << 5) | (b << 10), replicated into eight 16-bit lanes.
 *
 * Block contents written here:
 *   +0  draw mask (all zero for full blocks; for the last block of a span
 *       the replicated right_mask tested against test_mask)
 *   +16 the eight color halfwords
 *   +32 b_whole_8, then fb_mask_ptrs whose second word is fb_ptr
 * Bytes +48..+63 are not written.
 *
 * Label 2 flushes through flush_render_block_buffer when the block count
 * would exceed MAX_BLOCKS, then reloads test_mask and re-zeroes draw_mask
 * because q0 / q1 are not preserved across the call.
 *
 * NOTE(review): b_whole_8 (d14) and the first word of fb_mask_ptrs (d15)
 * are stored without being initialised in this routine - presumably
 * unused by the consumer of these blocks; confirm.
 */
1814.align 3
1815
1816function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1817 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1818 veor.u32 draw_mask, draw_mask, draw_mask
1819
1820 cmp num_spans, #0
1821 bxeq lr
1822
1823 stmdb sp!, { r4 - r11, r14 }
1824 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1825
1826 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1827
/* Take the top five bits of each 8-bit channel. */
1828 ubfx color_r, color, #3, #5
1829 ubfx color_g, color, #11, #5
1830 ubfx color_b, color, #19, #5
1831
/* color = r | (g << 5) | (b << 10) */
1832 orr color, color_r, color_b, lsl #10
1833 orr color, color, color_g, lsl #5
1834
1835 vdup.u16 colors, color
1836
1837 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1838 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1839
/* First free block slot: blocks base + num_blocks * 64. */
1840 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1841 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1842
1843 0:
1844 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1845 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1846
1847 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1848
1849 cmp span_num_blocks, #0
1850 beq 1f
1851
1852 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1853 add num_blocks, span_num_blocks, num_blocks
1854
1855 cmp num_blocks, #MAX_BLOCKS
1856 bgt 2f
1857
1858 3:
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2 */
1859 add fb_ptr, fb_ptr, y, lsl #11
1860 and y, y, #0x3
1861
1862 add fb_ptr, fb_ptr, left_x, lsl #1
1863 mov c_32, #32
1864
/* Flags from this subs select between the full-block loop and label 5. */
1865 subs span_num_blocks, span_num_blocks, #1
1866
1867 add block_ptr_b, block_ptr_a, #16
1868 pld [ fb_ptr ]
1869
1870 vmov.u32 fb_mask_ptrs[1], fb_ptr
1871 beq 5f
1872
1873 4:
/* Full block: zero draw mask, colors, framebuffer pointer. */
1874 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1875 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1876 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1877
1878 add fb_ptr, fb_ptr, #16
1879 add block_ptr_b, block_ptr_b, #32
1880
1881 pld [ fb_ptr ]
1882
1883 vmov.u32 fb_mask_ptrs[1], fb_ptr
1884 subs span_num_blocks, span_num_blocks, #1
1885
1886 bne 4b
1887
1888 5:
/* Last block of the span: draw mask derived from the span's right mask. */
1889 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1890
1891 vdup.u8 draw_mask_edge, right_mask
1892 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1893
1894 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1895 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1896 add block_ptr_b, block_ptr_b, #32
1897 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1898
1899 1:
1900 add span_edge_data, span_edge_data, #8
1901 subs num_spans, num_spans, #1
1902
/* strh leaves the flags from the subs above intact for the bne. */
1903 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1904 bne 0b
1905
1906 ldmia sp!, { r4 - r11, pc }
1907
1908 2:
/* Block buffer full: flush it, then restart at the first slot. */
1909 vpush { colors }
1910
1911 stmdb sp!, { r0 - r3, r12, r14 }
1912 bl flush_render_block_buffer
1913 ldmia sp!, { r0 - r3, r12, r14 }
1914
1915 vpop { colors }
1916
1917 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1918 veor.u32 draw_mask, draw_mask, draw_mask
1919
1920 mov num_blocks, span_num_blocks
1921 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1922 bal 3b
1923
1924
/*
 * Extra aliases for the direct (straight-to-VRAM) routine.  Note that
 * mask_msb_scalar is r14, the same register as num_blocks, and msb_mask
 * (q15, halves d30 / d31) overlaps texture_mask.
 */
1925#define mask_msb_scalar r14
1926
1927#define msb_mask q15
1928
1929#define pixels_low d16
1930
1931#define msb_mask_low d30
1932#define msb_mask_high d31
1933
1934
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * In: r0 = psx_gpu.  Returns immediately when psx_gpu num_spans is 0.
 *
 * Fills every span with one flat color by writing straight to VRAM; no
 * render blocks are queued.  The pixel value is built from psx_gpu
 * triangle_color (bits 3-7, 11-15, 19-23 packed as
 * r | (g << 5) | (b << 10)) ORed with psx_gpu mask_msb.
 *
 * Per span: fb_ptr = vram_ptr + y * 2048 + left_x * 2.  All blocks except
 * the last are written as 16-byte stores of eight pixels.  The last block
 * is written one halfword at a time: right_mask is inverted within its
 * low byte and one pixel is stored per iteration while shifting the mask
 * right until it reaches zero (at least one pixel is always stored).
 *
 * This routine must not write psx_gpu num_blocks: it adds no blocks, and
 * r14 (the register behind the num_blocks alias) holds mask_msb_scalar
 * here, not a block count.  Storing it would overwrite the pending block
 * count with the mask MSB value.
 */
1935.align 3
1936
1937function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1938 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1939
1940 cmp num_spans, #0
1941 bxeq lr
1942
1943 stmdb sp!, { r4 - r11, r14 }
1944
1945 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1946
/* Take the top five bits of each 8-bit channel. */
1947 ubfx color_r, color, #3, #5
1948 ubfx color_g, color, #11, #5
1949
/* r14 is free to reuse: lr was saved by the stmdb above. */
1950 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1951 ubfx color_b, color, #19, #5
1952
/* color = r | (g << 5) | (b << 10) | mask_msb */
1953 orr color, color_r, color_b, lsl #10
1954 orr color, color, color_g, lsl #5
1955 orr color, color, mask_msb_scalar
1956
1957 vdup.u16 colors, color
1958
1959 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1960
1961 0:
1962 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1963 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1964
1965 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1966
1967 cmp span_num_blocks, #0
1968 beq 1f
1969
1970 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1971
/* fb_ptr = vram_ptr + y * 2048 + left_x * 2 */
1972 add fb_ptr, fb_ptr, y, lsl #11
1973 subs span_num_blocks, span_num_blocks, #1
1974
1975 add fb_ptr, fb_ptr, left_x, lsl #1
1976 beq 3f
1977
1978 2:
/* Full blocks: eight pixels (16 bytes) per store, pointer post-incremented. */
1979 vst1.u32 { colors }, [ fb_ptr ]!
1980 subs span_num_blocks, span_num_blocks, #1
1981
1982 bne 2b
1983
1984 3:
/* Last block: invert the right mask byte so set bits mark pixels to draw. */
1985 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1986 eor right_mask, right_mask, #0xFF
1987
1988 4:
/* One pixel per iteration until the shifted mask becomes zero. */
1989 strh color, [ fb_ptr ], #2
1990 movs right_mask, right_mask, lsr #1
1991 bne 4b
1992
1993 1:
1994 add span_edge_data, span_edge_data, #8
1995 subs num_spans, num_spans, #1
1996
1998 bne 0b
1999
2000 ldmia sp!, { r4 - r11, pc }
2001
2002
2003
/*
 * Re-maps the register aliases for the shaded untextured routines.
 *
 * The r / g / b block accumulators move to q0-q2, their 16-bit "whole"
 * values to q3-q5, and the 8-bit results to d12 / d13 (g, b; together
 * gb_whole_8 = q6) and d14 (r).  c_64 (r7) is the block stride register
 * and rg_dx_ptr is r2.  Several names again share a register, for
 * example q10 is dx4, dx8 and test_mask; q4 is g_whole and uvrg; q5 is
 * b_whole and block_span; d6 is r_whole_low and v_left_x; d0 is rg_dx
 * and the low half of r_block; q12 is d128_4 with d64_4 as its low half.
 */
2004#undef c_64
2005
2006#define c_64 r7
2007#define rg_dx_ptr r2
2008
2009
2010#undef r_block
2011#undef g_block
2012#undef b_block
2013#undef r_whole
2014#undef g_whole
2015#undef b_whole
2016#undef r_whole_low
2017#undef r_whole_high
2018#undef g_whole_low
2019#undef g_whole_high
2020#undef b_whole_low
2021#undef b_whole_high
2022#undef r_whole_8
2023#undef g_whole_8
2024#undef b_whole_8
2025#undef dither_offsets
2026#undef rg_dx4
2027#undef rg_dx8
2028#undef dx4
2029#undef dx8
2030#undef v_left_x
2031#undef uvrg
2032#undef block_span
2033#undef rg
2034#undef draw_mask
2035#undef test_mask
2036
2037#define r_block q0
2038#define g_block q1
2039#define b_block q2
2040
2041#define r_whole q3
2042#define g_whole q4
2043#define b_whole q5
2044
2045#define r_whole_low d6
2046#define r_whole_high d7
2047#define g_whole_low d8
2048#define g_whole_high d9
2049#define b_whole_low d10
2050#define b_whole_high d11
2051
2052#define gb_whole_8 q6
2053
2054#define g_whole_8 d12
2055#define b_whole_8 d13
2056
2057#define r_whole_8 d14
2058
2059#define pixels q8
2060
2061#define rg_dx4 d18
2062#define rg_dx8 d19
2063
2064#define dx4 q10
2065#define dx8 q10
2066
2067#define v_left_x d6
2068#define uvrg q4
2069#define block_span q5
2070
2071#define rg d9
2072
2073#define d64_1 d22
2074#define d64_128 d23
2075
2076#define d128_4 q12
2077#define d128_0x7 q13
2078
2079#define d64_4 d24
2080
2081#define dither_offsets q14
2082#define draw_mask q15
2083
2084#define dither_offsets_low d28
2085
2086#define rg_dx d0
2087#define test_mask q10
2088
2089
/*
 * Dither step "a" (dithered build): saturating unsigned add of the dither
 * offsets to the 8-bit r values (low half of dither_offsets) and to the
 * 8-bit g / b values (full dither_offsets).
 */
2090#define setup_blocks_shaded_untextured_dither_a_dithered() \
2091 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2092 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2093
/*
 * Dither step "b" (dithered build): saturating unsigned subtract of the
 * constant 4 (d64_4 / d128_4) from r and from g / b.
 */
2094#define setup_blocks_shaded_untextured_dither_b_dithered() \
2095 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2096 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2097
/* Undithered build: both dither steps expand to nothing. */
2098#define setup_blocks_shaded_untextured_dither_a_undithered() \
2099
2100#define setup_blocks_shaded_untextured_dither_b_undithered() \
2101
2102
/*
 * setup_blocks_shaded_untextured_indirect_builder(dithering)
 *
 * Generates setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect.
 * In: r0 = psx_gpu.  Returns immediately when psx_gpu num_spans is 0.
 *
 * For every span with a non-zero block count the routine interpolates
 * r, g (from the rg half of the span's uvrg entry plus rg_dx * left_x)
 * and b (span b + b_dx * left_x), adds the three block span tables read
 * from psx_gpu_r_block_span_offset onwards, and produces one 64-byte
 * block per 8 pixels:
 *   +0  draw mask (zero for full blocks; for the last block the
 *       replicated right_mask tested against test_mask)
 *   +16 eight 16-bit pixels
 *   +44 fb_ptr (stored with str before the vector stores)
 * block_ptr_a / block_ptr_b both advance by c_64 = 64 per block.
 *
 * Pixel packing: after the optional dither steps, r is shifted right by
 * 3 and the low three bits of g and b are cleared; then
 * pixels = r * 1 + g * 4 + b * 128 (widening multiply-accumulate), which
 * places g's top five bits at bit 5 and b's at bit 10.
 *
 * Dither setup: the row for (y & 3) is rotated by (left_x & 3) bytes,
 * replicated, and has 4 added to every byte; the dithered "a" / "b" steps
 * then add that value and subtract 4 with unsigned saturation.
 *
 * Label 2 flushes through flush_render_block_buffer when num_blocks would
 * exceed MAX_BLOCKS, preserving rg_dx4 on the stack and re-creating the
 * byte constants and rg_dx8 afterwards.
 *
 * NOTE(review): q4-q6 and d14 (within d8-d15, callee-saved under AAPCS)
 * are written while only r4-r11 and lr are saved - confirm callers
 * tolerate this.
 */
2103#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2104.align 3; \
2105 \
2106function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2107 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2108 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2109 \
2110 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2111 \
2112 cmp num_spans, #0; \
2113 bxeq lr; \
2114 \
2115 stmdb sp!, { r4 - r11, r14 }; \
2116 vshl.u32 rg_dx4, rg_dx, #2; \
2117 \
2118 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2119 vshl.u32 rg_dx8, rg_dx, #3; \
2120 \
2121 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2122 \
2123 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2124 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2125 \
2126 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2127 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2128 \
2129 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2130 vmov.u8 d64_1, #1; \
2131 \
2132 vmov.u8 d128_4, #4; \
2133 vmov.u8 d64_128, #128; \
2134 \
2135 vmov.u8 d128_0x7, #0x7; \
2136 \
2137 0: \
2138 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2139 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2140 \
2141 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
2142 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
2143 \
2144 cmp span_num_blocks, #0; \
2145 beq 1f; \
2146 \
2147 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2148 add num_blocks, span_num_blocks, num_blocks; \
2149 \
2150 cmp num_blocks, #MAX_BLOCKS; \
2151 bgt 2f; \
2152 \
2153 3: \
2154 ldr b, [ span_b_offset ]; \
2155 add fb_ptr, fb_ptr, y, lsl #11; \
2156 \
2157 vdup.u32 v_left_x, left_x; \
2158 and y, y, #0x3; \
2159 \
2160 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2161 add fb_ptr, fb_ptr, left_x, lsl #1; \
2162 \
2163 mla b, b_dx, left_x, b; \
2164 and dither_shift, left_x, #0x03; \
2165 \
2166 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2167 vshr.u32 rg_dx, rg_dx4, #2; \
2168 \
2169 mov dither_shift, dither_shift, lsl #3; \
2170 vmla.u32 rg, rg_dx, v_left_x; \
2171 \
2172 mov c_64, #64; \
2173 subs span_num_blocks, span_num_blocks, #1; \
2174 \
2175 mov dither_row, dither_row, ror dither_shift; \
2176 mov b_dx4, b_dx, lsl #2; \
2177 \
2178 vdup.u32 dither_offsets, dither_row; \
2179 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2180 \
2181 vdup.u32 b_block, b; \
2182 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2183 \
2184 mov b_dx8, b_dx, lsl #3; \
2185 vdup.u32 r_block, rg[0]; \
2186 vdup.u32 g_block, rg[1]; \
2187 \
2188 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2189 \
2190 vadd.u32 r_block, r_block, block_span; \
2191 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2192 \
2193 vadd.u32 g_block, g_block, block_span; \
2194 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2195 \
2196 vadd.u32 b_block, b_block, block_span; \
2197 add block_ptr_b, block_ptr_a, #16; \
2198 \
2199 vshrn.u32 r_whole_low, r_block, #16; \
2200 vshrn.u32 g_whole_low, g_block, #16; \
2201 vshrn.u32 b_whole_low, b_block, #16; \
2202 vdup.u32 dx4, rg_dx4[0]; \
2203 \
2204 vaddhn.u32 r_whole_high, r_block, dx4; \
2205 vdup.u32 dx4, rg_dx4[1]; \
2206 \
2207 vaddhn.u32 g_whole_high, g_block, dx4; \
2208 vdup.u32 dx4, b_dx4; \
2209 \
2210 vaddhn.u32 b_whole_high, b_block, dx4; \
2211 vdup.u32 dx8, rg_dx8[0]; \
2212 \
2213 vadd.u32 r_block, r_block, dx8; \
2214 vdup.u32 dx8, rg_dx8[1]; \
2215 \
2216 vadd.u32 g_block, g_block, dx8; \
2217 vdup.u32 dx8, b_dx8; \
2218 \
2219 vadd.u32 b_block, b_block, dx8; \
2220 \
2221 vmovn.u16 r_whole_8, r_whole; \
2222 vmovn.u16 g_whole_8, g_whole; \
2223 vmovn.u16 b_whole_8, b_whole; \
2224 \
2225 beq 5f; \
2226 veor.u32 draw_mask, draw_mask, draw_mask; \
2227 \
2228 4: \
2229 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2230 vshrn.u32 r_whole_low, r_block, #16; \
2231 \
2232 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2233 vshrn.u32 g_whole_low, g_block, #16; \
2234 \
2235 vshrn.u32 b_whole_low, b_block, #16; \
2236 str fb_ptr, [ block_ptr_a, #44 ]; \
2237 \
2238 vdup.u32 dx4, rg_dx4[0]; \
2239 vshr.u8 r_whole_8, r_whole_8, #3; \
2240 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2241 \
2242 vaddhn.u32 r_whole_high, r_block, dx4; \
2243 vdup.u32 dx4, rg_dx4[1]; \
2244 \
2245 vaddhn.u32 g_whole_high, g_block, dx4; \
2246 vdup.u32 dx4, b_dx4; \
2247 \
2248 vaddhn.u32 b_whole_high, b_block, dx4; \
2249 vdup.u32 dx8, rg_dx8[0]; \
2250 \
2251 vmull.u8 pixels, r_whole_8, d64_1; \
2252 vmlal.u8 pixels, g_whole_8, d64_4; \
2253 vmlal.u8 pixels, b_whole_8, d64_128; \
2254 \
2255 vadd.u32 r_block, r_block, dx8; \
2256 vdup.u32 dx8, rg_dx8[1]; \
2257 \
2258 vadd.u32 g_block, g_block, dx8; \
2259 vdup.u32 dx8, b_dx8; \
2260 \
2261 vadd.u32 b_block, b_block, dx8; \
2262 add fb_ptr, fb_ptr, #16; \
2263 \
2264 vmovn.u16 r_whole_8, r_whole; \
2265 vmovn.u16 g_whole_8, g_whole; \
2266 vmovn.u16 b_whole_8, b_whole; \
2267 \
2268 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2269 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2270 \
2271 pld [ fb_ptr ]; \
2272 \
2273 subs span_num_blocks, span_num_blocks, #1; \
2274 bne 4b; \
2275 \
2276 5: \
2277 str fb_ptr, [ block_ptr_a, #44 ]; \
2278 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2279 \
2280 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2281 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2282 \
2283 vshr.u8 r_whole_8, r_whole_8, #3; \
2284 vdup.u8 draw_mask, right_mask; \
2285 \
2286 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2287 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2288 \
2289 vtst.u16 draw_mask, draw_mask, test_mask; \
2290 \
2291 vmull.u8 pixels, r_whole_8, d64_1; \
2292 vmlal.u8 pixels, g_whole_8, d64_4; \
2293 vmlal.u8 pixels, b_whole_8, d64_128; \
2294 \
2295 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2296 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2297 \
2298 1: \
2299 add span_uvrg_offset, span_uvrg_offset, #16; \
2300 add span_b_offset, span_b_offset, #4; \
2301 \
2302 add span_edge_data, span_edge_data, #8; \
2303 subs num_spans, num_spans, #1; \
2304 \
2305 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2306 bne 0b; \
2307 \
2308 ldmia sp!, { r4 - r11, pc }; \
2309 \
2310 2: \
2311 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2312 vpush { rg_dx4 }; \
2313 \
2314 stmdb sp!, { r0 - r3, r12, r14 }; \
2315 bl flush_render_block_buffer; \
2316 ldmia sp!, { r0 - r3, r12, r14 }; \
2317 \
2318 vpop { rg_dx4 }; \
2319 \
2320 vmov.u8 d64_1, #1; \
2321 vmov.u8 d128_4, #4; \
2322 vmov.u8 d64_128, #128; \
2323 vmov.u8 d128_0x7, #0x7; \
2324 \
2325 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2326 \
2327 mov num_blocks, span_num_blocks; \
2328 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2329 bal 3b \
2330
2331
/* Instantiate the undithered and dithered shaded untextured routines. */
2332setup_blocks_shaded_untextured_indirect_builder(undithered)
2333setup_blocks_shaded_untextured_indirect_builder(dithered)
2334
2335
2336#undef draw_mask
2337
2338#define mask_msb_ptr r14
2339
2340#define draw_mask q0
2341#define pixels_low d16
2342
2343
2344
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 * <dithering> selects which setup_blocks_shaded_untextured_dither_a_* and
 * _dither_b_* helper macros (defined elsewhere in this file) are expanded.
 *
 * In:  psx_gpu register = base of the GPU state structure.
 *
 * For every span the routine:
 *   - computes fb_ptr = vram_ptr + (y << 11) + (left_x << 1),
 *   - picks the dither word for row (y & 3) and rotates it right by
 *     (left_x & 3) * 8 bits,
 *   - advances the span start values by left_x steps (rg += rg_dx * left_x,
 *     b += b_dx * left_x) and adds the per-pixel block_span offsets,
 *   - packs r, g, b into 16-bit pixels as
 *       msb_mask + (r >> 3) + (g & ~7) * 4 + (b & ~7) * 128
 *     (d64_4 is presumably the low half of d128_4 - confirm at its #define),
 *   - stores whole groups with vst1 and the last group one halfword at a
 *     time under control of the inverted right_mask byte.
 *
 * r4 - r11 and lr are saved and restored; lr doubles as mask_msb_ptr.
 * An empty span list returns before anything is pushed.
 */
#define setup_blocks_shaded_untextured_direct_builder(dithering) \
.align 3; \
 \
function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
  /* Fetch the span count and rg_dx, then leave early if no spans exist. */ \
  ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
  add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
  vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
  cmp num_spans, #0; \
  bxeq lr; \
 \
  /* One-time setup: 4x and 8x deltas, span array pointers, constants. */ \
  push { r4 - r11, lr }; \
  vshl.u32 rg_dx4, rg_dx, #2; \
  ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
  vshl.u32 rg_dx8, rg_dx, #3; \
  add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
  add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
  vmov.u8 d64_1, #1; \
  vmov.u8 d128_4, #4; \
  vmov.u8 d64_128, #128; \
  vmov.u8 d128_0x7, #0x7; \
  /* Replicate the 16-bit mask MSB value into every lane of msb_mask. */ \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
0: \
  /* Per-span header: block count, row and framebuffer base. */ \
  ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
  add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
  ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
  cmp span_num_blocks, #0; \
  beq 1f; \
 \
  /* Destination address, dither row and start values for this span. */ \
  ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
  add fb_ptr, fb_ptr, y, lsl #11; \
  ldr b, [ span_b_offset ]; \
  vdup.u32 v_left_x, left_x; \
  and y, y, #0x3; \
  ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
  add fb_ptr, fb_ptr, left_x, lsl #1; \
  mla b, b_dx, left_x, b; \
  and dither_shift, left_x, #0x03; \
  vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
  vshr.u32 rg_dx, rg_dx4, #2; \
  mov dither_shift, dither_shift, lsl #3; \
  vmla.u32 rg, rg_dx, v_left_x; \
  /* Flags from this subs are consumed by the beq 3f further down. */ \
  subs span_num_blocks, span_num_blocks, #1; \
  mov dither_row, dither_row, ror dither_shift; \
  mov b_dx4, b_dx, lsl #2; \
  vdup.u32 dither_offsets, dither_row; \
  add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
  vdup.u32 b_block, b; \
  vadd.u8 dither_offsets, dither_offsets, d128_4; \
  mov b_dx8, b_dx, lsl #3; \
  vdup.u32 r_block, rg[0]; \
  vdup.u32 g_block, rg[1]; \
 \
  /* Add the r, g and b block_span vectors (three consecutive loads). */ \
  vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
  vadd.u32 r_block, r_block, block_span; \
  vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
  vadd.u32 g_block, g_block, block_span; \
  vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
  vadd.u32 b_block, b_block, block_span; \
  /* NOTE(review): block_ptr_b is not read again in this direct variant. */ \
  add block_ptr_b, block_ptr_a, #16; \
 \
  /* First group: high halves at +0 and (via vaddhn) at +dx4. */ \
  vshrn.u32 r_whole_low, r_block, #16; \
  vshrn.u32 g_whole_low, g_block, #16; \
  vshrn.u32 b_whole_low, b_block, #16; \
  vdup.u32 dx4, rg_dx4[0]; \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  vadd.u32 b_block, b_block, dx8; \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  /* Only one group in this span: go straight to the masked tail. */ \
  beq 3f; \
 \
2: \
  /* Full group: finish the pending one while preparing the next. */ \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  vshrn.u32 r_whole_low, r_block, #16; \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  vshrn.u32 g_whole_low, g_block, #16; \
  vshrn.u32 b_whole_low, b_block, #16; \
  vdup.u32 dx4, rg_dx4[0]; \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  /* Start from msb_mask so the accumulates below add it in. */ \
  vmov pixels, msb_mask; \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  vmlal.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  vadd.u32 b_block, b_block, dx8; \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  vst1.u32 { pixels }, [ fb_ptr ]!; \
  subs span_num_blocks, span_num_blocks, #1; \
  bne 2b; \
 \
3: \
  /* Last group of the span: pack it, then store pixel by pixel. */ \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  vmov pixels, msb_mask; \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  /* Invert the low 8 bits; the loop below runs until no set bit is left. */ \
  eor right_mask, right_mask, #0xFF; \
  vmlal.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
 \
4: \
  /* Store lane 0, rotate the next pixel into lane 0, consume one bit. */ \
  /* The test is at the bottom, so at least one pixel is always stored. */ \
  vst1.u16 { pixels_low[0] }, [ fb_ptr ]!; \
  vext.16 pixels, pixels, #1; \
  movs right_mask, right_mask, lsr #1; \
  bne 4b; \
 \
1: \
  /* Step to the next span: 16 bytes of uvrg, 4 of b, 8 of edge data. */ \
  add span_uvrg_offset, span_uvrg_offset, #16; \
  add span_b_offset, span_b_offset, #4; \
  add span_edge_data, span_edge_data, #8; \
  subs num_spans, num_spans, #1; \
  bne 0b; \
 \
  pop { r4 - r11, pc }

/* Instantiate the undithered and dithered direct variants. */
setup_blocks_shaded_untextured_direct_builder(undithered)
setup_blocks_shaded_untextured_direct_builder(dithered)
2536
2537
2538#undef psx_gpu
2539#undef num_blocks
2540#undef triangle
2541#undef c_64
2542
2543#define psx_gpu r0
2544#define block_ptr r1
2545#define num_blocks r2
2546#define uv_01 r3
2547#define uv_23 r4
2548#define uv_45 r5
2549#define uv_67 r6
2550#define uv_0 r7
2551#define uv_1 r3
2552#define uv_2 r8
2553#define uv_3 r4
2554#define uv_4 r9
2555#define uv_5 r5
2556#define uv_6 r10
2557#define uv_7 r6
2558#define texture_ptr r11
2559
2560#define pixel_0 r7
2561#define pixel_1 r3
2562#define pixel_2 r8
2563#define pixel_3 r4
2564#define pixel_4 r9
2565#define pixel_5 r5
2566#define pixel_6 r10
2567#define pixel_7 r6
2568
2569#define pixels_a r7
2570#define pixels_b r9
2571#define pixels_c r8
2572#define pixels_d r10
2573
2574#define c_64 r0
2575
2576#define clut_ptr r12
2577#define current_texture_mask r5
2578#define dirty_textures_mask r6
2579
2580#define texels d0
2581
2582#define clut_low_a d2
2583#define clut_low_b d3
2584#define clut_high_a d4
2585#define clut_high_b d5
2586
2587#define clut_a q1
2588#define clut_b q2
2589
2590#define texels_low d6
2591#define texels_high d7
2592
/*
 * texture_blocks_untextured
 *
 * Texturing stage for untextured primitives: there is nothing to fetch, so
 * the routine returns to the caller without touching any register or memory.
 */
.align 3

function(texture_blocks_untextured)
  bx lr
2597
2598
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * The first 16 bytes of each block are read as eight 16-bit offsets into the
 * texture page. One byte is fetched per offset and used as an index into a
 * 16-entry, 16-bit CLUT. The CLUT is held in q1/q2, split by vuzp.u8 into a
 * low-byte table and a high-byte table, and looked up with vtbl. The eight
 * resulting 16-bit texels overwrite the first 16 bytes of the block.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache(psx_gpu) is called first.
 *
 * r3 - r11 and lr are saved and restored; r0 is reused as the constant 64
 * once psx_gpu is no longer needed.
 *
 * The loop tests its counter at the bottom, so it presumably expects
 * num_blocks >= 1 - confirm against the callers.
 */
.align 3

function(texture_blocks_4bpp)
  push { r3 - r11, lr }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  /* Load the 16 CLUT entries and split them into low/high byte tables. */
  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]

  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
  vuzp.u8 clut_a, clut_b

  /* Refresh the texture cache first if the current texture is dirty. */
  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  mov c_64, #64

0:
  /* Four words hold eight packed 16-bit texture offsets. */
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* uxtah adds the low (or, with ror #16, the high) halfword to the base. */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  ldrb pixel_4, [ uv_4 ]
  /* Flags from this subs survive until the bne at the loop bottom. */
  subs num_blocks, num_blocks, #1

  /* Pack the eight index bytes into two words for the NEON lookup. */
  ldrb pixel_5, [ uv_5 ]
  orr pixels_a, pixel_0, pixel_1, lsl #8

  ldrb pixel_6, [ uv_6 ]
  orr pixels_b, pixel_4, pixel_5, lsl #8

  ldrb pixel_7, [ uv_7 ]
  orr pixels_a, pixels_a, pixel_2, lsl #16

  orr pixels_b, pixels_b, pixel_6, lsl #16
  orr pixels_a, pixels_a, pixel_3, lsl #24

  orr pixels_b, pixels_b, pixel_7, lsl #24
  vmov.u32 texels, pixels_a, pixels_b

  /* Look up low and high bytes separately, re-interleave them on store. */
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels

  vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
  bne 0b

  pop { r3 - r11, pc }

1:
  /*
   * Slow path: the texture cache is stale. r1/r2 and the split CLUT in
   * q1/q2 are live across the call and all of them are caller-saved, so
   * they are preserved here. Both pushes keep sp 8-byte aligned. r0 still
   * holds psx_gpu and is the argument to the callee.
   */
  push { r1 - r2 }
  vpush { clut_a, clut_b }

  bl update_texture_4bpp_cache

  vpop { clut_a, clut_b }
  mov c_64, #64
  pop { r1 - r2 }
  b 0b
2674
2675
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * The first 16 bytes of each block are read as eight 16-bit offsets into the
 * texture page. One byte is fetched per offset, doubled, and used as a byte
 * offset into the CLUT at clut_ptr to load a 16-bit texel. The eight texels
 * overwrite the first 16 bytes of the block.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache(psx_gpu) is called first.
 *
 * r3 - r11 and lr are saved and restored.
 *
 * The loop tests its counter at the bottom, so it presumably expects
 * num_blocks >= 1 - confirm against the callers.
 */
.align 3

function(texture_blocks_8bpp)
  push { r3 - r11, lr }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  /* Refresh the texture cache first if the current texture is dirty. */
  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  nop

0:
  /* Four words hold eight packed 16-bit texture offsets. */
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  /* Double each index so it addresses 16-bit CLUT entries. */
  ldrb pixel_4, [ uv_4 ]
  add pixel_0, pixel_0, pixel_0

  ldrb pixel_5, [ uv_5 ]
  add pixel_1, pixel_1, pixel_1

  ldrb pixel_6, [ uv_6 ]
  add pixel_2, pixel_2, pixel_2

  ldrb pixel_7, [ uv_7 ]
  add pixel_3, pixel_3, pixel_3

  ldrh pixel_0, [ clut_ptr, pixel_0 ]
  add pixel_4, pixel_4, pixel_4

  ldrh pixel_1, [ clut_ptr, pixel_1 ]
  add pixel_5, pixel_5, pixel_5

  ldrh pixel_2, [ clut_ptr, pixel_2 ]
  add pixel_6, pixel_6, pixel_6

  ldrh pixel_3, [ clut_ptr, pixel_3 ]
  add pixel_7, pixel_7, pixel_7

  /* Pair the texels up into words while the remaining loads complete. */
  ldrh pixel_4, [ clut_ptr, pixel_4 ]
  orr pixels_a, pixel_0, pixel_1, lsl #16

  ldrh pixel_5, [ clut_ptr, pixel_5 ]
  orr pixels_c, pixel_2, pixel_3, lsl #16

  ldrh pixel_6, [ clut_ptr, pixel_6 ]
  /* Flags from this subs survive until the bne at the loop bottom. */
  subs num_blocks, num_blocks, #1

  ldrh pixel_7, [ clut_ptr, pixel_7 ]
  orr pixels_b, pixel_4, pixel_5, lsl #16

  orr pixels_d, pixel_6, pixel_7, lsl #16
  /* stm writes in register-number order, i.e. texels 0..7 in sequence. */
  stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }

  add block_ptr, block_ptr, #64
  bne 0b

  pop { r3 - r11, pc }

1:
  /*
   * Slow path: the texture cache is stale. r1, r2 and r12 are live across
   * the call and caller-saved. r3 holds nothing useful here and is pushed
   * only so that an even number of words goes on the stack, keeping sp
   * 8-byte aligned at the call. r0 still holds psx_gpu, the argument.
   */
  push { r1 - r3, r12 }

  bl update_texture_8bpp_cache

  pop { r1 - r3, r12 }
  b 0b
2766
2767
2768#undef uv_0
2769#undef uv_1
2770#undef uv_2
2771#undef uv_3
2772#undef uv_4
2773#undef uv_5
2774#undef uv_6
2775#undef uv_7
2776
2777#undef pixel_0
2778#undef pixel_1
2779#undef pixel_2
2780#undef pixel_3
2781#undef pixel_4
2782#undef pixel_5
2783#undef pixel_6
2784#undef pixel_7
2785
2786#undef texture_ptr
2787
2788#undef pixels_a
2789#undef pixels_b
2790#undef pixels_c
2791#undef pixels_d
2792
2793#define psx_gpu r0
2794#define block_ptr r1
2795#define num_blocks r2
2796
2797#define uv_0 r3
2798#define uv_1 r4
2799#define u_0 r3
2800#define u_1 r4
2801#define v_0 r5
2802#define v_1 r6
2803
2804#define uv_2 r5
2805#define uv_3 r6
2806#define u_2 r5
2807#define u_3 r6
2808#define v_2 r7
2809#define v_3 r8
2810
2811#define uv_4 r7
2812#define uv_5 r8
2813#define u_4 r7
2814#define u_5 r8
2815#define v_4 r9
2816#define v_5 r10
2817
2818#define uv_6 r9
2819#define uv_7 r10
2820#define u_6 r9
2821#define u_7 r10
2822#define v_6 r11
2823#define v_7 r0
2824
2825#define pixel_0 r3
2826#define pixel_1 r4
2827#define pixel_2 r5
2828#define pixel_3 r6
2829#define pixel_4 r7
2830#define pixel_5 r8
2831#define pixel_6 r9
2832#define pixel_7 r10
2833
2834#define pixels_a r3
2835#define pixels_b r5
2836#define pixels_c r7
2837#define pixels_d r9
2838
2839#define texture_ptr r12
2840
2841
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * Each of the eight halfwords at the start of a block is split into
 * u = bits 0-7 and v = bits 8-15 (kept in place by the #0xFF00 mask). The
 * texel is loaded from texture_ptr + 2 * (u + (v_bits << 2)), which equals
 * texture_ptr + u * 2 + v * 2048. The eight 16-bit texels overwrite the
 * first 16 bytes of the block.
 *
 * r3 - r11 and lr are saved and restored. r0 is reused as v_7 inside the
 * loop, so psx_gpu is not available after the setup.
 *
 * The loop tests its counter at the bottom, so it presumably expects
 * num_blocks >= 1 - confirm against the callers.
 */
.align 3

function(texture_blocks_16bpp)
  push { r3 - r11, lr }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]

0:
  /* Texels 0 and 1. The subs result is consumed by the bne at the bottom. */
  ldrh uv_0, [ block_ptr ]
  subs num_blocks, num_blocks, #1

  ldrh uv_1, [ block_ptr, #2 ]

  and v_0, uv_0, #0xFF00
  and v_1, uv_1, #0xFF00

  and u_0, uv_0, #0xFF
  and u_1, uv_1, #0xFF

  add uv_0, u_0, v_0, lsl #2
  ldrh uv_2, [ block_ptr, #4 ]

  add uv_1, u_1, v_1, lsl #2
  ldrh uv_3, [ block_ptr, #6 ]

  /* Double to turn the texel index into a byte offset. */
  add uv_0, uv_0, uv_0
  add uv_1, uv_1, uv_1

  /* Texels 2 and 3. */
  and v_2, uv_2, #0xFF00
  and v_3, uv_3, #0xFF00

  and u_2, uv_2, #0xFF
  and u_3, uv_3, #0xFF

  add uv_2, u_2, v_2, lsl #2
  ldrh uv_4, [ block_ptr, #8 ]

  add uv_3, u_3, v_3, lsl #2
  ldrh uv_5, [ block_ptr, #10 ]

  add uv_2, uv_2, uv_2
  add uv_3, uv_3, uv_3

  /* Texels 4 and 5. */
  and v_4, uv_4, #0xFF00
  and v_5, uv_5, #0xFF00

  and u_4, uv_4, #0xFF
  and u_5, uv_5, #0xFF

  add uv_4, u_4, v_4, lsl #2
  ldrh uv_6, [ block_ptr, #12 ]

  add uv_5, u_5, v_5, lsl #2
  ldrh uv_7, [ block_ptr, #14 ]

  /* Start fetching texels while the offsets for 6 and 7 are computed. */
  add uv_4, uv_4, uv_4
  ldrh pixel_0, [ texture_ptr, uv_0 ]

  add uv_5, uv_5, uv_5
  ldrh pixel_1, [ texture_ptr, uv_1 ]

  and v_6, uv_6, #0xFF00
  ldrh pixel_2, [ texture_ptr, uv_2 ]

  and v_7, uv_7, #0xFF00
  ldrh pixel_3, [ texture_ptr, uv_3 ]

  and u_6, uv_6, #0xFF
  ldrh pixel_4, [ texture_ptr, uv_4 ]

  and u_7, uv_7, #0xFF
  ldrh pixel_5, [ texture_ptr, uv_5 ]

  add uv_6, u_6, v_6, lsl #2
  add uv_7, u_7, v_7, lsl #2

  add uv_6, uv_6, uv_6
  add uv_7, uv_7, uv_7

  /* Pair the texels up into four words. */
  orr pixels_a, pixel_0, pixel_1, lsl #16
  orr pixels_b, pixel_2, pixel_3, lsl #16

  ldrh pixel_6, [ texture_ptr, uv_6 ]
  orr pixels_c, pixel_4, pixel_5, lsl #16

  ldrh pixel_7, [ texture_ptr, uv_7 ]
  orr pixels_d, pixel_6, pixel_7, lsl #16

  stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
  add block_ptr, block_ptr, #64

  bne 0b

  pop { r3 - r11, pc }
2938
2939
2940#undef num_blocks
2941
2942#undef test_mask
2943#undef texels
2944#undef pixels_b
2945#undef pixels
2946#undef d64_1
2947#undef d64_4
2948#undef d64_128
2949#undef draw_mask
2950#undef msb_mask
2951#undef msb_mask_low
2952#undef msb_mask_high
2953#undef fb_pixels
2954
2955#undef c_32
2956#undef fb_ptr
2957#undef mask_msb_ptr
2958
2959#define psx_gpu r0
2960#define num_blocks r1
2961#define color_ptr r2
2962#define mask_msb_ptr r2
2963
2964#define block_ptr_load_a r0
2965#define block_ptr_store r3
2966#define block_ptr_load_b r12
2967#define c_32 r2
2968
2969#define c_48 r4
2970#define fb_ptr r14
2971#define draw_mask_bits_scalar r5
2972
2973#define d128_0x07 q0
2974#define d128_0x1F q1
2975#define d128_0x8000 q2
2976#define test_mask q3
2977#define texels q4
2978#define colors_rg q5
2979#define colors_b_dm_bits q6
2980#define texels_rg q7
2981#define pixels_r q8
2982#define pixels_g q9
2983#define pixels_b q10
2984#define pixels q11
2985#define zero_mask q4
2986#define draw_mask q12
2987#define msb_mask q13
2988
2989#define fb_pixels q8
2990
2991#define pixels_gb_low q9
2992
2993#define colors_r d10
2994#define colors_g d11
2995#define colors_b d12
2996#define draw_mask_bits d13
2997#define texels_r d14
2998#define texels_g d15
2999#define pixels_r_low d16
3000#define pixels_g_low d18
3001#define pixels_b_low d19
3002#define msb_mask_low d26
3003#define msb_mask_high d27
3004
3005#define d64_1 d28
3006#define d64_4 d29
3007#define d64_128 d30
3008#define texels_b d31
3009
/*
 * Building blocks for shade_blocks_textured_modulated_builder. Each group
 * has one variant per builder argument value; the builder pastes the
 * argument onto the macro name to select it.
 */

/* --- Prologue, by target ------------------------------------------------ */

/* indirect: results go back to the block buffer; c_48 is the store stride. */
#define shade_blocks_textured_modulated_prologue_indirect() \
  mov c_48, #48; \
  add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset

/* direct: replicate the 16-bit mask MSB value into all lanes of msb_mask. */
#define shade_blocks_textured_modulated_prologue_direct() \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

/* --- Prologue, by shading ----------------------------------------------- */

/* shaded: per-block colors are loaded inside the loop; nothing to do here. */
#define shade_blocks_textured_modulated_prologue_shaded()

/* unshaded: broadcast bytes 0, 1, 2 of the triangle color to r, g, b. */
#define shade_blocks_textured_modulated_prologue_unshaded() \
  add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
  vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
  vdup.u8 colors_g, colors_r[1]; \
  vdup.u8 colors_b, colors_r[2]; \
  vdup.u8 colors_r, colors_r[0]

/* --- Accumulator preload, by dithering ---------------------------------- */

/* dithered: preload the accumulator from block_ptr_load_b (no advance). */
#define shade_blocks_textured_modulated_load_dithered(target) \
  vld1.u32 { target }, [ block_ptr_load_b, :128 ]

/* dithered, last load of a block: same load, then advance by c_32. */
#define shade_blocks_textured_modulated_load_last_dithered(target) \
  vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32

/* undithered: no preload is needed. */
#define shade_blocks_textured_modulated_load_undithered(target)

/* undithered, last: only keep block_ptr_load_b advancing in step. */
#define shade_blocks_textured_modulated_load_last_undithered(target) \
  add block_ptr_load_b, block_ptr_load_b, #32

/* --- Texel * color, by dithering ---------------------------------------- */

/* dithered: multiply-accumulate on top of the preloaded value. */
#define shade_blocks_textured_modulate_dithered(channel) \
  vmlal.u8 pixels_##channel, texels_##channel, colors_##channel

/* undithered: plain widening multiply. */
#define shade_blocks_textured_modulate_undithered(channel) \
  vmull.u8 pixels_##channel, texels_##channel, colors_##channel

/* --- Output, by target -------------------------------------------------- */

/* indirect: write the draw mask back to the block; offset is unused. */
#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
  vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]!

/*
 * direct: fetch the framebuffer pointer stored in the block (addressed
 * relative to block_ptr_load_b), read the destination pixels and insert
 * them into pixels wherever draw_mask is set.
 */
#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
  ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
  vld1.u32 { fb_pixels }, [ fb_ptr ]; \
  vbit.u16 pixels, fb_pixels, draw_mask

/* indirect: write pixels to the block, then advance by c_48. */
#define shade_blocks_textured_modulated_store_pixels_indirect() \
  vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48

/* direct: write the merged pixels to the framebuffer. */
#define shade_blocks_textured_modulated_store_pixels_direct() \
  vst1.u32 { pixels }, [ fb_ptr ]

/* --- Per-block color and draw-mask bits, by shading --------------------- */

/* shaded: load the r and g colors of this block, advance by c_32. */
#define shade_blocks_textured_modulated_load_rg_shaded() \
  vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32

/* unshaded: colors are constant; only advance the pointer. */
#define shade_blocks_textured_modulated_load_rg_unshaded() \
  add block_ptr_load_b, block_ptr_load_b, #32

/* shaded: load the b colors together with the draw-mask bits. */
#define shade_blocks_textured_modulated_load_bdm_shaded() \
  vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32

/* unshaded: only the draw-mask bits are needed; read them as a scalar. */
#define shade_blocks_textured_modulated_load_bdm_unshaded() \
  ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
  add block_ptr_load_a, block_ptr_load_a, #32

/* shaded: broadcast the low 16 draw-mask bits to every lane. */
#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
  vdup.u16 draw_mask, draw_mask_bits[0]

/* unshaded: same broadcast, from the core register. */
#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
  vdup.u16 draw_mask, draw_mask_bits_scalar

/* --- Mask MSB, by target ------------------------------------------------ */

/* indirect: the mask MSB is not applied at this stage. */
#define shade_blocks_textured_modulated_apply_msb_mask_indirect()

/* direct: OR the mask MSB into the pixels before the color bits are added. */
#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
  vorr.u16 pixels, pixels, msb_mask
3085
3086
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 *   shading   = shaded | unshaded      (per-block or constant colors)
 *   dithering = dithered | undithered  (accumulator preload or not)
 *   target    = direct | indirect      (framebuffer or block buffer)
 *
 * In:  r0 = psx_gpu.
 *
 * For every block the 16-bit texels are split into 5-bit r, g, b,
 * multiplied by the 8-bit colors, narrowed with an unsigned saturating
 * shift right by 4 and repacked as
 *     (texel & 0x8000) [| msb_mask] + (r >> 3) + (g & ~7) * 4 + (b & ~7) * 128
 * The draw mask is the expanded draw-mask bits tested against test_mask,
 * ORed with a mask of texels that are exactly zero.
 *
 * The loop is software pipelined: each iteration finishes and stores the
 * previous block while loading and multiplying the next one; label 1
 * drains the last block.
 *
 * r4, r5 and lr are saved and restored (lr doubles as fb_ptr).
 * NOTE(review): q4 - q7 are written without being saved in the visible
 * code - confirm that callers do not rely on d8 - d15.
 * The counter is decremented before it is first tested, so the routine
 * presumably expects num_blocks >= 1 - confirm against the callers.
 */
#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
.align 3; \
 \
function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
  push { r4 - r5, lr }; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  /* test_mask is the first 16 bytes of the psx_gpu structure. */ \
  vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
 \
  shade_blocks_textured_modulated_prologue_##target(); \
  shade_blocks_textured_modulated_prologue_##shading(); \
 \
  /* After this add r0 no longer holds psx_gpu (same register). */ \
  add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_32, #32; \
 \
  add block_ptr_load_b, block_ptr_load_a, #16; \
  vmov.u8 d64_1, #1; \
  vmov.u8 d64_4, #4; \
  vmov.u8 d64_128, #128; \
 \
  /* Pipeline fill: load and modulate the first block. */ \
  vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
  vmov.u8 d128_0x07, #0x07; \
 \
  shade_blocks_textured_modulated_load_rg_##shading(); \
  vmov.u8 d128_0x1F, #0x1F; \
 \
  shade_blocks_textured_modulated_load_bdm_##shading(); \
  vmov.u16 d128_0x8000, #0x8000; \
 \
  /* Split texels: r = bits 0-4, g = bits 5-9, b = bits 10-14. */ \
  vmovn.u16 texels_r, texels; \
  vshrn.u16 texels_g, texels, #5; \
 \
  vshrn.u16 texels_b, texels, #7; \
  shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
 \
  shade_blocks_textured_modulated_load_##dithering(pixels_r); \
  vtst.u16 draw_mask, draw_mask, test_mask; \
 \
  shade_blocks_textured_modulated_load_##dithering(pixels_g); \
  vand.u8 texels_rg, texels_rg, d128_0x1F; \
 \
  shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
  vshr.u8 texels_b, texels_b, #3; \
 \
  shade_blocks_textured_modulate_##dithering(r); \
  shade_blocks_textured_modulate_##dithering(g); \
  shade_blocks_textured_modulate_##dithering(b); \
 \
  /* Keep texel bit 15; zero_mask shares its register with texels. */ \
  vand.u16 pixels, texels, d128_0x8000; \
  vceq.u16 zero_mask, texels, #0; \
 \
  vqshrun.s16 pixels_r_low, pixels_r, #4; \
  vqshrun.s16 pixels_g_low, pixels_g, #4; \
  vqshrun.s16 pixels_b_low, pixels_b, #4; \
 \
  shade_blocks_textured_modulated_apply_msb_mask_##target(); \
  vorr.u16 draw_mask, draw_mask, zero_mask; \
  vshr.u8 pixels_r_low, pixels_r_low, #3; \
  vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
  .align 3; \
 \
0: \
  /* Steady state: load block n+1, pack and store block n. */ \
  vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
  shade_blocks_textured_modulated_load_rg_##shading(); \
  vshrn.u16 texels_g, texels, #5; \
 \
  shade_blocks_textured_modulated_load_bdm_##shading(); \
  vshrn.u16 texels_b, texels, #7; \
 \
  vmovn.u16 texels_r, texels; \
  vmlal.u8 pixels, pixels_r_low, d64_1; \
 \
  vmlal.u8 pixels, pixels_g_low, d64_4; \
  vmlal.u8 pixels, pixels_b_low, d64_128; \
  shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
 \
  shade_blocks_textured_modulated_load_##dithering(pixels_r); \
  shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
 \
  shade_blocks_textured_modulated_load_##dithering(pixels_g); \
  vand.u8 texels_rg, texels_rg, d128_0x1F; \
 \
  shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
  vtst.u16 draw_mask, draw_mask, test_mask; \
 \
  shade_blocks_textured_modulated_store_pixels_##target(); \
  vshr.u8 texels_b, texels_b, #3; \
 \
  shade_blocks_textured_modulate_##dithering(r); \
  shade_blocks_textured_modulate_##dithering(g); \
  shade_blocks_textured_modulate_##dithering(b); \
 \
  vand.u16 pixels, texels, d128_0x8000; \
  vceq.u16 zero_mask, texels, #0; \
 \
  /* Flags from this subs are consumed by the bne at the loop bottom. */ \
  subs num_blocks, num_blocks, #1; \
 \
  vqshrun.s16 pixels_r_low, pixels_r, #4; \
  vqshrun.s16 pixels_g_low, pixels_g, #4; \
  vqshrun.s16 pixels_b_low, pixels_b, #4; \
 \
  shade_blocks_textured_modulated_apply_msb_mask_##target(); \
  vorr.u16 draw_mask, draw_mask, zero_mask; \
  vshr.u8 pixels_r_low, pixels_r_low, #3; \
  vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
 \
  bne 0b; \
 \
1: \
  /* Pipeline drain: pack and store the final block. */ \
  vmlal.u8 pixels, pixels_r_low, d64_1; \
  vmlal.u8 pixels, pixels_g_low, d64_4; \
  vmlal.u8 pixels, pixels_b_low, d64_128; \
 \
  shade_blocks_textured_modulated_store_draw_mask_##target(28); \
  shade_blocks_textured_modulated_store_pixels_##target(); \
 \
  pop { r4 - r5, pc }


/* Direct (framebuffer) variants. */
shade_blocks_textured_modulated_builder(shaded, dithered, direct);
shade_blocks_textured_modulated_builder(shaded, undithered, direct);
shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
shade_blocks_textured_modulated_builder(unshaded, undithered, direct);

/* Indirect (block buffer) variants. */
shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3219
3220
3221#undef c_64
3222#undef fb_ptr
3223#undef color_ptr
3224
3225#undef color_r
3226#undef color_g
3227#undef color_b
3228
3229#undef test_mask
3230#undef pixels
3231#undef draw_mask
3232#undef zero_mask
3233#undef fb_pixels
3234#undef msb_mask
3235#undef msb_mask_low
3236#undef msb_mask_high
3237
3238#define psx_gpu r0
3239#define num_blocks r1
3240#define mask_msb_ptr r2
3241#define color_ptr r3
3242
3243#define block_ptr_load r0
3244#define draw_mask_store_ptr r3
3245#define draw_mask_bits_ptr r12
3246#define draw_mask_ptr r12
3247#define pixel_store_ptr r14
3248
3249#define fb_ptr_cmp r4
3250
3251#define fb_ptr r3
3252#define fb_ptr_next r14
3253
3254#define c_64 r2
3255
3256#define test_mask q0
3257#define pixels q1
3258#define draw_mask q2
3259#define zero_mask q3
3260#define draw_mask_combined q4
3261#define fb_pixels q5
3262#define fb_pixels_next q6
3263#define msb_mask q7
3264
3265#define draw_mask_low d4
3266#define draw_mask_high d5
3267#define msb_mask_low d14
3268#define msb_mask_high d15
3269
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu.
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - the 16 bytes at +0 (the texels) are copied to +16,
 *   - the halfword at +40 is broadcast and tested against test_mask (the
 *     first 16 bytes of psx_gpu) to give a per-pixel draw mask,
 *   - that mask, ORed with a mask of texels that are exactly zero, is
 *     written to +0.
 * The loop is software pipelined; label 1 stores the mask of the last block.
 *
 * lr is used as pixel_store_ptr, so the return address is kept on the
 * stack. It is pushed properly (together with r4, purely to keep sp 8-byte
 * aligned) instead of being written below sp, where anything that runs
 * asynchronously on this stack could overwrite it.
 *
 * NOTE(review): q4 (draw_mask_combined) is written without being saved -
 * confirm that callers do not rely on d8/d9.
 * The counter is decremented before it is first tested, so the routine
 * presumably expects num_blocks >= 1 - confirm against the callers.
 */
.align 3
function(shade_blocks_textured_unmodulated_indirect)
  push { r4, lr }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  mov c_64, #64
  /* After this add r0 no longer holds psx_gpu (same register). */
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Pipeline fill: first block. */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

0:
  /* Load block n+1 while combining and storing the mask of block n. */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

1:
  /* Pipeline drain: mask of the final block. */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64

  pop { r4, pc }
3316
3317
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu.
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - the texels are the 16 bytes at +0, the draw-mask bits the halfword
 *     at +40 and the destination pointer the word at +44,
 *   - the texels, ORed with msb_mask, replace the destination pixels
 *     wherever the combined mask is clear. The combined mask is
 *     (draw-mask bits tested against test_mask) OR (texel == 0).
 *
 * The loop is software pipelined: the destination of block n+1 is read
 * while block n is merged. When the two destinations are within 14 bytes
 * of each other (the 16-byte accesses overlap), path 4 stores block n
 * before reading block n+1 so that the read sees the fresh pixels.
 *
 * msb_mask is ORed into the final block at label 1 as well, so the last
 * block written gets the same mask-bit treatment as every block handled
 * inside the loop.
 *
 * r4 and lr are saved and restored (lr doubles as fb_ptr_next).
 * NOTE(review): q4 - q7 are written without being saved - confirm that
 * callers do not rely on d8 - d15.
 * The counter is decremented before it is first tested, so the routine
 * presumably expects num_blocks >= 1 - confirm against the callers.
 */
.align 3

function(shade_blocks_textured_unmodulated_direct)
  push { r4, lr }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  /* Replicate the 16-bit mask MSB value into every lane of msb_mask. */
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
  mov c_64, #64

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  /* After this add r0 no longer holds psx_gpu (same register). */
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Pipeline fill: first block and its destination pixels. */
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  ldr fb_ptr_next, [ block_ptr_load, #44 ]

  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vceq.u16 zero_mask, pixels, #0
  vtst.u16 draw_mask, draw_mask, test_mask

  subs num_blocks, num_blocks, #1
  beq 1f

0:
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [ block_ptr_load, #44 ]

  /* zero_mask was taken before this OR, so it reflects the raw texels. */
  vorr.u16 pixels, pixels, msb_mask

  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vmov fb_pixels, fb_pixels_next

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  /* Insert texels where the combined mask is clear. */
  vbif.u16 fb_pixels, pixels, draw_mask_combined

  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64

  /* Unsigned test for -14 <= fb_ptr_next - fb_ptr <= 14 (overlap). */
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  add fb_ptr_cmp, fb_ptr_cmp, #14
  cmp fb_ptr_cmp, #28
  bls 4f

  /* No overlap: the next read may be issued ahead of this store. */
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vceq.u16 zero_mask, pixels, #0

  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vtst.u16 draw_mask, draw_mask, test_mask

3:
  subs num_blocks, num_blocks, #1
  bne 0b

1:
  /* Pipeline drain: merge and store the final block. */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vbif.u16 fb_pixels_next, pixels, draw_mask_combined

  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]

  pop { r4, pc }

4:
  /* Overlap: store first, then read the next destination. */
  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vceq.u16 zero_mask, pixels, #0

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vtst.u16 draw_mask, draw_mask, test_mask

  b 3b
3391
3392
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * Shading stage for flat, untextured blocks kept in the block buffer:
 * nothing is modified here, so the routine returns immediately.
 */
function(shade_blocks_unshaded_untextured_indirect)
  bx lr
3395
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu.
 *
 * Writes one constant group of pixels to the destination of every block.
 * The pixels are the 16 bytes at +16 of the first block, ORed with
 * msb_mask, and are read only once. For each of the num_blocks 64-byte
 * blocks starting at psx_gpu + psx_gpu_blocks_offset, the draw mask is the
 * 16 bytes at +0 and the destination pointer the word at +44. The pixels
 * replace the destination wherever the draw mask is clear.
 *
 * The loop is software pipelined: the destination of block n+1 is read
 * while block n is merged. When the two destinations are within 14 bytes
 * of each other (the 16-byte accesses overlap), path 4 stores block n
 * before reading block n+1.
 *
 * r4 and lr are saved and restored (lr doubles as fb_ptr_next).
 * NOTE(review): q5 - q7 are written without being saved - confirm that
 * callers do not rely on d10 - d15.
 * The counter is decremented before it is first tested, so the routine
 * presumably expects num_blocks >= 1 - confirm against the callers.
 */
.align 3

function(shade_blocks_unshaded_untextured_direct)
  push { r4, lr }
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  /* Replicate the 16-bit mask MSB value into every lane of msb_mask. */
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
  add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  /* After this add r0 points at the fb pointer slot of the first block. */
  add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
  vld1.u16 { pixels }, [ color_ptr, :128 ]

  mov c_64, #64
  vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  vorr.u16 pixels, pixels, msb_mask
  /* Flags from this subs are consumed by the beq 1f below. */
  subs num_blocks, num_blocks, #1

  ldr fb_ptr_next, [ block_ptr_load ], #64

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  beq 1f

0:
  /* Merge block n while fetching pointer and mask of block n+1. */
  vmov fb_pixels, fb_pixels_next
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [ block_ptr_load ], #64

  vbif.u16 fb_pixels, pixels, draw_mask
  vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  /* Unsigned test for -14 <= fb_ptr_next - fb_ptr <= 14 (overlap). */
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  add fb_ptr_cmp, fb_ptr_cmp, #14
  cmp fb_ptr_cmp, #28
  bls 4f

  /* No overlap: the next read may be issued ahead of this store. */
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vst1.u16 { fb_pixels }, [ fb_ptr ]

3:
  subs num_blocks, num_blocks, #1
  bne 0b

1:
  /* Pipeline drain: merge and store the final block. */
  vbif.u16 fb_pixels_next, pixels, draw_mask
  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]

  pop { r4, pc }

4:
  /* Overlap: store first, then read the next destination. */
  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  b 3b
3452
3453
/*
 * Register aliases for the blend_blocks_* routines that follow.
 *
 * Several names share one register, so only one name of each group can be
 * live at a time: psx_gpu / draw_mask_ptr (r0), msb_mask_ptr / c_64 (r2),
 * and the vector groups q5, q7, q8, q9, q10, q11, q12 and q13 below.
 */
3454#undef draw_mask_ptr
3455#undef c_64
3456#undef fb_ptr
3457#undef fb_ptr_next
3458#undef fb_ptr_cmp
3459
3460#define psx_gpu r0
3461#define num_blocks r1
3462#define msb_mask_ptr r2
3463#define pixel_ptr r3
3464#define draw_mask_ptr r0
3465#define c_64 r2
3466#define fb_ptr r12
3467#define fb_ptr_next r14
3468#define fb_ptr_cmp r4
3469
3470#undef msb_mask
3471#undef draw_mask
3472#undef pixels
3473#undef fb_pixels
3474#undef d128_0x8000
3475#undef msb_mask_low
3476#undef msb_mask_high
3477#undef draw_mask_next
3478#undef pixels_g
3479#undef blend_pixels
3480#undef fb_pixels_next
3481
/* Vector names used by the average blender (and shared base names). */
3482#define msb_mask q0
3483#define draw_mask q1
3484#define pixels q2
3485#define fb_pixels q3
3486#define blend_pixels q4
3487#define pixels_no_msb q5
3488#define blend_mask q6
3489#define fb_pixels_no_msb q7
3490#define d128_0x8000 q8
3491#define d128_0x0421 q9
3492#define fb_pixels_next q10
3493#define blend_pixels_next q11
3494#define pixels_next q12
3495#define draw_mask_next q13
3496#define write_mask q14
3497
/* Vector names used by the add / subtract / add_fourth blenders; these */
/* reuse q5 and q7-q13 from the group above. */
3498#define pixels_rb q5
3499#define pixels_mg q7
3500#define pixels_g q7
3501#define d128_0x7C1F q8
3502#define d128_0x03E0 q9
3503#define fb_pixels_rb q10
3504#define fb_pixels_g q11
3505#define fb_pixels_masked q11
3506#define d128_0x83E0 q15
3507#define pixels_fourth q7
3508#define d128_0x1C07 q12
3509#define d128_0x00E0 q13
3510#define d128_0x80E0 q13
3511
/* The two halves of msb_mask (q0), used for the broadcast load. */
3512#define msb_mask_low d0
3513#define msb_mask_high d1
3514
/*
 * Policy fragments pasted into blend_blocks_average_builder by token
 * concatenation on its texturing (textured / untextured) and mask_evaluate
 * (on / off) arguments. The untextured and "off" variants expand to nothing
 * unless noted.
 */

/* Textured: blend_mask lanes are all-ones where the source pixel has bit 15 set. */
3515#define blend_blocks_average_set_blend_mask_textured(source) \
3516 vclt.s16 blend_mask, source, #0 \
3517
/* Textured: set bit 15 in every lane of the blended result. */
3518#define blend_blocks_average_set_stp_bit_textured() \
3519 vorr.u16 blend_pixels, #0x8000 \
3520
/* Textured: lanes where blend_mask is clear take the source pixel instead. */
3521#define blend_blocks_average_combine_textured(source) \
3522 vbif.u16 blend_pixels, source, blend_mask \
3523
3524#define blend_blocks_average_set_blend_mask_untextured(source) \
3525
3526#define blend_blocks_average_set_stp_bit_untextured() \
3527
3528#define blend_blocks_average_combine_untextured(source) \
3529
/* Mask on: write_mask lanes are all-ones where the framebuffer pixel has bit 15 set. */
3530#define blend_blocks_average_mask_set_on() \
3531 vclt.s16 write_mask, fb_pixels_next, #0 \
3532
/* Mask on: current draw mask = queued draw mask OR write_mask. */
3533#define blend_blocks_average_mask_copy_on() \
3534 vorr.u16 draw_mask, draw_mask_next, write_mask \
3535
/* Mask on, final block: fold write_mask into draw_mask_next in place. */
3536#define blend_blocks_average_mask_copy_b_on() \
3537 vorr.u16 draw_mask_next, draw_mask_next, write_mask \
3538
3539#define blend_blocks_average_mask_set_off() \
3540
/* Mask off: the queued draw mask is used unchanged. */
3541#define blend_blocks_average_mask_copy_off() \
3542 vmov draw_mask, draw_mask_next \
3543
3544#define blend_blocks_average_mask_copy_b_off() \
3545
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>(psx_gpu), which walks
 * the num_blocks queued 64-byte blocks starting at psx_gpu_blocks_offset. Per
 * block it reads the draw mask at +0, eight source pixels at +16 and the
 * framebuffer pointer at +44 (pixel_ptr + 28), computes
 *
 *   (fb_no_msb + (src_no_msb - ((src ^ fb) & 0x0421))) >> 1
 *
 * and stores it to the framebuffer in lanes whose draw-mask bits are clear.
 * msb_mask is ORed into every blended result.
 *
 * texturing = textured:   lanes whose source pixel has bit 15 clear keep the
 *                         unblended source pixel; blended lanes get bit 15 set.
 * mask_evaluate = on:     lanes whose framebuffer pixel already has bit 15 set
 *                         are added to the draw mask and left untouched.
 *
 * The loop is software pipelined: block i+1 is loaded and partly computed
 * while block i is finished. When two consecutive framebuffer pointers are
 * within 14 bytes the two 16-byte spans overlap, so path 2 stores block i
 * before reloading the framebuffer for block i+1.
 *
 * num_blocks is decremented before it is tested, so at least one queued block
 * is expected.
 *
 * NOTE(review): q4-q7 are written without being saved; d8-d15 are callee-saved
 * under AAPCS - confirm the callers account for this.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
  \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
  \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
  \
  /* draw_mask_ptr aliases psx_gpu (r0): last use of psx_gpu is here */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
  \
  vmov.u16 d128_0x8000, #0x8000; \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  \
  /* 0x0421 is built in two steps: 0x0400, then OR 0x0021 */ \
  vmov.u16 d128_0x0421, #0x0400; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  \
  vorr.u16 d128_0x0421, #0x0021; \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  \
  /* first half of the average for block 0 */ \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
  \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  \
  vmov pixels, pixels_next; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  \
  /* halving add finishes the average for the current block */ \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
  \
  /* consume write_mask for the current block before it is recomputed */ \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
  \
  /* unsigned test: -14 <= (next - current) <= 14 bytes, spans overlap */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
  \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
  \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
  \
 1: \
  /* final block: no successor to prefetch */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
  \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
  \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  \
  ldmia sp!, { r4, pc }; \
  \
 2: \
  /* overlap: store the current block first, then reload the framebuffer */ \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
  \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  /* write_mask must describe the reloaded pixels, as on the other path; */ \
  /* draw_mask for the current block has already been consumed above */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  \
  bal 3b
3647
/* Instantiate the four average blenders: {textured, untextured} x mask {off, on}. */
3648blend_blocks_average_builder(textured, off)
3649blend_blocks_average_builder(untextured, off)
3650blend_blocks_average_builder(textured, on)
3651blend_blocks_average_builder(untextured, on)
3652
3653
/*
 * Mask-evaluation fragments used by the add and add_fourth builders, selected
 * by their mask_evaluate argument. The "off" variants expand to nothing.
 */

/* On: write_mask lanes are all-ones where the framebuffer pixel has bit 15 set. */
3654#define blend_blocks_add_mask_set_on() \
3655 vclt.s16 write_mask, fb_pixels, #0 \
3656
/* On: fold write_mask into the draw mask so those lanes keep the framebuffer value. */
3657#define blend_blocks_add_mask_copy_on() \
3658 vorr.u16 draw_mask, draw_mask, write_mask \
3659
3660#define blend_blocks_add_mask_set_off() \
3661
3662#define blend_blocks_add_mask_copy_off() \
3663
3664
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>(psx_gpu). For each queued
 * 64-byte block (draw mask at +0, pixels at +16, framebuffer pointer at +44)
 * it adds the source pixel to the framebuffer pixel field by field and clamps:
 * the 0x7C1F fields are added and clamped per byte (vmin.u8), the 0x03E0
 * field is added together with bit 15 of (pixel | msb_mask) and clamped
 * against 0x83E0.
 *
 * The framebuffer operand is first ANDed with blend_mask (all-ones where the
 * source pixel has bit 15 set), so lanes with bit 15 clear add zero and store
 * the source fields unchanged. Lanes whose draw-mask bits are set keep the
 * framebuffer value (vbit). With mask_evaluate = on, lanes whose framebuffer
 * pixel has bit 15 set are added to the draw mask.
 *
 * The loop is software pipelined and uses the same overlap test as the other
 * blenders: if the next framebuffer pointer is within 14 bytes of the current
 * one, path 2 stores before reloading.
 */
3665#define blend_blocks_add_textured_builder(mask_evaluate) \
3666.align 3; \
3667 \
3668function(blend_blocks_textured_add_##mask_evaluate) \
3669 stmdb sp!, { r4, r14 }; \
3670 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3671 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3672 \
3673 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3674 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3675 \
3676 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; /* r0 is reused: psx_gpu is dead from here */ \
3677 mov c_64, #64; \
3678 \
3679 vmov.u16 d128_0x7C1F, #0x7C00; /* constants are built in two steps each */ \
3680 vmov.u16 d128_0x03E0, #0x0300; \
3681 vmov.u16 d128_0x83E0, #0x8000; \
3682 vorr.u16 d128_0x03E0, #0x00E0; \
3683 vorr.u16 d128_0x7C1F, #0x001F; \
3684 vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
3685 \
3686 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3687 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3688 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3689 vclt.s16 blend_mask, pixels, #0; \
3690 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3691 blend_blocks_add_mask_set_##mask_evaluate(); \
3692 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3693 \
3694 blend_blocks_add_mask_copy_##mask_evaluate(); \
3695 vorr.u16 pixels, pixels, msb_mask; \
3696 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3697 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3698 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3699 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3700 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3701 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3702 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp: low byte to 0x1F, high byte to 0x7C */ \
3703 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3704 \
3705 subs num_blocks, num_blocks, #1; \
3706 beq 1f; \
3707 \
3708 0: /* finish the current block while loading the next */ \
3709 mov fb_ptr, fb_ptr_next; \
3710 \
3711 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3712 \
3713 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3714 vclt.s16 blend_mask, pixels, #0; \
3715 \
3716 vorr.u16 pixels, pixels, msb_mask; \
3717 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3718 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3719 \
3720 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* masked lanes keep the framebuffer value */ \
3721 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3722 \
3723 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3724 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3725 cmp fb_ptr_cmp, #28; \
3726 bls 2f; /* -14 <= (next - current) <= 14 bytes: spans overlap */ \
3727 \
3728 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3729 blend_blocks_add_mask_set_##mask_evaluate(); \
3730 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3731 blend_blocks_add_mask_copy_##mask_evaluate(); \
3732 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3733 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3734 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3735 \
3736 3: /* common tail: clamped sums for the block just loaded */ \
3737 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3738 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3739 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3740 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3741 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3742 \
3743 subs num_blocks, num_blocks, #1; \
3744 bne 0b; \
3745 \
3746 1: /* final block */ \
3747 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3748 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3749 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3750 \
3751 ldmia sp!, { r4, pc }; \
3752 \
3753 2: /* overlap: store before reloading the framebuffer */ \
3754 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3755 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3756 \
3757 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3758 blend_blocks_add_mask_set_##mask_evaluate(); \
3759 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3760 blend_blocks_add_mask_copy_##mask_evaluate(); \
3761 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3762 bal 3b \
3763
3764
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>(psx_gpu). Same block walk
 * and overlap handling as the textured add blender, but every lane is blended:
 * the 0x7C1F fields are added and clamped per byte, the 0x03E0 field is added
 * and clamped against 0x03E0, and msb_mask is ORed into the result. Lanes
 * whose draw-mask bits are set keep the framebuffer value. With
 * mask_evaluate = on, lanes whose framebuffer pixel has bit 15 set are added
 * to the draw mask.
 */
3765#define blend_blocks_add_untextured_builder(mask_evaluate) \
3766.align 3; \
3767 \
3768function(blend_blocks_untextured_add_##mask_evaluate) \
3769 stmdb sp!, { r4, r14 }; \
3770 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3771 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3772 \
3773 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3774 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3775 \
3776 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; /* r0 is reused: psx_gpu is dead from here */ \
3777 mov c_64, #64; \
3778 \
3779 vmov.u16 d128_0x7C1F, #0x7C00; \
3780 vmov.u16 d128_0x03E0, #0x0300; \
3781 vorr.u16 d128_0x7C1F, #0x001F; \
3782 vorr.u16 d128_0x03E0, #0x00E0; \
3783 \
3784 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3785 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3786 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3787 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3788 blend_blocks_add_mask_set_##mask_evaluate(); \
3789 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3790 \
3791 blend_blocks_add_mask_copy_##mask_evaluate(); \
3792 vand.u16 pixels_g, pixels, d128_0x03E0; \
3793 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3794 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3795 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3796 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3797 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp */ \
3798 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3799 \
3800 subs num_blocks, num_blocks, #1; \
3801 beq 1f; \
3802 \
3803 0: /* finish the current block while loading the next */ \
3804 mov fb_ptr, fb_ptr_next; \
3805 \
3806 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3807 \
3808 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3809 \
3810 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3811 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3812 vand.u16 pixels_g, pixels, d128_0x03E0; \
3813 \
3814 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* masked lanes keep the framebuffer value */ \
3815 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3816 \
3817 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3818 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3819 cmp fb_ptr_cmp, #28; \
3820 bls 2f; /* -14 <= (next - current) <= 14 bytes: spans overlap */ \
3821 \
3822 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3823 blend_blocks_add_mask_set_##mask_evaluate(); \
3824 blend_blocks_add_mask_copy_##mask_evaluate(); \
3825 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3826 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3827 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3828 \
3829 3: /* common tail: clamped sums for the block just loaded */ \
3830 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3831 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3832 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3833 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3834 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3835 \
3836 subs num_blocks, num_blocks, #1; \
3837 bne 0b; \
3838 \
3839 1: /* final block */ \
3840 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3841 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3842 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3843 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3844 \
3845 ldmia sp!, { r4, pc }; \
3846 \
3847 2: /* overlap: store before reloading the framebuffer */ \
3848 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3849 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3850 \
3851 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3852 blend_blocks_add_mask_set_##mask_evaluate(); \
3853 blend_blocks_add_mask_copy_##mask_evaluate(); \
3854 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3855 bal 3b \
3856
3857
/* Instantiate the four additive blenders: {textured, untextured} x mask {off, on}. */
3858blend_blocks_add_textured_builder(off)
3859blend_blocks_add_textured_builder(on)
3860blend_blocks_add_untextured_builder(off)
3861blend_blocks_add_untextured_builder(on)
3862
/*
 * Policy fragments pasted into blend_blocks_subtract_builder by token
 * concatenation on its texturing and mask_evaluate arguments. Variants with
 * no body expand to nothing.
 */

/* Textured: blend_mask lanes are all-ones where the queued pixel has bit 15 set. */
3863#define blend_blocks_subtract_set_blend_mask_textured() \
3864 vclt.s16 blend_mask, pixels_next, #0 \
3865
/* Textured: lanes where blend_mask is clear take the unblended pixel. */
3866#define blend_blocks_subtract_combine_textured() \
3867 vbif.u16 blend_pixels, pixels, blend_mask \
3868
/* Textured: set bit 15 in every lane of the blended result. */
3869#define blend_blocks_subtract_set_stb_textured() \
3870 vorr.u16 blend_pixels, #0x8000 \
3871
/* Textured: keep the current pixel, with msb_mask applied, for the combine step. */
3872#define blend_blocks_subtract_msb_mask_textured() \
3873 vorr.u16 pixels, pixels_next, msb_mask \
3874
3875#define blend_blocks_subtract_set_blend_mask_untextured() \
3876
3877#define blend_blocks_subtract_combine_untextured() \
3878
/* Untextured: the blended result only receives msb_mask. */
3879#define blend_blocks_subtract_set_stb_untextured() \
3880 vorr.u16 blend_pixels, blend_pixels, msb_mask \
3881
3882#define blend_blocks_subtract_msb_mask_untextured() \
3883
3884
/* Mask on: write_mask lanes are all-ones where the framebuffer pixel has bit 15 set. */
3885#define blend_blocks_subtract_mask_set_on() \
3886 vclt.s16 write_mask, fb_pixels, #0 \
3887
/* Mask on: current draw mask = queued draw mask OR write_mask. */
3888#define blend_blocks_subtract_mask_copy_on() \
3889 vorr.u16 draw_mask, draw_mask_next, write_mask \
3890
3891#define blend_blocks_subtract_mask_set_off() \
3892
/* Mask off: the queued draw mask is used unchanged. */
3893#define blend_blocks_subtract_mask_copy_off() \
3894 vmov draw_mask, draw_mask_next \
3895
3896
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>(psx_gpu). For each
 * queued 64-byte block (draw mask at +0, pixels at +16, framebuffer pointer
 * at +44) it subtracts the source pixel from the framebuffer pixel with
 * saturation: per byte on the 0x7C1F fields (vqsub.u8) and per 16-bit lane on
 * the 0x03E0 field (vqsub.u16).
 *
 * texturing = textured:   lanes whose source pixel has bit 15 clear store
 *                         (pixel | msb_mask) unblended; blended lanes get
 *                         bit 15 set.
 * texturing = untextured: msb_mask is ORed into the blended result.
 * mask_evaluate = on:     lanes whose framebuffer pixel has bit 15 set are
 *                         added to the draw mask.
 *
 * Lanes whose draw-mask bits are set keep the framebuffer value (vbit). The
 * loop is software pipelined; path 2 handles consecutive framebuffer pointers
 * within 14 bytes of each other by storing before reloading.
 */
3897#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
3898.align 3; \
3899 \
3900function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
3901 stmdb sp!, { r4, r14 }; \
3902 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3903 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3904 \
3905 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3906 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3907 \
3908 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; /* r0 is reused: psx_gpu is dead from here */ \
3909 mov c_64, #64; \
3910 \
3911 vmov.u16 d128_0x7C1F, #0x7C00; \
3912 vmov.u16 d128_0x03E0, #0x0300; \
3913 vorr.u16 d128_0x7C1F, #0x001F; \
3914 vorr.u16 d128_0x03E0, #0x00E0; \
3915 \
3916 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3917 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3918 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3919 blend_blocks_subtract_set_blend_mask_##texturing(); \
3920 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3921 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3922 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3923 \
3924 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3925 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3926 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3927 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; /* saturates each byte-resident field separately */ \
3928 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3929 \
3930 subs num_blocks, num_blocks, #1; \
3931 beq 1f; \
3932 \
3933 0: /* finish the current block while loading the next */ \
3934 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3935 mov fb_ptr, fb_ptr_next; \
3936 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3937 \
3938 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3939 blend_blocks_subtract_msb_mask_##texturing(); \
3940 \
3941 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3942 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3943 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3944 blend_blocks_subtract_set_stb_##texturing(); \
3945 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3946 blend_blocks_subtract_combine_##texturing(); /* still uses the blend_mask of the current block */ \
3947 blend_blocks_subtract_set_blend_mask_##texturing(); /* now switch blend_mask to the next block */ \
3948 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3949 \
3950 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3951 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3952 cmp fb_ptr_cmp, #28; \
3953 bls 2f; /* -14 <= (next - current) <= 14 bytes: spans overlap */ \
3954 \
3955 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3956 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3957 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3958 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3959 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3960 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3961 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3962 \
3963 3: \
3964 subs num_blocks, num_blocks, #1; \
3965 bne 0b; \
3966 \
3967 1: /* final block */ \
3968 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3969 \
3970 blend_blocks_subtract_msb_mask_##texturing(); \
3971 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3972 blend_blocks_subtract_set_stb_##texturing(); \
3973 blend_blocks_subtract_combine_##texturing(); \
3974 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3975 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3976 \
3977 ldmia sp!, { r4, pc }; \
3978 \
3979 2: /* overlap: store before reloading the framebuffer */ \
3980 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3981 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3982 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3983 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3984 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3985 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3986 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3987 bal 3b \
3988
3989
/* Instantiate the four subtractive blenders: {textured, untextured} x mask {off, on}. */
3990blend_blocks_subtract_builder(textured, off)
3991blend_blocks_subtract_builder(textured, on)
3992blend_blocks_subtract_builder(untextured, off)
3993blend_blocks_subtract_builder(untextured, on)
3994
3995
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>(psx_gpu). Same block
 * walk and overlap handling as the textured add blender, but the source pixel
 * is first shifted right by two (arithmetic, so bit 15 is kept) and masked
 * with 0x1C07 / 0x80E0, i.e. a quarter of each field is added to the
 * framebuffer fields before clamping against 0x7C1F (per byte) and 0x83E0.
 *
 * The framebuffer operand is ANDed with blend_mask (all-ones where the source
 * pixel has bit 15 set), so lanes with bit 15 clear add zero.
 *
 * NOTE(review): for lanes where blend_mask is clear the stored value is the
 * quartered source fields, not the original pixel - confirm this is intended
 * for unblended texels.
 * NOTE(review): the result of "vorr pixels, pixels, msb_mask" is not read
 * again before pixels is reloaded, so msb_mask does not appear to reach the
 * stored result - confirm.
 */
3996#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
3997.align 3; \
3998 \
3999function(blend_blocks_textured_add_fourth_##mask_evaluate) \
4000 stmdb sp!, { r4, r14 }; \
4001 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4002 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4003 \
4004 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4005 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
4006 \
4007 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; /* r0 is reused: psx_gpu is dead from here */ \
4008 mov c_64, #64; \
4009 \
4010 vmov.u16 d128_0x7C1F, #0x7C00; /* constants are built in two steps each */ \
4011 vmov.u16 d128_0x03E0, #0x0300; \
4012 vmov.u16 d128_0x83E0, #0x8300; \
4013 vmov.u16 d128_0x1C07, #0x1C00; \
4014 vmov.u16 d128_0x80E0, #0x8000; \
4015 vorr.u16 d128_0x7C1F, #0x001F; \
4016 vorr.u16 d128_0x03E0, #0x00E0; \
4017 vorr.u16 d128_0x83E0, #0x00E0; \
4018 vorr.u16 d128_0x1C07, #0x0007; \
4019 vorr.u16 d128_0x80E0, #0x00E0; \
4020 \
4021 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4022 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4023 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4024 vclt.s16 blend_mask, pixels, #0; \
4025 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4026 blend_blocks_add_mask_set_##mask_evaluate(); \
4027 vshr.s16 pixels_fourth, pixels, #2; /* arithmetic shift keeps bit 15 */ \
4028 \
4029 blend_blocks_add_mask_copy_##mask_evaluate(); \
4030 vorr.u16 pixels, pixels, msb_mask; \
4031 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4032 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
4033 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4034 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
4035 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
4036 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4037 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
4038 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp */ \
4039 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
4040 \
4041 subs num_blocks, num_blocks, #1; \
4042 beq 1f; \
4043 \
4044 0: /* finish the current block while loading the next */ \
4045 mov fb_ptr, fb_ptr_next; \
4046 \
4047 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4048 \
4049 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4050 vclt.s16 blend_mask, pixels, #0; \
4051 \
4052 vshr.s16 pixels_fourth, pixels, #2; \
4053 vorr.u16 pixels, pixels, msb_mask; \
4054 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4055 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4056 \
4057 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* masked lanes keep the framebuffer value */ \
4058 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4059 \
4060 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4061 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4062 cmp fb_ptr_cmp, #28; \
4063 bls 2f; /* -14 <= (next - current) <= 14 bytes: spans overlap */ \
4064 \
4065 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4066 blend_blocks_add_mask_set_##mask_evaluate(); \
4067 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
4068 blend_blocks_add_mask_copy_##mask_evaluate(); \
4069 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4070 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
4071 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4072 \
4073 3: /* common tail: clamped sums for the block just loaded */ \
4074 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
4075 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4076 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
4077 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4078 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
4079 \
4080 subs num_blocks, num_blocks, #1; \
4081 bne 0b; \
4082 \
4083 1: /* final block */ \
4084 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4085 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4086 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4087 \
4088 ldmia sp!, { r4, pc }; \
4089 \
4090 2: /* overlap: store before reloading the framebuffer */ \
4091 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4092 vand.u16 pixels_mg, pixels_fourth, d128_0x80E0; \
4093 \
4094 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4095 blend_blocks_add_mask_set_##mask_evaluate(); \
4096 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
4097 blend_blocks_add_mask_copy_##mask_evaluate(); \
4098 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
4099 bal 3b \
4100
4101
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>(psx_gpu). Same
 * block walk and overlap handling as the untextured add blender, but the
 * source pixel is shifted right by two and masked with 0x1C07 / 0x00E0, so a
 * quarter of each field is added to the framebuffer fields. The sums are
 * clamped against 0x7C1F (per byte) and 0x03E0, msb_mask is ORed in, and
 * lanes whose draw-mask bits are set keep the framebuffer value.
 *
 * d128_0x83E0 is initialised below but not read inside this macro.
 */
4102#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4103.align 3; \
4104 \
4105function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4106 stmdb sp!, { r4, r14 }; \
4107 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4108 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4109 \
4110 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4111 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
4112 \
4113 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; /* r0 is reused: psx_gpu is dead from here */ \
4114 mov c_64, #64; \
4115 \
4116 vmov.u16 d128_0x7C1F, #0x7C00; \
4117 vmov.u16 d128_0x03E0, #0x0300; \
4118 vmov.u16 d128_0x83E0, #0x8300; \
4119 vmov.u16 d128_0x1C07, #0x1C00; \
4120 vmov.u16 d128_0x00E0, #0x00E0; \
4121 vorr.u16 d128_0x7C1F, #0x001F; \
4122 vorr.u16 d128_0x03E0, #0x00E0; \
4123 vorr.u16 d128_0x83E0, #0x00E0; \
4124 vorr.u16 d128_0x1C07, #0x0007; \
4125 \
4126 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4127 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4128 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4129 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4130 blend_blocks_add_mask_set_##mask_evaluate(); \
4131 vshr.s16 pixels_fourth, pixels, #2; \
4132 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4133 \
4134 blend_blocks_add_mask_copy_##mask_evaluate(); \
4135 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4136 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4137 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4138 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4139 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4140 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp */ \
4141 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4142 \
4143 subs num_blocks, num_blocks, #1; \
4144 beq 1f; \
4145 \
4146 0: /* finish the current block while loading the next */ \
4147 mov fb_ptr, fb_ptr_next; \
4148 \
4149 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4150 \
4151 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4152 \
4153 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4154 vshr.s16 pixels_fourth, pixels, #2; \
4155 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4156 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4157 \
4158 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* masked lanes keep the framebuffer value */ \
4159 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4160 \
4161 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4162 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4163 cmp fb_ptr_cmp, #28; \
4164 bls 2f; /* -14 <= (next - current) <= 14 bytes: spans overlap */ \
4165 \
4166 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4167 blend_blocks_add_mask_set_##mask_evaluate(); \
4168 blend_blocks_add_mask_copy_##mask_evaluate(); \
4169 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4170 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4171 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4172 \
4173 3: /* common tail: clamped sums for the block just loaded */ \
4174 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4175 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4176 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4177 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4178 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4179 \
4180 subs num_blocks, num_blocks, #1; \
4181 bne 0b; \
4182 \
4183 1: /* final block */ \
4184 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4185 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4186 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4187 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4188 \
4189 ldmia sp!, { r4, pc }; \
4190 \
4191 2: /* overlap: store before reloading the framebuffer */ \
4192 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4193 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4194 \
4195 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4196 blend_blocks_add_mask_set_##mask_evaluate(); \
4197 blend_blocks_add_mask_copy_##mask_evaluate(); \
4198 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4199 bal 3b \
4200
4201
/* Instantiate the four quarter-add blenders: {textured, untextured} x mask {off, on}. */
4202blend_blocks_add_fourth_textured_builder(off)
4203blend_blocks_add_fourth_textured_builder(on)
4204blend_blocks_add_fourth_untextured_builder(off)
4205blend_blocks_add_fourth_untextured_builder(on)
4206
// TODO: Optimize this more. Need a scene that actually uses it for
// confirmation..

.align 3

// blend_blocks_textured_unblended_on(psx_gpu)
//
// Copies the queued block pixels to the framebuffer without blending. For
// each 64-byte block the draw mask is read from +0, the pixels from +16 and
// the framebuffer pointer from +44 (pixel_ptr + 28). A lane is written only
// when its draw-mask bits are clear and the framebuffer pixel it would
// replace has bit 15 clear.
//
// msb_mask is loaded but not applied to the pixels in this routine.
// num_blocks is decremented before it is tested, so at least one queued block
// is expected.
function(blend_blocks_textured_unblended_on)
  stmdb sp!, { r4, r14 }
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

  // draw_mask_ptr aliases psx_gpu (r0), so this must follow every use of it.
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov c_64, #64

 0:
  // Fetch one block: framebuffer pointer first, because the pixel load
  // post-increments pixel_ptr.
  ldr fb_ptr, [ pixel_ptr, #28 ]
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vld1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64

  // Protect lanes whose framebuffer pixel already has bit 15 set.
  vclt.s16 write_mask, fb_pixels, #0
  vorr.u16 draw_mask, draw_mask, write_mask

  // Unmasked lanes take the block pixel; masked lanes keep the framebuffer.
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [ fb_ptr ]

  subs num_blocks, num_blocks, #1
  bne 0b

  ldmia sp!, { r4, pc }
4252
4253
// Unblended textured blocks with mask evaluation off: nothing is done here,
// the routine returns to the caller immediately.
4254function(blend_blocks_textured_unblended_off)
4255 bx lr
4256
4257
// warmup
//
// Issues r0 aligned 16-byte vector loads starting at r1, advancing r1 by 64
// bytes after each load. The loaded data is discarded; only the memory reads
// matter. Returns immediately when r0 is zero. Clobbers r0, r1, r3 and the
// u_whole_8 / v_whole_8 vector registers.
function(warmup)
  mov r3, #64
  cmp r0, #0
  bxeq lr

 0:
  // The vector load does not touch the flags, so the counter can be updated
  // before it.
  subs r0, r0, #1
  vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
  bne 0b

  bx lr
4270
/*
 * Register aliases for render_block_fill_body.
 *
 * Names sharing a register are used at different times: psx_gpu / vram_ptr
 * (r0), x / num_unaligned (r2), y / width (r3), color_r / left_unaligned
 * (r14), color_g / right_unaligned (r4), color_b / pitch (r5).
 */
4271#undef color
4272#undef y
4273#undef height
4274
4275#define psx_gpu r0
4276#define color r1
4277#define x r2
4278#define y r3
4279
4280#define vram_ptr r0
4281#define width r3
4282#define height r12
4283
/* Byte offsets of the stack-passed width and height relative to sp on entry. */
4284#define parameter_width_offset 0
4285#define parameter_height_offset 4
4286
4287#define color_r r14
4288#define color_g r4
4289#define color_b r5
4290
4291#define left_unaligned r14
4292#define right_unaligned r4
4293#define pitch r5
4294#define num_unaligned r2
4295#define num_width r6
4296
4297#undef colors
4298
/* Fill color broadcast to all eight 16-bit lanes. */
4299#define colors q0
4300
4301.align 3
4302
// render_block_fill_body(psx_gpu, color, x, y, width, height)
//
// Fills a width x height rectangle of 16-bit pixels at (x, y) in VRAM.
//
//   r0      psx_gpu; the VRAM base pointer is read from it
//   r1      color; bits 3-7, 11-15 and 19-23 are extracted and packed into a
//           15-bit value as (bits 3-7) | (bits 11-15) << 5 | (bits 19-23) << 10
//   r2      x, in pixels
//   r3      y, in rows of 2048 bytes
//   [sp]    width, in pixels
//   [sp+4]  height, in rows
//
// Each row is written as an unaligned leading run (up to the next multiple of
// eight pixels), a run of aligned 8-pixel vector stores, and an unaligned
// trailing run. The vector stores carry a 128-bit alignment hint, so the VRAM
// base is expected to be 16-byte aligned.
//
// The loops are do-while loops, so every run length is checked for zero before
// its loop is entered: a zero count would otherwise wrap around and write far
// beyond the rectangle. For the same reason a height of zero returns at once,
// and the leading run is clamped to the rectangle width.
//
// Clobbers r0-r3, r12 and q0; r4-r6 are saved and restored.
function(render_block_fill_body)
  ldr vram_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
  ldr height, [ sp, #parameter_height_offset ]

  add vram_ptr, vram_ptr, y, lsl #11
  ldr width, [ sp, #parameter_width_offset ]

  // Nothing to draw; return before anything is pushed.
  cmp height, #0
  bxeq lr

  add vram_ptr, vram_ptr, x, lsl #1
  stmdb sp!, { r4 - r6, r14 }

  ubfx color_r, color, #3, #5
  ubfx color_g, color, #11, #5

  ubfx color_b, color, #19, #5
  orr color, color_r, color_g, lsl #5

  orr color, color, color_b, lsl #10
  add left_unaligned, x, #0x7

  bic left_unaligned, left_unaligned, #0x7
  vdup.u16 colors, color

  // left_unaligned = pixels from x up to the next multiple of eight
  sub left_unaligned, left_unaligned, x
  mov pitch, #2048

  // pitch = bytes from the end of one row of the rectangle to the start of
  // the next; computed from the full width before width is split up.
  sub pitch, pitch, width, lsl #1

  // A rectangle narrower than the leading run consists of that run only.
  cmp left_unaligned, width
  movhi left_unaligned, width
  sub width, width, left_unaligned

  and right_unaligned, width, #0x7
  bic width, width, #0x7

 0:
  movs num_unaligned, left_unaligned
  beq 2f

 1:
  strh color, [ vram_ptr ], #2

  subs num_unaligned, num_unaligned, #1
  bne 1b

 2:
  // Skip the vector loop when fewer than eight aligned pixels remain.
  movs num_width, width, lsr #3
  beq 4f

 3:
  vst1.u32 { colors }, [ vram_ptr, :128 ]!
  subs num_width, num_width, #1
  bne 3b

 4:
  movs num_unaligned, right_unaligned
  beq 6f

 5:
  strh color, [ vram_ptr ], #2

  subs num_unaligned, num_unaligned, #1
  bne 5b

 6:
  add vram_ptr, vram_ptr, pitch
  subs height, height, #1
  bne 0b

  ldmia sp!, { r4 - r6, pc }
4366
4367
/*
 * Register aliases for the sprite setup code that follows. The general
 * purpose names are defined in groups that reuse the same registers
 * (r1-r6 and r8-r12 each carry more than one name), so names from different
 * groups cannot all be live at once.
 */
4368#undef x
4369#undef y
4370#undef width
4371#undef height
4372#undef fb_ptr
4373#undef texture_mask
4374#undef num_blocks
4375#undef temp
4376#undef dirty_textures_mask
4377#undef clut_ptr
4378#undef current_texture_mask
4379
4380#define psx_gpu r0
4381#define x r1
4382#define y r2
4383#define u r3
4384#define v r4
4385#define width r5
4386#define height r6
4387#define offset_u r8
4388#define offset_v r9
4389#define offset_u_right r10
4390#define width_rounded r11
4391#define height_rounded r12
4392
4393#define texture_offset_base r1
4394#define tile_width r2
4395#define tile_height r3
4396#define num_blocks r4
4397#define block r5
4398#define sub_tile_height r6
4399#define fb_ptr r7
4400#define texture_mask r8
4401#define column_data r9
4402#define texture_offset r10
4403#define tiles_remaining r11
4404#define fb_ptr_advance_column r12
4405#define texture_block_ptr r14
4406
4407#define texture_page_ptr r3
4408#define left_block_mask r4
4409#define right_block_mask r5
4410#define texture_mask_rev r10
4411#define control_mask r11
4412
4413#define dirty_textures_mask r4
4414#define clut_ptr r5
4415#define current_texture_mask r6
4416
4417
4418#undef texels
4419#undef clut_low_a
4420#undef clut_low_b
4421#undef clut_high_a
4422#undef clut_high_b
4423#undef clut_a
4424#undef clut_b
4425#undef texels_low
4426#undef texels_high
4427
/* Vector names. d2/d3 are the halves of q1, d4/d5 of q2 and d6/d7 of q3. */
4428#define texels d0
4429#define draw_masks_fb_ptrs q1
4430
4431#define draw_mask_fb_ptr_left d2
4432#define draw_mask_fb_ptr_right d3
4433
4434#define clut_low_a d4
4435#define clut_low_b d5
4436#define clut_high_a d6
4437#define clut_high_b d7
4438
4439#define block_masks d8
4440#define block_masks_shifted d9
4441
4442#define clut_a q2
4443#define clut_b q3
4444
4445#define texels_low d10
4446#define texels_high d11
4447
4448
// Overflow helper for sprite tiles that emit one block per row. It is reached
// with a conditional bl when adding the tile's blocks would push num_blocks
// past MAX_BLOCKS.
//
// Flushes the pending render blocks, then restarts the block buffer:
//   block      = start of the block array inside psx_gpu
//   num_blocks = sub_tile_height (the blocks the caller is about to write)
//
// r0 - r3, r12, lr and the live NEON state d2 - d9 (masks, CLUT halves and
// block_masks) are preserved around the call.
setup_sprite_flush_blocks_single:
  vpush { d2 - d9 }
  push { r0 - r3, r12, lr }

  bl flush_render_block_buffer

  pop { r0 - r3, r12, lr }
  vpop { d2 - d9 }

  mov num_blocks, sub_tile_height
  add block, psx_gpu, #psx_gpu_blocks_offset

  bx lr
4462
4463
// Overflow helper for sprite tiles that emit two blocks per row. It is
// reached with a conditional bl when adding the tile's blocks would push
// num_blocks past MAX_BLOCKS.
//
// Flushes the pending render blocks, then restarts the block buffer:
//   block      = start of the block array inside psx_gpu
//   num_blocks = 2 * sub_tile_height (the blocks the caller is about to write)
//
// r0 - r3, r12, lr and the live NEON state d2 - d9 (masks, CLUT halves and
// block_masks) are preserved around the call.
setup_sprite_flush_blocks_double:
  vpush { d2 - d9 }
  push { r0 - r3, r12, lr }

  bl flush_render_block_buffer

  pop { r0 - r3, r12, lr }
  vpop { d2 - d9 }

  mov num_blocks, sub_tile_height, lsl #1
  add block, psx_gpu, #psx_gpu_blocks_offset

  bx lr
4477
4478
// Trampoline around update_texture_4bpp_cache that keeps r0 - r3 intact for
// the sprite setup code and returns by popping the saved lr into pc.
setup_sprite_update_texture_4bpp_cache:
  push { r0 - r3, lr }
  bl update_texture_4bpp_cache
  pop { r0 - r3, pc }
4483
4484
// Trampoline around update_texture_8bpp_cache that keeps r0 - r3 intact for
// the sprite setup code and returns by popping the saved lr into pc.
setup_sprite_update_texture_8bpp_cache:
  push { r0 - r3, lr }
  bl update_texture_8bpp_cache
  pop { r0 - r3, pc }
4489
4490
// Per-sprite initialisation for 4bpp textures.
//
// Loads 32 bytes from clut_ptr into clut_a / clut_b and de-interleaves them
// with vuzp.u8, so that clut_a holds the even-numbered bytes and clut_b the
// odd-numbered bytes (the layout consumed by the vtbl lookups in the 4bpp
// tile macros). If any bit of current_texture_mask is set in the dirty 4bpp
// mask, the texture cache is refreshed first. The NEON instructions sit
// between tst and blne; they do not touch the condition flags.
4491#define setup_sprite_tiled_initialize_4bpp() \
4492 ldr dirty_textures_mask, \
4493 [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
4494 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4495 \
4496 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4497 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4498 \
4499 tst current_texture_mask, dirty_textures_mask; \
4500 vuzp.u8 clut_a, clut_b; \
4501 \
4502 blne setup_sprite_update_texture_4bpp_cache \
4503
// Per-sprite initialisation for 8bpp textures: refreshes the texture cache
// when any bit of current_texture_mask is set in the dirty 8bpp mask. No CLUT
// is loaded here.
4504#define setup_sprite_tiled_initialize_8bpp() \
4505 ldr dirty_textures_mask, \
4506 [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
4507 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4508 \
4509 tst current_texture_mask, dirty_textures_mask; \
4510 blne setup_sprite_update_texture_8bpp_cache \
4511
4512
// Intentionally empty macro.
4513#define setup_sprite_tile_setup_block_no(side, offset, texture_mode) \
4514
// Operand fragments giving the number of blocks one tile adds: one block per
// row (single) or two blocks per row (double).
4515#define setup_sprite_block_count_single() \
4516 sub_tile_height \
4517
4518#define setup_sprite_block_count_double() \
4519 sub_tile_height, lsl #1 \
4520
// Reserves room for the next tile: adds its block count to num_blocks and, if
// the total is greater than MAX_BLOCKS (signed compare), calls the matching
// setup_sprite_flush_blocks_<type> helper, which flushes the buffer and
// resets block / num_blocks.
4521#define setup_sprite_tile_add_blocks(type) \
4522 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4523 cmp num_blocks, #MAX_BLOCKS; \
4524 \
4525 blgt setup_sprite_flush_blocks_##type \
4526
4527
// Emits one full-width 4bpp tile column segment: for each of sub_tile_height
// rows, two 64-byte blocks (left and right 8 texels).
//
// Per block:
//   - 8 index bytes are loaded from
//     texture_page_ptr + (texture_offset & texture_mask); the right half uses
//     texture_offset + 8
//   - the indices are translated through the CLUT byte tables with vtbl and
//     stored interleaved (vst2.u8) as 16 bytes at block + 0
//   - the draw-mask register, whose upper 32-bit lane is set to fb_ptr, is
//     stored at block + 40
// fb_ptr moves 16 bytes right for the second block and then to the next
// 2048-byte row. texture_offset advances 0x10 per row and a further 0xF00
// once the rows are done. The updated num_blocks is written back to psx_gpu
// at the end. The edge argument is not used by this variant.
// Uses local label 4.
4528#define setup_sprite_tile_full_4bpp(edge) \
4529 setup_sprite_tile_add_blocks(double); \
4530 \
4531 4: \
4532 and texture_block_ptr, texture_offset, texture_mask; \
4533 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4534 \
4535 pld [ fb_ptr ]; \
4536 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4537 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4538 \
4539 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4540 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4541 \
4542 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4543 add texture_block_ptr, texture_offset, #8; \
4544 \
4545 and texture_block_ptr, texture_block_ptr, texture_mask; \
4546 add block, block, #40; \
4547 \
4548 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4549 add fb_ptr, fb_ptr, #16; \
4550 \
4551 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4552 add block, block, #24; \
4553 \
4554 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4555 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4556 \
4557 pld [ fb_ptr ]; \
4558 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4559 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4560 \
4561 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4562 add block, block, #40; \
4563 \
4564 add texture_offset, texture_offset, #0x10; \
4565 add fb_ptr, fb_ptr, #(2048 - 16); \
4566 \
4567 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4568 add block, block, #24; \
4569 \
4570 subs sub_tile_height, sub_tile_height, #1; \
4571 bne 4b; \
4572 \
4573 add texture_offset, texture_offset, #0xF00; \
4574 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4575
4576
// Emits one half-width 4bpp tile column segment: for each of sub_tile_height
// rows, a single 64-byte block covering 8 texels.
//
// Per block:
//   - 8 index bytes are loaded from
//     texture_page_ptr + (texture_offset & texture_mask)
//   - the indices are translated through the CLUT byte tables with vtbl and
//     stored interleaved (vst2.u8) as 16 bytes at block + 0
//   - draw_mask_fb_ptr_<edge>, whose upper 32-bit lane is set to fb_ptr, is
//     stored at block + 40
// fb_ptr moves down one 2048-byte row per block. texture_offset advances 0x10
// per row and a further 0xF00 once the rows are done. The updated num_blocks
// is written back to psx_gpu at the end.
//
// texture_block_ptr is recomputed from texture_offset at the top of every
// iteration and is not read after the texel load, so it is not touched again
// inside the loop.
// Uses local label 4.
4577#define setup_sprite_tile_half_4bpp(edge) \
4578 setup_sprite_tile_add_blocks(single); \
4579 \
4580 4: \
4581 and texture_block_ptr, texture_offset, texture_mask; \
4582 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4583 \
4584 pld [ fb_ptr ]; \
4585 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4586 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4587 \
4588 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4589 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4590 \
4591 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4592 add block, block, #40; \
4593 \
4595 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4596 \
4597 add block, block, #24; \
4598 add texture_offset, texture_offset, #0x10; \
4599 \
4600 add fb_ptr, fb_ptr, #2048; \
4601 subs sub_tile_height, sub_tile_height, #1; \
4602 \
4603 bne 4b; \
4604 \
4605 add texture_offset, texture_offset, #0xF00; \
4606 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4607
4608
// Emits one full-width 8bpp tile column segment: for each of sub_tile_height
// rows, two 64-byte blocks (left and right 8 texels).
//
// block is biased by +16 for the duration of the loop, so within each block:
//   - the 8 raw index bytes are stored at block + 16 (no CLUT lookup here;
//     this is the layout read by texture_sprite_blocks_8bpp)
//   - the draw-mask register, whose upper 32-bit lane is set to fb_ptr, is
//     stored at block + 40
// The bias is removed again after the loop. fb_ptr moves 16 bytes right for
// the second block and then to the next 2048-byte row. texture_offset
// advances 0x10 per row and a further 0xF00 once the rows are done. The
// updated num_blocks is written back to psx_gpu at the end. The edge argument
// is not used by this variant.
// Uses local label 4.
4609#define setup_sprite_tile_full_8bpp(edge) \
4610 setup_sprite_tile_add_blocks(double); \
4611 add block, block, #16; \
4612 \
4613 4: \
4614 and texture_block_ptr, texture_offset, texture_mask; \
4615 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4616 \
4617 pld [ fb_ptr ]; \
4618 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4619 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4620 \
4621 add texture_block_ptr, texture_offset, #8; \
4622 vst1.u32 { texels }, [ block, :64 ]; \
4623 \
4624 and texture_block_ptr, texture_block_ptr, texture_mask; \
4625 add block, block, #24; \
4626 \
4627 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4628 \
4629 add fb_ptr, fb_ptr, #16; \
4630 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4631 \
4632 add block, block, #40; \
4633 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4634 pld [ fb_ptr ]; \
4635 \
4636 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4637 vst1.u32 { texels }, [ block, :64 ]; \
4638 add block, block, #24; \
4639 \
4640 add texture_offset, texture_offset, #0x10; \
4641 add fb_ptr, fb_ptr, #(2048 - 16); \
4642 \
4643 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4644 add block, block, #40; \
4645 \
4646 subs sub_tile_height, sub_tile_height, #1; \
4647 bne 4b; \
4648 \
4649 sub block, block, #16; \
4650 add texture_offset, texture_offset, #0xF00; \
4651 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4652
4653
// Emits one half-width 8bpp tile column segment: for each of sub_tile_height
// rows, a single 64-byte block covering 8 texels.
//
// block is biased by +16 for the duration of the loop, so within each block
// the 8 raw index bytes land at block + 16 and draw_mask_fb_ptr_<edge> (upper
// 32-bit lane set to fb_ptr) lands at block + 40. The bias is removed after
// the loop. fb_ptr moves down one 2048-byte row per block. texture_offset
// advances 0x10 per row and a further 0xF00 once the rows are done. The
// updated num_blocks is written back to psx_gpu at the end.
// Uses local label 4.
4654#define setup_sprite_tile_half_8bpp(edge) \
4655 setup_sprite_tile_add_blocks(single); \
4656 add block, block, #16; \
4657 \
4658 4: \
4659 and texture_block_ptr, texture_offset, texture_mask; \
4660 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4661 pld [ fb_ptr ]; \
4662 \
4663 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4664 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4665 \
4666 vst1.u32 { texels }, [ block, :64 ]; \
4667 add block, block, #24; \
4668 \
4669 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4670 add block, block, #40; \
4671 \
4672 add texture_offset, texture_offset, #0x10; \
4673 add fb_ptr, fb_ptr, #2048; \
4674 \
4675 subs sub_tile_height, sub_tile_height, #1; \
4676 bne 4b; \
4677 \
4678 sub block, block, #16; \
4679 add texture_offset, texture_offset, #0xF00; \
4680 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4681
4682
// Column pre/post adjustment hooks, selected by edge mode (half / full) and,
// for half columns, by which 8-texel half is emitted.
//
// Pre-adjust always seeds texture_offset from texture_offset_base. When only
// the right half of a column is emitted, the texture offset starts 8 texels
// in and fb_ptr is moved 16 bytes right; the matching post-adjust moves
// fb_ptr back. All other post-adjust variants are empty.
4683#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4684 add texture_offset, texture_offset_base, #8; \
4685 add fb_ptr, fb_ptr, #16 \
4686
4687#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4688 mov texture_offset, texture_offset_base \
4689
4690#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4691 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4692
4693#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4694 mov texture_offset, texture_offset_base \
4695
4696#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4697 sub fb_ptr, fb_ptr, #16 \
4698
4699#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4700
4701#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4702 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4703
4704#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4705
4706
// Emits one column that fits in a single tile vertically: column_data is the
// row count itself.
4707#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode) \
4708 mov sub_tile_height, column_data; \
4709 setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \
4710 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4711 setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \
4712
// Emits one column spanning several tiles vertically. column_data is unpacked
// as:
//   bits  0 -  7  rows in the first tile
//   bits  8 - 15  rows in the last tile
//   bits 16 - 31  tile count used to drive the loop (tiles_remaining)
// After the first tile, tiles_remaining - 1 tiles of 16 rows are emitted
// (possibly none), followed by the last tile.
// Uses local labels 2 and 3 (the tile macros themselves use 4).
4713#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode) \
4714 and sub_tile_height, column_data, #0xFF; \
4715 mov tiles_remaining, column_data, lsr #16; \
4716 setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge); \
4717 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4718 \
4719 subs tiles_remaining, tiles_remaining, #1; \
4720 beq 2f; \
4721 \
4722 3: \
4723 mov sub_tile_height, #16; \
4724 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4725 subs tiles_remaining, tiles_remaining, #1; \
4726 bne 3b; \
4727 \
4728 2: \
4729 uxtb sub_tile_height, column_data, ror #8; \
4730 setup_sprite_tile_##edge_mode##_##texture_mode(edge); \
4731 setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge) \
4732
4733
// Builds column_data for a sprite one tile high (just the height) and loads
// texture_page_ptr.
4734#define setup_sprite_column_data_single() \
4735 mov column_data, height; \
4736 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \
4737
// Builds the packed column_data for a sprite several tiles high:
//   bits  0 -  7  16 - offset_v              (rows in the first tile)
//   bits  8 - 15  (height_rounded & 0xF) + 1 (rows in the last tile)
//   bits 16 - 31  tile_height - 1
// texture_page_ptr shares r3 with tile_height, so it is loaded only after
// tile_height has been merged into column_data.
4738#define setup_sprite_column_data_multi() \
4739 and height_rounded, height_rounded, #0xF; \
4740 rsb column_data, offset_v, #16; \
4741 \
4742 add height_rounded, height_rounded, #1; \
4743 sub tile_height, tile_height, #1; \
4744 \
4745 orr column_data, column_data, tile_height, lsl #16; \
4746 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
4747 \
4748 orr column_data, column_data, height_rounded, lsl #8 \
4749
// Generates the handler
// setup_sprite_<texture_mode>_single_<multi_height>_<edge_mode>_<edge> for a
// sprite that is a single tile column wide.
//
// block_masks holds the left mask in its low word and the right mask in its
// high word. Because one column carries both edges, the two words are OR-ed
// together (vext swaps them, vorr merges); byte 0 of the result is then
// broadcast as the left-half draw mask and byte 1 as the right-half draw
// mask. The handler returns by popping r4 - r11 and pc.
4750#define setup_sprite_tile_column_width_single(texture_mode, multi_height, \
4751 edge_mode, edge) \
4752 setup_sprite_##texture_mode##_single_##multi_height##_##edge_mode##_##edge: \
4753 setup_sprite_column_data_##multi_height(); \
4754 vext.32 block_masks_shifted, block_masks, block_masks, #1; \
4755 vorr.u32 block_masks, block_masks, block_masks_shifted; \
4756 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4757 vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \
4758 \
4759 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, \
4760 texture_mode); \
4761 ldmia sp!, { r4 - r11, pc } \
4762
// Steps texture_offset_base to the next tile column by adding 0x100. If bits
// 8 - 11 become zero as a result, 0x1000 is subtracted again, so the column
// index wraps within those four bits instead of carrying into bit 12.
4763#define setup_sprite_tiled_advance_column() \
4764 add texture_offset_base, texture_offset_base, #0x100; \
4765 tst texture_offset_base, #0xF00; \
4766 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4767
// Generates the handler
// setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode> for a
// sprite that is at least two tile columns wide.
//
// fb_ptr_advance_column = 32 - (height << 11): after a column has walked
// fb_ptr down `height` rows, adding this moves it back to the top and 32
// bytes (one tile column) to the right.
//
// Column order:
//   - first column: draw masks from bytes 0 / 1 of block_masks (the left
//     mask word). It is invoked with edge = right because, when this column
//     is a half column, the right half is the one emitted.
//   - tile_width - 2 middle columns: always full, draw masks all zero.
//   - last column: draw masks from bytes 4 / 5 of block_masks (the right
//     mask word), invoked with edge = left.
// The vmov that zeroes the masks sits between subs and beq; it does not
// affect the condition flags. Uses local labels 0 and 1. The handler returns
// by popping r4 - r11 and pc.
4768#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
4769 right_mode) \
4770 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode: \
4771 setup_sprite_column_data_##multi_height(); \
4772 mov fb_ptr_advance_column, #32; \
4773 \
4774 sub fb_ptr_advance_column, height, lsl #11; \
4775 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4776 \
4777 vdup.u8 draw_mask_fb_ptr_right, block_masks[1]; \
4778 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm); \
4779 \
4780 subs tile_width, tile_width, #2; \
4781 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4782 \
4783 vmov.u8 draw_masks_fb_ptrs, #0; \
4784 beq 1f; \
4785 \
4786 0: \
4787 setup_sprite_tiled_advance_column(); \
4788 setup_sprite_tile_column_height_##multi_height(full, none, tm); \
4789 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4790 subs tile_width, tile_width, #1; \
4791 bne 0b; \
4792 \
4793 1: \
4794 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4795 vdup.u8 draw_mask_fb_ptr_right, block_masks[5]; \
4796 \
4797 setup_sprite_tiled_advance_column(); \
4798 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm); \
4799 ldmia sp!, { r4 - r11, pc } \
4800
4801
4802// r0: psx_gpu
4803// r1: x
4804// r2: y
4805// r3: u
4806// [ sp ]: v
4807// [ sp + 4 ]: width
4808// [ sp + 8 ]: height
4809// [ sp + 12 ]: color (unused)
4810
// setup_sprite_tiled_builder(texture_mode) instantiates the 14 specialised
// column handlers for one texture mode, followed by the entry point
// setup_sprite_<texture_mode>, which prepares the shared state and jumps to
// one of them through a table.
//
// The entry point pushes r4 - r11 and r14 (36 bytes), which is why the stack
// arguments are read at sp + 36 / 40 / 44. The handlers pop the same set to
// return.
//
// control_mask selects the handler:
//   bit 0 (0x1)  tile_width  == 1
//   bit 1 (0x2)  tile_height == 1
//   bit 2 (0x4)  low byte of left_block_mask is 0xFF
//   bit 3 (0x8)  bits 8 - 15 of right_block_mask are 0xFF
// NOTE(review): table slot 13 is zero and there is no slot 15, so those
// combinations are presumably unreachable for valid sprite sizes - confirm.
4811#define setup_sprite_tiled_builder(texture_mode) \
4812 \
4813setup_sprite_tile_column_width_multi(texture_mode, multi, full, full); \
4814setup_sprite_tile_column_width_single(texture_mode, multi, full, none); \
4815setup_sprite_tile_column_width_multi(texture_mode, single, full, full); \
4816setup_sprite_tile_column_width_single(texture_mode, single, full, none); \
4817setup_sprite_tile_column_width_multi(texture_mode, multi, half, full); \
4818setup_sprite_tile_column_width_single(texture_mode, multi, half, right); \
4819setup_sprite_tile_column_width_multi(texture_mode, single, half, full); \
4820setup_sprite_tile_column_width_single(texture_mode, single, half, right); \
4821setup_sprite_tile_column_width_multi(texture_mode, multi, full, half); \
4822setup_sprite_tile_column_width_single(texture_mode, multi, half, left); \
4823setup_sprite_tile_column_width_multi(texture_mode, single, full, half); \
4824setup_sprite_tile_column_width_single(texture_mode, single, half, left); \
4825setup_sprite_tile_column_width_multi(texture_mode, multi, half, half); \
4826setup_sprite_tile_column_width_multi(texture_mode, single, half, half); \
4827 \
4828.align 4; \
4829 \
4830function(setup_sprite_##texture_mode) \
4831 stmdb sp!, { r4 - r11, r14 }; \
4832 setup_sprite_tiled_initialize_##texture_mode(); \
4833 \
4834 ldr v, [ sp, #36 ]; \
4835 and offset_u, u, #0xF; \
4836 \
4837 ldr width, [ sp, #40 ]; \
4838 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]; \
4839 \
4840 ldr height, [ sp, #44 ]; \
4841 add fb_ptr, fb_ptr, y, lsl #11; \
4842 \
4843 add fb_ptr, fb_ptr, x, lsl #1; \
4844 and offset_v, v, #0xF; \
4845 \
 /* fb_ptr is moved left to where the 16-texel tile containing u starts */ \
4846 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4847 add width_rounded, offset_u, width; \
4848 \
4849 add height_rounded, offset_v, height; \
4850 add width_rounded, width_rounded, #15; \
4851 \
4852 add height_rounded, height_rounded, #15; \
4853 mov tile_width, width_rounded, lsr #4; \
4854 \
4855 /* texture_offset_base = VH-VL-00-00 */\
4856 mov texture_offset_base, v, lsl #8; \
4857 and offset_u_right, width_rounded, #0xF; \
4858 \
4859 /* texture_offset_base = VH-UH-UL-00 */\
4860 bfi texture_offset_base, u, #4, #8; \
4861 movw right_block_mask, #0xFFFE; \
4862 \
4863 /* texture_offset_base = VH-UH-VL-00 */\
4864 bfi texture_offset_base, v, #4, #4; \
4865 movw left_block_mask, #0xFFFF; \
4866 \
4867 mov tile_height, height_rounded, lsr #4; \
4868 mvn left_block_mask, left_block_mask, lsl offset_u; \
4869 \
4870 /* texture_mask = HH-HL-WH-WL */\
4871 ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
4872 mov right_block_mask, right_block_mask, lsl offset_u_right; \
4873 \
4874 /* texture_mask_rev = WH-WL-HH-HL */\
4875 rev16 texture_mask_rev, texture_mask; \
4876 vmov block_masks, left_block_mask, right_block_mask; \
4877 \
4878 /* texture_mask = HH-HL-HL-WL */\
4879 bfi texture_mask, texture_mask_rev, #4, #4; \
4880 /* texture_mask_rev = 00-00-00-WH */\
4881 mov texture_mask_rev, texture_mask_rev, lsr #12; \
4882 \
4883 /* texture_mask = HH-WH-HL-WL */\
4884 bfi texture_mask, texture_mask_rev, #8, #4; \
4885 and left_block_mask, left_block_mask, #0xFF; \
4886 \
4887 mov control_mask, #0; \
4888 cmp left_block_mask, #0xFF; \
4889 \
4890 uxtb right_block_mask, right_block_mask, ror #8; \
4891 orreq control_mask, control_mask, #0x4; \
4892 \
4893 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4894 cmp right_block_mask, #0xFF; \
4895 \
4896 orreq control_mask, control_mask, #0x8; \
4897 cmp tile_width, #1; \
4898 \
4899 add block, psx_gpu, #psx_gpu_blocks_offset; \
4900 orreq control_mask, control_mask, #0x1; \
4901 \
4902 cmp tile_height, #1; \
 /* block = first free 64-byte block in the buffer */ \
4903 add block, block, num_blocks, lsl #6; \
4904 \
4905 orreq control_mask, control_mask, #0x2; \
 /* pc reads as this instruction + 8, so the nop pads up to the table */ \
4906 ldr pc, [ pc, control_mask, lsl #2 ]; \
4907 nop; \
4908 \
4909 .word setup_sprite_##texture_mode##_multi_multi_full_full; \
4910 .word setup_sprite_##texture_mode##_single_multi_full_none; \
4911 .word setup_sprite_##texture_mode##_multi_single_full_full; \
4912 .word setup_sprite_##texture_mode##_single_single_full_none; \
4913 .word setup_sprite_##texture_mode##_multi_multi_half_full; \
4914 .word setup_sprite_##texture_mode##_single_multi_half_right; \
4915 .word setup_sprite_##texture_mode##_multi_single_half_full; \
4916 .word setup_sprite_##texture_mode##_single_single_half_right; \
4917 .word setup_sprite_##texture_mode##_multi_multi_full_half; \
4918 .word setup_sprite_##texture_mode##_single_multi_half_left; \
4919 .word setup_sprite_##texture_mode##_multi_single_full_half; \
4920 .word setup_sprite_##texture_mode##_single_single_half_left; \
4921 .word setup_sprite_##texture_mode##_multi_multi_half_half; \
4922 .word 0x00000000; \
4923 .word setup_sprite_##texture_mode##_multi_single_half_half \
4924
4925
// Instantiate the handlers and entry points setup_sprite_4bpp and
// setup_sprite_8bpp.
4926setup_sprite_tiled_builder(4bpp);
4927setup_sprite_tiled_builder(8bpp);
4928
4929
// Register aliases for texture_sprite_blocks_8bpp.
// psx_gpu and block_ptr both name r0: once block_ptr has been computed the
// psx_gpu pointer is no longer available. texels_01 .. texels_67 reuse the
// registers of texel_0, texel_2, texel_4 and texel_6 for the packed pairs.
4930#undef block_ptr
4931#undef num_blocks
4932#undef clut_ptr
4933
4934#define psx_gpu r0
4935#define block_ptr r0
4936#define num_blocks r1
4937#define clut_ptr r2
4938#define texel_shift_mask r3
4939#define block_pixels_a r4
4940#define block_pixels_b r5
4941#define texel_0 r6
4942#define texel_2 r7
4943#define texel_4 r8
4944#define texel_6 r9
4945#define texel_1 r10
4946#define texel_3 r11
4947#define texel_5 r12
4948#define texel_7 r14
4949#define texels_01 r6
4950#define texels_23 r7
4951#define texels_45 r8
4952#define texels_67 r9
4953
// texture_sprite_blocks_8bpp: converts the 8bpp indices of every queued block
// into 16-bit colors through the CLUT.
//
// In: r0 = psx_gpu.
// For each of num_blocks blocks (64 bytes apart) the 8 index bytes at
// block + 16 .. block + 23 are looked up as halfwords in clut_ptr and the 8
// results are written to block + 0 .. block + 15.
//
// texel_shift_mask is 0xFF << 1, so each "and ..., lsl/lsr" below extracts
// one index byte already scaled by 2, i.e. as a byte offset into the CLUT.
//
// NOTE(review): the loop is a do-while and the first index word is loaded
// before it, so num_blocks is presumably never 0 here - confirm against the
// caller.
// NOTE(review): the index word of the following block is pre-loaded in every
// iteration, so the last iteration reads 4 bytes at offset 16 of the block
// after the last one; the value is never used.
4954function(texture_sprite_blocks_8bpp)
4955 stmdb sp!, { r4 - r11, r14 }
4956 movw texel_shift_mask, #(0xFF << 1)
4957
4958 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4959 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
4960
// From here on r0 is block_ptr, not psx_gpu.
4961 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
4962 ldr block_pixels_a, [ block_ptr, #16 ]
4963
4964 0:
4965 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
4966 ldr block_pixels_b, [ block_ptr, #20 ]
4967
4968 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
4969 ldrh texel_0, [ clut_ptr, texel_0 ]
4970
4971 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
4972 ldrh texel_1, [ clut_ptr, texel_1 ]
4973
4974 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
// Pre-load the first four indices of the next block.
4975 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
4976
4977 ldrh texel_2, [ clut_ptr, texel_2 ]
4978 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
4979
4980 ldrh texel_3, [ clut_ptr, texel_3 ]
4981 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
4982
4983 ldrh texel_4, [ clut_ptr, texel_4 ]
4984 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
4985
4986 ldrh texel_5, [ clut_ptr, texel_5 ]
4987 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
4988
4989 ldrh texel_6, [ clut_ptr, texel_6 ]
// Pack pairs of 16-bit colors into words, lower-numbered texel in the low
// half.
4990 orr texels_01, texel_0, texel_1, lsl #16
4991
4992 ldrh texel_7, [ clut_ptr, texel_7 ]
4993 orr texels_23, texel_2, texel_3, lsl #16
4994
4995 orr texels_45, texel_4, texel_5, lsl #16
4996 str texels_01, [ block_ptr, #0 ]
4997
4998 orr texels_67, texel_6, texel_7, lsl #16
4999 str texels_23, [ block_ptr, #4 ]
5000
// The flags set here are consumed by the bne below; the stores and the add
// in between do not modify them.
5001 subs num_blocks, num_blocks, #1
5002 str texels_45, [ block_ptr, #8 ]
5003
5004 str texels_67, [ block_ptr, #12 ]
5005 add block_ptr, block_ptr, #64
5006
5007 bne 0b
5008
5009 ldmia sp!, { r4 - r11, pc }
5010
5011
// Register aliases for setup_sprite_16bpp and its flush helpers.
//
// Shared registers:
//   r1  x, texture_offset_base
//   r2  y, texture_mask, texture_mask_width
//   r3  u, texture_page_ptr, texture_mask_height
//   r4  v, num_blocks, left_mask_bits
//   r5  width, block, right_mask_bits
//   r8  left_offset, texture_offset
//   r9  width_rounded, blocks_remaining
// so a value is only valid until the next alias of its register is written.
5012#undef width_rounded
5013#undef texture_mask
5014#undef num_blocks
5015#undef texture_offset
5016
5017#define psx_gpu r0
5018#define x r1
5019#define y r2
5020#define u r3
5021#define v r4
5022#define width r5
5023#define height r6
5024#define left_offset r8
5025#define width_rounded r9
5026#define right_width r10
5027#define block_width r11
5028
5029#define texture_offset_base r1
5030#define texture_mask r2
5031#define texture_page_ptr r3
5032#define num_blocks r4
5033#define block r5
5034#define fb_ptr r7
5035#define texture_offset r8
5036#define blocks_remaining r9
5037#define fb_ptr_pitch r12
5038#define texture_block_ptr r14
5039
5040#define texture_mask_width r2
5041#define texture_mask_height r3
5042#define left_mask_bits r4
5043#define right_mask_bits r5
5044
5045
5046#undef block_masks
5047#undef block_masks_shifted
5048#undef texels
5049
// NEON aliases for the 16bpp path; texels is a full q register here (8
// pixels of 16 bits).
5050#define block_masks d0
5051#define block_masks_shifted d1
5052#define draw_mask_fb_ptr d2
5053#define texels q2
5054
5055
// Overflow helper for the one-block-per-row 16bpp path. It is reached with a
// conditional bl when adding one block would push num_blocks past
// MAX_BLOCKS.
//
// Flushes the pending render blocks, then restarts the block buffer:
//   block      = start of the block array inside psx_gpu
//   num_blocks = 1 (the block the caller is about to write)
//
// r0 - r3, r12, lr and d0 - d2 (block masks and the draw mask / fb_ptr
// register) are preserved around the call.
setup_sprites_16bpp_flush_single:
  vpush { d0 - d2 }
  push { r0 - r3, r12, lr }

  bl flush_render_block_buffer

  pop { r0 - r3, r12, lr }
  vpop { d0 - d2 }

  mov num_blocks, #1
  add block, psx_gpu, #psx_gpu_blocks_offset

  bx lr
5069
// Overflow helper for the multi-block-per-row 16bpp path. It is reached with
// a conditional bl when adding one row of blocks would push num_blocks past
// MAX_BLOCKS.
//
// Flushes the pending render blocks, then restarts the block buffer:
//   block      = start of the block array inside psx_gpu
//   num_blocks = block_width (the row the caller is about to write)
//
// r0 - r3, r12, lr and d0 - d2 (block masks and the draw mask / fb_ptr
// register) are preserved around the call.
setup_sprites_16bpp_flush_row:
  vpush { d0 - d2 }
  push { r0 - r3, r12, lr }

  bl flush_render_block_buffer

  pop { r0 - r3, r12, lr }
  vpop { d0 - d2 }

  mov num_blocks, block_width
  add block, psx_gpu, #psx_gpu_blocks_offset

  bx lr
5083
// setup_sprite_16bpp: queues render blocks for a sprite textured directly
// from 16bpp texels.
//
// In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
//      v, width and height are read at sp + 36 / 40 / 44 after r4 - r11 and
//      r14 (36 bytes) have been pushed.
//
// The sprite is cut into 8-pixel blocks aligned to multiples of 8 in u. For
// each block this writes, 64 bytes apart:
//   block +  0  16 bytes of texels loaded from
//               texture_page_ptr + (texture offset & texture_mask)
//   block + 40  8 bytes: the draw mask byte replicated in the low word and
//               fb_ptr in the high word
//
// All texture offsets here are byte offsets (2 * u + 2048 * v); texture_mask
// is built to match as 2 * mask_width + 2048 * mask_height.
//
// left_mask_bits  = ~(0xFF << (u & 7)) masks the pixels before u in the first
//                   block of a row.
// right_mask_bits = 0xFE << right_width masks the pixels past the end in the
//                   last block of a row.
// Blocks in between use an all-zero mask.
//
// Whenever adding the next block (or row of blocks) would take num_blocks
// past MAX_BLOCKS, the matching setup_sprites_16bpp_flush_* helper is called
// first.
5084function(setup_sprite_16bpp)
5085 stmdb sp!, { r4 - r11, r14 }
5086 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5087
5088 ldr v, [ sp, #36 ]
5089 add fb_ptr, fb_ptr, y, lsl #11
5090
5091 ldr width, [ sp, #40 ]
5092 add fb_ptr, fb_ptr, x, lsl #1
5093
5094 ldr height, [ sp, #44 ]
5095 and left_offset, u, #0x7
5096
5097 add texture_offset_base, u, u
5098 add width_rounded, width, #7
5099
5100 add texture_offset_base, v, lsl #11
5101 mov left_mask_bits, #0xFF
5102
5103 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5104 add width_rounded, width_rounded, left_offset
5105
5106 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
// fb_ptr is moved left to where the 8-pixel block containing u starts.
5107 sub fb_ptr, fb_ptr, left_offset, lsl #1
5108
5109 add texture_mask, texture_mask_width, texture_mask_width
5110 mov right_mask_bits, #0xFE
5111
5112 and right_width, width_rounded, #0x7
5113 mvn left_mask_bits, left_mask_bits, lsl left_offset
5114
5115 add texture_mask, texture_mask_height, lsl #11
5116 mov block_width, width_rounded, lsr #3
5117
5118 mov right_mask_bits, right_mask_bits, lsl right_width
// fb_ptr_pitch = 2048 + 16 - 16 * block_width: from the last block of one
// row (fb_ptr is not advanced past it) to the first block of the next row.
5119 movw fb_ptr_pitch, #(2048 + 16)
5120
5121 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
5122 vmov block_masks, left_mask_bits, right_mask_bits
5123
5124 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5125 add block, psx_gpu, #psx_gpu_blocks_offset
5126
// Align the byte offset down to the start of the 8-pixel (16-byte) block
// containing u. This has to match left_offset = u & 7 used for the mask and
// for fb_ptr, and the :128 alignment of the texel loads below; clearing only
// three bits would leave bit 3 set whenever u & 4.
5127 bic texture_offset_base, texture_offset_base, #0xF
5128 cmp block_width, #1
5129
5130 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5131 add block, block, num_blocks, lsl #6
5132
5133 bne 0f
5134
// --- One block per row: the single block carries both edges, so the left
// and right masks are OR-ed together (vext swaps the two words). ---
5135 vext.32 block_masks_shifted, block_masks, block_masks, #1
5136 vorr.u32 block_masks, block_masks, block_masks_shifted
5137 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5138
5139 1:
5140 add num_blocks, num_blocks, #1
5141 cmp num_blocks, #MAX_BLOCKS
5142 blgt setup_sprites_16bpp_flush_single
5143
5144 and texture_block_ptr, texture_offset_base, texture_mask
// The flags set here drive the bne at the end of the loop body; nothing in
// between modifies them.
5145 subs height, height, #1
5146
5147 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5148 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5149
5150 vst1.u32 { texels }, [ block, :128 ]
5151 add block, block, #40
5152
5153 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5154 pld [ fb_ptr ]
5155
5156 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5157
5158 add block, block, #24
5159 add texture_offset_base, texture_offset_base, #2048
5160 add fb_ptr, fb_ptr, #2048
5161 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5162 bne 1b
5163
5164 ldmia sp!, { r4 - r11, pc }
5165
// --- Two or more blocks per row. ---
5166 0:
5167 add num_blocks, num_blocks, block_width
5168 mov texture_offset, texture_offset_base
5169
5170 cmp num_blocks, #MAX_BLOCKS
5171 blgt setup_sprites_16bpp_flush_row
5172
5173 add texture_offset_base, texture_offset_base, #2048
5174 and texture_block_ptr, texture_offset, texture_mask
5175
5176 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5177 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5178
5179 vst1.u32 { texels }, [ block, :128 ]
5180 add block, block, #40
5181
// First block of the row: left edge mask (byte 0 of block_masks).
5182 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5183 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5184 pld [ fb_ptr ]
5185
5186 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// blocks_remaining = number of middle blocks; zero skips the middle loop.
5187 subs blocks_remaining, block_width, #2
5188
5189 add texture_offset, texture_offset, #16
5190 add fb_ptr, fb_ptr, #16
5191
// Middle blocks draw every pixel. The vmov does not touch the flags tested
// by the beq below.
5192 vmov.u8 draw_mask_fb_ptr, #0
5193
5194 add block, block, #24
5195 beq 2f
5196
5197 1:
5198 and texture_block_ptr, texture_offset, texture_mask
5199 subs blocks_remaining, blocks_remaining, #1
5200
5201 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5202 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5203
5204 vst1.u32 { texels }, [ block, :128 ]
5205 add block, block, #40
5206
5207 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5208 pld [ fb_ptr ]
5209
5210 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5211
5212 add texture_offset, texture_offset, #16
5213 add fb_ptr, fb_ptr, #16
5214
5215 add block, block, #24
5216 bne 1b
5217
// Last block of the row: right edge mask (byte 4 of block_masks).
5218 2:
5219 and texture_block_ptr, texture_offset, texture_mask
5220 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5221
5222 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5223 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5224
5225 vst1.u32 { texels }, [ block, :128 ]
5226 add block, block, #40
5227
5228 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5229 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5230
5231 add block, block, #24
5232 subs height, height, #1
5233
5234 add fb_ptr, fb_ptr, fb_ptr_pitch
5235 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5236
5237 bne 0b
5238
5239 ldmia sp!, { r4 - r11, pc }
5240
5241
// Register aliases for update_texture_4bpp_cache.
5242#undef texture_page_ptr
5243#undef vram_ptr
5244#undef dirty_textures_mask
5245#undef current_texture_mask
5246
5247#define psx_gpu r0
5248#define current_texture_page r1
5249#define texture_page_ptr r2
5250#define vram_ptr_a r3
5251#define current_texture_page_x r12
5252#define current_texture_page_y r4
5253#define dirty_textures_mask r5
5254#define tile_y r6
5255#define tile_x r7
5256#define sub_y r8
5257#define current_texture_mask r9
5258#define c_4096 r10
5259#define vram_ptr_b r11
5260
// NEON aliases. The merged results reuse their second operand's register
// (_ab is _b in q2, _cd is _c in q3). texel_block_expanded_d lives in q4,
// i.e. d8 / d9.
5261#define texel_block_a d0
5262#define texel_block_b d1
5263#define texel_block_expanded_a q1
5264#define texel_block_expanded_b q2
5265#define texel_block_expanded_ab q2
5266#define texel_block_expanded_c q3
5267#define texel_block_expanded_d q4
5268#define texel_block_expanded_cd q3
5269
// update_texture_4bpp_cache: rebuilds the cached copy of the current 4bpp
// texture page.
//
// In: r0 = psx_gpu.
// Clears the bits of current_texture_mask from the dirty 4bpp mask in
// psx_gpu, then expands the page from VRAM into the buffer at
// texture_page_ptr. Each source byte holds two 4-bit texels and is widened to
// two bytes, low nibble first, one texel per byte.
//
// Source addressing: the page starts at
//   vram_ptr + (page >> 4) * (256 * 2048) + (page & 0xF) * 128.
// Output order: 16 tile rows x 16 tile columns; a tile is 16 texels wide by
// 16 rows and is written as 256 contiguous bytes.
//
// All core registers in r4 - r11 and the NEON registers q0 - q4 are
// preserved. q4 (d8 / d9) is written here as texel_block_expanded_d and is
// callee-saved under the ARM procedure call standard, so it is part of the
// saved range.
5270function(update_texture_4bpp_cache)
5271 stmdb sp!, { r4 - r11, r14 }
5272 vpush { q0 - q4 }
5273
5274 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5275
5276 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5277 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5278
5279 and current_texture_page_x, current_texture_page, #0xF
5280 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
5281
5282 mov current_texture_page_y, current_texture_page, lsr #4
5283 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5284
5285 add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5286 mov tile_y, #16
5287
5288 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
// This page is about to be refreshed: drop its bits from the dirty mask.
5289 bic dirty_textures_mask, current_texture_mask
5290
5291 mov tile_x, #16
5292 str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5293
5294 mov sub_y, #8
5295 movw c_4096, #4096
5296
// vram_ptr_b walks the rows in between those of vram_ptr_a; both step by two
// rows (4096 bytes) per load.
5297 add vram_ptr_b, vram_ptr_a, #2048
5298
// One iteration converts two VRAM rows of 8 bytes (16 texels) each into 32
// output bytes; 8 iterations make one tile.
5299 0:
5300 vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
5301 vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096
5302
// Widen each byte to 16 bits twice: once as is and once shifted left by 4.
5303 vmovl.u8 texel_block_expanded_a, texel_block_a
5304 vshll.u8 texel_block_expanded_b, texel_block_a, #4
5305 vmovl.u8 texel_block_expanded_c, texel_block_b
5306 vshll.u8 texel_block_expanded_d, texel_block_b, #4
5307
// Clearing bits 4 - 7 leaves the low nibble in the low byte of the
// unshifted copy and the high nibble in the high byte of the shifted copy.
5308 vbic.u16 texel_block_expanded_a, #0x00F0
5309 vbic.u16 texel_block_expanded_b, #0x00F0
5310 vbic.u16 texel_block_expanded_c, #0x00F0
5311 vbic.u16 texel_block_expanded_d, #0x00F0
5312
5313 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
5314 texel_block_expanded_b
5315 vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
5316 texel_block_expanded_d
5317
5318 vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
5319 [ texture_page_ptr, :256 ]!
5320
5321 subs sub_y, sub_y, #1
5322 bne 0b
5323
// Next tile in this tile row: 8 source bytes to the right, back up 16 rows.
5324 mov sub_y, #8
5325 add vram_ptr_a, vram_ptr_a, #8
5326 add vram_ptr_b, vram_ptr_b, #8
5327
5328 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5329 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5330
5331 subs tile_x, tile_x, #1
5332 bne 0b
5333
// Next tile row: down 16 rows and back to the left edge of the page.
5334 mov tile_x, #16
5335 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5336 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5337
5338 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5339 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5340
5341 subs tile_y, tile_y, #1
5342 bne 0b
5343
5344 vpop { q0 - q4 }
5345 ldmia sp!, { r4 - r11, pc }
5346
5347
// Register aliases for update_texture_8bpp_cache_slice. texture_page (r1) is
// read by that routine without being written first, i.e. it is an input.
5348#undef current_texture_page
5349
5350#define psx_gpu r0
5351#define texture_page r1
5352#define texture_page_ptr r2
5353#define vram_ptr_a r3
5354#define texture_page_x r12
5355#define texture_page_y r4
5356#define current_texture_page r5
5357#define tile_y r6
5358#define tile_x r7
5359#define sub_y r8
5360#define c_4096 r10
5361#define vram_ptr_b r11
5362
5363
5364#undef texels_a
5365#undef texels_b
5366
5367#define texels_a q0
5368#define texels_b q1
5369#define texels_c q2
5370#define texels_d q3
5371
5372
// update_texture_8bpp_cache_slice: copies one 128-byte-wide VRAM page into
// the 8bpp texture cache in tile order.
//
// In: r0 = psx_gpu, r1 = texture_page (the VRAM page to copy).
//
// Source addressing: the page starts at
//   vram_ptr + (texture_page >> 4) * (256 * 2048) + (texture_page & 0xF) * 128.
// Output: 16 tile rows x 8 tile columns; a tile is 16 bytes wide by 16 rows
// and is written as 256 contiguous bytes. After each tile row (2048 bytes
// written) the destination skips a further 2048 bytes. If texture_page
// differs from the current texture page in bit 0, the destination starts
// 2048 bytes in, so this slice fills the gaps skipped by the other one.
//
// r4 - r11 and q0 - q3 are preserved.
5373function(update_texture_8bpp_cache_slice)
5374 stmdb sp!, { r4 - r11, r14 }
5375 vpush { q0 - q3 }
5376
5377 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5378 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5379
5380 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5381 mov tile_y, #16
5382
5383 and texture_page_x, texture_page, #0xF
5384 mov texture_page_y, texture_page, lsr #4
5385
5386 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
5387 mov tile_x, #8
5388
5389 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
5390 eor current_texture_page, current_texture_page, texture_page
5391
// The flags set by this ands are used by the addne two instructions below;
// the mov in between does not change them.
5392 ands current_texture_page, current_texture_page, #0x1
5393 mov sub_y, #4
5394
5395 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
5396 movw c_4096, #4096
5397
// vram_ptr_b walks the rows in between those of vram_ptr_a; both step by two
// rows (4096 bytes) per load.
5398 add vram_ptr_b, vram_ptr_a, #2048
5399
// One iteration copies four consecutive rows of 16 bytes; 4 iterations make
// one tile.
5400 0:
5401 vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
5402 vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
5403 vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
5404 vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
5405
5406 vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
5407 vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
5408
5409 subs sub_y, sub_y, #1
5410 bne 0b
5411
// Next tile in this tile row: 16 source bytes to the right, back up 16 rows.
5412 mov sub_y, #4
5413
5414 add vram_ptr_a, vram_ptr_a, #16
5415 add vram_ptr_b, vram_ptr_b, #16
5416
5417 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5418 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5419
5420 subs tile_x, tile_x, #1
5421 bne 0b
5422
// Next tile row: down 16 rows and back to the left edge of the page.
5423 mov tile_x, #8
5424
5425 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5426 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5427
5428 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5429 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5430
5431 subs tile_y, tile_y, #1
// Skip the 2048 bytes belonging to the other slice; add leaves the flags
// from the subs above intact for the bne.
5432 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
5433
5434 bne 0b
5435
5436 vpop { q0 - q3 }
5437 ldmia sp!, { r4 - r11, pc }
5438