libretro/ios: workaround clang segfault
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
/*
 * Renderer limits and flag bits. In the portion of the file reviewed here
 * only MAX_SPANS is referenced (the span-count check in
 * setup_spans_up_down); the remaining values are presumably consumed further
 * down or mirror C-side definitions - TODO confirm they stay in sync.
 */
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
f0931e56 20#define RENDER_STATE_MASK_EVALUATE 0x20
21#define RENDER_FLAGS_MODULATE_TEXELS 0x1
22#define RENDER_FLAGS_BLEND 0x2
d5c08ed3 23#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 24
cb88320b 25#include "psx_gpu_offsets.h"
75e28f62 26
cb88320b 27#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 28
75e28f62
E
/*
 * Byte offsets of the four 16-bit fields of one span edge_data record
 * (8 bytes per record). The setup_spans_set_x4_* macros emit records in this
 * field order with a single vst4.u16: left x, (width + 7) >> 3, the shifted
 * 0xFFFE mask, and y.
 */
29#define edge_data_left_x_offset 0
30#define edge_data_num_blocks_offset 2
31#define edge_data_right_mask_offset 4
32#define edge_data_y_offset 6
33
34
/*
 * Register aliases for compute_all_gradients.
 *
 * Several names deliberately map to the same physical register (for example
 * x1 and x0_x1 are both r5, y2 and b0 are both r9, q1 is v1, ga_uvrg_x,
 * g_uvrg_x, gw_uv_x and uvrg_dx1). Which name is valid depends on the point
 * reached in the function, so instructions cannot be reordered without
 * checking the aliases below.
 */
35#define psx_gpu r0
36#define v_a r1
37#define v_b r2
38#define v_c r3
39
40#define x0 r4
41#define x1 r5
42#define x2 r6
43#define x0_x1 r5
44#define x1_x2 r6
45#define y0 r7
46#define y1 r8
47#define y2 r9
48#define y0_y1 r7
49#define y1_y2 r8
50#define b0 r9
51#define b1 r10
52#define b2 r11
53#define b0_b1 r10
54#define b1_b2 r11
55
56
57#define area_r_s r5
58
59#define g_bx0 r2
60#define g_bx r3
61#define g_bx2 r4
62#define g_bx3 r5
63#define b_base r6
64#define g_by r8
65
66#define gs_bx r7
67#define gs_by r10
68
69#define ga_bx g_bx
70#define ga_by g_by
71
72#define gw_bx_h g_bx
73#define gw_by_h g_by
74
75#define gw_bx_l r11
76#define gw_by_l gw_bx_l
77
78#define store_a r0
79#define store_b r1
80#define store_inc r5
81
82
/* NEON aliases: vN is a whole vertex, the d-register names are its halves. */
83#define v0 q0
84#define uvrgb0 d0
85#define x0_y0 d1
86
87#define v1 q1
88#define uvrgb1 d2
89#define x1_y1 d3
90
91#define v2 q2
92#define uvrgb2 d4
93#define x2_y2 d5
94
95#define x0_ab q3
96#define uvrg_xxxx0 q3
97#define uvrg0 d6
98#define xxxx0 d7
99
100#define x1_ab q4
101#define uvrg_xxxx1 q4
102#define uvrg1 d8
103#define xxxx1 d9
104
105#define x2_ab q5
106#define uvrg_xxxx2 q5
107#define uvrg2 d10
108#define xxxx2 d11
109
110#define y0_ab q6
111#define yyyy_uvrg0 q6
112#define yyyy0 d12
113#define uvrg0b d13
114
115#define y1_ab q7
116#define yyyy_uvrg1 q7
117#define yyyy1 d14
118#define uvrg1b d15
119
120#define y2_ab q8
121#define yyyy_uvrg2 q8
122#define yyyy2 d16
123#define uvrg2b d17
124
125#define d0_ab q9
126#define d0_a d18
127#define d0_b d19
128
129#define d1_ab q10
130#define d1_a d20
131#define d1_b d21
132
133#define d2_ab q11
134#define d2_a d22
135#define d2_b d23
136
137#define d3_ab q12
138#define d3_a d24
139#define d3_b d25
140
141#define ga_uvrg_x q1
142#define ga_uvrg_y q4
143
144#define dx x0_x1
145#define dy y0_y1
146#define db b0_b1
147
148#define uvrg_base q11
149
150#define gs_uvrg_x q5
151#define gs_uvrg_y q6
152
153#define g_uvrg_x q1
154#define ga_uv_x d2
155#define g_uv_x d2
156#define ga_rg_x d3
157#define g_rg_x d3
158
159#define g_uvrg_y q4
160#define ga_uv_y d8
161#define g_uv_y d8
162#define ga_rg_y d9
163#define g_rg_y d9
164
165#define gw_uv_x q1
166#define gw_rg_x q2
167#define gw_uv_y q4
168#define gw_rg_y q3
169
170#define w_mask q9
171#define w_mask_l d18
172
173#define r_shift q10
174
175#define uvrg_dx0 q0
176#define uvrg_dx0l d0
177#define uvrg_dx0h d1
178
179#define uvrg_dx1 q1
180#define uvrg_dx1l d2
181#define uvrg_dx1h d3
182
183#define uvrg_dx2 q2
184#define uvrg_dx2l d4
185#define uvrg_dx2h d5
186
187#define uvrg_dx3 q3
188#define uvrg_dx3l d6
189#define uvrg_dx3h d7
190
c6063f89 191#define uvrgb_phase q13
75e28f62
E
192
193.align 4
194
/*
 * load_pointer(register, pointer): place the address of symbol `pointer` in
 * `register`.
 *   - non-PIC builds (__PIC__ undefined): a movw/movt pair carrying the low
 *     and high 16 bits of the address.
 *   - PIC builds: `ldr register, =pointer`, a literal-pool load. The `.pool`
 *     directives placed after the span setup functions give the assembler a
 *     place to emit those literals.
 */
5d834c08 195/* FIXME: users of this should be in psx_gpu instead */
196#ifndef __PIC__
197#define load_pointer(register, pointer) \
198 movw register, :lower16:pointer; \
199 movt register, :upper16:pointer; \
200
201#else
202#define load_pointer(register, pointer) \
203 ldr register, =pointer \
204
205#endif
206
75e28f62
E
/*
 * function(name): export `name` as a global symbol and define its label at
 * the current location. No .type, .size or alignment directive is emitted.
 */
207#define function(name) \
208 .global name; \
209 name: \
210
// compute_all_gradients(psx_gpu, v_a, v_b, v_c)
//
// Computes the x and y gradients of the interpolated vertex attributes for
// one triangle and stores them, together with the interpolation base values,
// in psx_gpu. u, v, r and g are processed four-wide in NEON; b is processed
// alongside in ARM integer registers.
//
// Vertex fields as read here: bytes 0-3 = u, v, r, g; byte 4 = b; signed
// halfword at +8 = x; halfword at +10 = y. Each vertex is loaded with a
// 128-bit alignment hint, so the vertices must be 16-byte aligned.
//
// Per attribute a (see the formula comment further down):
//   g_x = ((a1 - a0) * (y2 - y1)) - ((a2 - a1) * (y1 - y0))
//   g_y = ((x1 - x0) * (a2 - a1)) - ((x2 - x1) * (a1 - a0))
// Each value is made absolute, scaled by the reciprocal of
// psx_gpu->triangle_area (obtained with a double precision divide that
// overlaps the integer/NEON work), re-signed with (product sign XOR winding
// mask) and finally shifted left by 4.
//
// Stores, relative to psx_gpu + psx_gpu_uvrg_offset:
//   +0    uvrg_base = (uvrg0 << 16) + uvrgb_phase - g_uvrg_x * x0
//   +16   g_uvrg_x
//   +32   g_uvrg_y
//   +48   { 0, 1, 2, 3 } * g_x for u, then for v   (vst4 interleave)
//   +80   { 0, 1, 2, 3 } * g_x for r, then for g
//   +112  { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by }
//
// Register use: r4-r11 and lr are saved on entry. q4-q7 (d8-d15) are written
// below and are callee-saved under the AAPCS, so they are pushed and popped
// as well. r0-r3, r12 and the other NEON registers used are clobbered.
211@ r0: psx_gpu
212@ r1: v_a
213@ r2: v_b
214@ r3: v_c
215
216function(compute_all_gradients)
217 // First compute the triangle area reciprocal and shift. The division will
218 // happen concurrently with much of the work which follows.
219 @ r12 = psx_gpu->triangle_area
220 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
221 stmdb sp!, { r4 - r11, lr }
  // d8-d15 (q4-q7) are callee-saved; they are used as scratch below.
  vpush { d8 - d15 }
222
223 @ load exponent of 62 into upper half of double
224 movw r4, #0
225 clz r14, r12 @ r14 = shift
226
227 movt r4, #((62 + 1023) << 4)
228 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
229
230 @ load area normalized into lower half of double
231 mov r5, r12, lsr #10
232 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
233
234 movt r4, #((1022 + 31) << 4)
235 mov r5, r12, lsl #20
236
237 add r4, r4, r12, lsr #11
238 vmov.f64 d31, r5, r4
239
240 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
241
242 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
243 // ( d0 * d1 ) - ( d2 * d3 ) =
244 // ( m0 ) - ( m1 ) = gradient
245
246 // This is split to do 12 elements at a time over three sets: a, b, and c.
247 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
248 // two of the slots are unused.
249
250 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
251 // is g.
252
253 // First type is: uvrg bxxx xxxx
254 // Second type is: yyyy ybyy uvrg
255 // Since x_a and y_c are the same the same variable is used for both.
256
257 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
258 ldrsh x0, [ v_a, #8 ] @ load x0
259
260 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
261 ldrh x1, [ v_b, #8 ] @ load x1
262
263 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
264 ldrh x2, [ v_c, #8 ] @ load x2
265
266 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
267 ldrh y0, [ v_a, #10 ] @ load y0
268
269 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
270 ldrh y1, [ v_b, #10 ] @ load y1
271
272 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
273 ldrh y2, [ v_c, #10 ] @ load y2
274
275 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
276 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
277
278 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
279 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
280
281 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
282 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
283
284 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
285 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
286
287 ldrb b2, [ v_c, #4 ] @ load b2
288 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
289
290 ldrb b1, [ v_b, #4 ] @ load b1
291 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
292
293 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
294 vsub.s16 d0_ab, x1_ab, x0_ab
295
296 ldrb b0, [ v_a, #4 ] @ load b0
297 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
298
299 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
300 vsub.s16 d2_ab, x2_ab, x1_ab
301
302 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
303 vsub.s16 d1_ab, y2_ab, y1_ab
304
305 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
306 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
307
308 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
309 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
310
311 vsub.s16 d3_ab, y1_ab, y0_ab
312 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
313 @ ((x2 - x1) * (b1 - b0))
314 vmull.s16 ga_uvrg_x, d0_a, d1_a
315 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
316 @ ((b2 - b1) * (y1 - y0))
317 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
318 movs gs_bx, ga_bx, asr #31
319
320 vmull.s16 ga_uvrg_y, d0_b, d1_b
321 rsbmi ga_bx, ga_bx, #0
322
c6063f89 323 @ r12 = psx_gpu->uvrgb_phase
324 ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
325
75e28f62
E
326 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
327 movs gs_by, ga_by, asr #31
328
329 vshr.u64 d0, d30, #22
c6063f89 330 add b_base, r12, b0, lsl #16
331
332 vdup.u32 uvrgb_phase, r12
75e28f62
E
333
334 rsbmi ga_by, ga_by, #0
335 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
336
337 @ r12 = psx_gpu->triangle_winding_offset
338 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
339 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
340
75e28f62
E
341 rsb r12, r12, #0 @ r12 = -(triangle->winding)
342
343 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
344 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
345
346 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
347 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
348
c6063f89 349 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
350 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
351
352 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
353 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
354
355 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
356 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
357 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
358 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
359
360 vshl.u64 gw_rg_x, gw_rg_x, r_shift
361 vshl.u64 gw_uv_x, gw_uv_x, r_shift
362 vshl.u64 gw_rg_y, gw_rg_y, r_shift
363 vshl.u64 gw_uv_y, gw_uv_y, r_shift
364
365 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
366 vmovn.u64 g_uv_x, gw_uv_x
367
368 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
369 vmovn.u64 g_rg_x, gw_rg_x
370
371 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
372 vmovn.u64 g_uv_y, gw_uv_y
373
374 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
375 vmovn.u64 g_rg_y, gw_rg_y
376
377 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
378 mov ga_bx, ga_bx, lsl #13
379
380 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
381 mov ga_by, ga_by, lsl #13
382
383 vdup.u32 x0_y0, x0
384 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
385
386 vshl.u32 g_uvrg_x, g_uvrg_x, #4
387 vshl.u32 g_uvrg_y, g_uvrg_y, #4
388
389 umull gw_by_l, gw_by_h, ga_by, area_r_s
390 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
391
392 eor gs_bx, gs_bx, r12
393 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
394
395 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
396 eor gs_by, gs_by, r12
397
398 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
399 add store_a, psx_gpu, #psx_gpu_uvrg_offset
400
401 sub r11, r11, #(32 - 13)
402
403 add store_b, store_a, #16
404 mov store_inc, #32
405
406 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
407 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
408
409 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
410 mov g_bx, gw_bx_h, lsr r11
411
412 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
413 mov g_by, gw_by_h, lsr r11
414
415 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
416 [ store_b, : 128 ], store_inc
417 eor g_bx, g_bx, gs_bx
418
419 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
420 [ store_b, : 128 ], store_inc
421 sub g_bx, g_bx, gs_bx
422
423 lsl g_bx, g_bx, #4
424 eor g_by, g_by, gs_by
425
426 mls b_base, g_bx, x0, b_base
427 sub g_by, g_by, gs_by
428
429 lsl g_by, g_by, #4
430 mov g_bx0, #0
431
432 add g_bx2, g_bx, g_bx
433 add g_bx3, g_bx, g_bx2
434
435 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
436
  // All NEON results have been stored; restore the callee-saved registers.
  vpop { d8 - d15 }
437 ldmia sp!, { r4 - r11, pc }
438
439
/*
 * Register aliases for the setup_spans_* family.
 *
 * As in compute_all_gradients, many names share a physical register and are
 * only valid during part of the code:
 *   - y_a / y_b / y_c are r1 / r2 / r3, i.e. the vertex pointers v_a / v_b /
 *     v_c; loading a y value destroys the matching pointer.
 *   - temp and clip are both r14; height and height_major are both r9.
 *   - edge_shift_alt is r10, the same register as reciprocal_table_ptr.
 *   - edge_alt_low / edge_alt_high / edge_dx_dy_alt reuse r4 / r5 / r6
 *     (x_a / x_b / x_c) and later span_edge_data / span_uvrg_offset /
 *     span_b_offset reuse them again.
 *   - NEON: v_clip is q3 (left_x), span_shifts and alternate_select are both
 *     d24, left_right_x_16 and left_x_32 are both q11, and c_0x0001 (q13)
 *     overlaps c_0xFFFE (d26) and c_0x0007 (d27).
 * The #defines of psx_gpu, v_a, v_b and v_c repeat the earlier definitions
 * with identical replacement text.
 */
440#define psx_gpu r0
441#define v_a r1
442#define v_b r2
443#define v_c r3
444
445#define temp r14
446
447#define x_a r4
448#define x_b r5
449#define x_c r6
450#define y_a r1
451#define y_b r2
452#define y_c r3
453
454#define height_minor_a r7
455#define height_minor_b r8
456#define height_major r9
457#define height r9
458
459#define reciprocal_table_ptr r10
460
461#define edge_alt_low r4
462#define edge_alt_high r5
463#define edge_dx_dy_alt r6
464#define edge_shift_alt r10
465
466#define edge_dx_dy_alt_low r4
467#define edge_dx_dy_alt_high r5
468
469#define span_edge_data r4
470#define span_uvrg_offset r5
471#define span_b_offset r6
472
473#define clip r14
474
475#define b r11
476#define b_dy r12
477
478
479#define alternate_x q0
480#define alternate_dx_dy q1
481#define alternate_x_32 q2
482
483#define alternate_x_low d0
484#define alternate_x_high d1
485#define alternate_dx_dy_low d2
486#define alternate_dx_dy_high d3
487#define alternate_x_32_low d4
488#define alternate_x_32_high d5
489
490#define left_x q3
491#define right_x q4
492#define left_dx_dy q5
493#define right_dx_dy q6
494#define left_edge q7
495#define right_edge q8
496
497#define left_x_low d6
498#define left_x_high d7
499#define right_x_low d8
500#define right_x_high d9
501#define left_dx_dy_low d10
502#define left_dx_dy_high d11
503#define right_dx_dy_low d12
504#define right_dx_dy_high d13
505#define left_edge_low d14
506#define left_edge_high d15
507#define right_edge_low d16
508#define right_edge_high d17
509
510#define y_mid_point d18
511#define c_0x0004 d19
512
513#define left_right_x_16 q11
514#define span_shifts_y q12
515#define c_0x0001 q13
516
517#define span_shifts d24
518#define y_x4 d25
519#define c_0xFFFE d26
520#define c_0x0007 d27
521
522#define left_right_x_16_low d22
523#define left_right_x_16_high d23
524
525#define uvrg q14
526#define uvrg_dy q15
527
528#define alternate_x_16 d4
529
530#define v_clip q3
531#define v_clip_low d6
532
533#define right_x_32 q10
534#define left_x_32 q11
535#define alternate_select d24
536
537#define right_x_32_low d20
538#define right_x_32_high d21
539#define left_x_32_low d22
540#define left_x_32_high d23
541
542#define edges_xy q0
543#define edges_dx_dy d2
544#define edge_shifts d3
545#define edge_shifts_64 q2
546
547#define edges_xy_left d0
548#define edges_xy_right d1
549
550#define height_reciprocals d6
551#define heights d7
552
553#define widths d8
554#define c_0x01 d9
555#define x_starts d10
556#define x_ends d11
557
558#define heights_b d12
559#define edges_dx_dy_64 q10
560
561#define edges_dx_dy_64_left d20
562#define edges_dx_dy_64_right d21
563
564
/*
 * setup_spans_prologue(): common entry code of every setup_spans_* function.
 *   - pushes r4-r11 and lr (matched by setup_spans_epilogue);
 *   - loads the signed 16-bit x (offset 8) and y (offset 10) of the three
 *     vertices. All x values are read first because y_a / y_b / y_c occupy
 *     the registers that hold the vertex pointers;
 *   - loads uvrg and uvrg_dy from psx_gpu;
 *   - points reciprocal_table_ptr at reciprocal_table;
 *   - sets every 32-bit lane of c_0x01 to 1.
 * NOTE(review): the span setup code writes q4-q7 (right_x, left_dx_dy,
 * right_dx_dy, left_edge), which the AAPCS lists as callee-saved, but only
 * core registers are saved here - confirm that callers tolerate this.
 */
565#define setup_spans_prologue() \
566 stmdb sp!, { r4 - r11, lr }; \
567 \
568 ldrsh x_a, [ v_a, #8 ]; \
569 ldrsh x_b, [ v_b, #8 ]; \
570 ldrsh x_c, [ v_c, #8 ]; \
571 ldrsh y_a, [ v_a, #10 ]; \
572 ldrsh y_b, [ v_b, #10 ]; \
573 ldrsh y_c, [ v_c, #10 ]; \
574 \
575 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
576 vld1.32 { uvrg }, [ temp ]; \
577 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
578 vld1.32 { uvrg_dy }, [ temp ]; \
5d834c08 579 load_pointer(reciprocal_table_ptr, reciprocal_table); \
75e28f62
E
580 \
581 vmov.u32 c_0x01, #0x01 \
582
/*
 * setup_spans_load_b(): load the scalar b interpolant and its per-line step
 * from psx_gpu into b (r11) and b_dy (r12). Those registers double as
 * scratch in compute_edge_delta_x3 and setup_spans_alternate_adjust_yes, so
 * this is expanded only after those macros have run.
 */
583#define setup_spans_load_b() \
584 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
585 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
586
/*
 * setup_spans_prologue_b(): prepare the state used by the span emit loop.
 *   - span_uvrg_offset, span_edge_data and span_b_offset point at the
 *     corresponding output arrays inside psx_gpu (these reuse r5, r4, r6);
 *   - left_edge  = viewport_start_x replicated to all eight 16-bit lanes;
 *   - right_edge = viewport_end_x + 1 replicated likewise;
 *   - c_0x0004 = 4, c_0x0007 = 7 and c_0xFFFE = 0xFFFE per 16-bit lane.
 * c_0x0001 (q13) is only a temporary for the right_edge increment: c_0x0007
 * and c_0xFFFE are written into its two halves afterwards, so the order of
 * the last three instructions matters.
 */
587#define setup_spans_prologue_b() \
588 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
589 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
590 \
591 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
592 vmov.u16 c_0x0004, #0x0004; \
593 \
594 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
595 vmov.u16 c_0x0001, #0x0001; \
596 \
597 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
598 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
599 \
600 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
601 vadd.u16 right_edge, right_edge, c_0x0001; \
602 \
603 vmov.u16 c_0x0007, #0x0007; \
604 vmvn.u16 c_0xFFFE, #0x0001 \
605
606
/*
 * compute_edge_delta_x2(): set up two edges that share one height.
 * Inputs: x_starts and x_ends (two 32-bit lanes each), height, c_0x01 and
 * reciprocal_table_ptr. With entry = reciprocal_table[height] and
 * recip = entry >> 10, it produces per lane i:
 *   edges_dx_dy[i] = (x_ends[i] - x_starts[i]) * recip
 *   edges_xy[i]    = (x_starts[i] * height + height - 1) * recip  (64-bit)
 *   edge_shifts[i] = entry with bits 5-7 of each halfword cleared
 * The shift is applied later by setup_spans_adjust_edges_alternate_*. The
 * vector shift there looks only at the low byte of each lane, so after the
 * vbic that byte holds the 5-bit shift count kept in the table entry.
 */
607#define compute_edge_delta_x2() \
608 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
609 \
610 vdup.u32 heights, height; \
611 vsub.u32 widths, x_ends, x_starts; \
612 \
613 vdup.u32 edge_shifts, temp; \
614 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 615 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
616 \
617 vmla.s32 heights_b, x_starts, heights; \
618 vbic.u16 edge_shifts, #0xE0; \
619 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
620 vmull.s32 edges_xy, heights_b, height_reciprocals \
621
/*
 * compute_edge_delta_x3(start_c, height_a, height_b): set up three edges.
 *
 * Lanes 0 and 1 are computed in NEON exactly as in compute_edge_delta_x2,
 * but with height_a for lane 0 and height_b for lane 1. The third
 * ("alternate") edge runs from start_c to x_c over height_minor_b lines and
 * is computed in core registers:
 *   edge_dx_dy_alt         = (x_c - start_c) * recip_alt
 *   edge_alt_high:low      = (start_c * height_minor_b + height_minor_b - 1)
 *                            * recip_alt                       (64-bit)
 *   edge_shift_alt         = table entry & 0x1F
 * where recip_alt = reciprocal_table[height_minor_b] >> 10.
 *
 * Register reuse to keep in mind:
 *   - edge_shift_alt is r10, so reciprocal_table_ptr is gone after the third
 *     table load;
 *   - width_alt / edge_dx_dy_alt overwrite x_c (r6), and the final smull
 *     overwrites x_a / x_b (r4 / r5); start_c is consumed before that;
 *   - height_reciprocal_alt and height_b_alt are r11 and r12, the registers
 *     that hold b and b_dy once setup_spans_load_b has run.
 */
622#define width_alt r6
623#define height_reciprocal_alt r11
624#define height_b_alt r12
625
626#define compute_edge_delta_x3(start_c, height_a, height_b) \
627 vmov.u32 heights, height_a, height_b; \
628 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
629 vmov.u32 edge_shifts[0], temp; \
630 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
631 vmov.u32 edge_shifts[1], temp; \
632 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
633 \
634 vsub.u32 widths, x_ends, x_starts; \
635 sub width_alt, x_c, start_c; \
636 \
637 vsub.u32 heights_b, heights, c_0x01; \
638 sub height_b_alt, height_minor_b, #1; \
639 \
7d5140f5
E
640 vshr.u32 height_reciprocals, edge_shifts, #10; \
641 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
642 \
643 vmla.s32 heights_b, x_starts, heights; \
644 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
645 \
646 vbic.u16 edge_shifts, #0xE0; \
647 and edge_shift_alt, edge_shift_alt, #0x1F; \
648 \
649 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
650 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
651 \
652 vmull.s32 edges_xy, heights_b, height_reciprocals; \
653 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
654
655
/*
 * Per-group stepping helpers, selected by token pasting with the direction
 * (up / down).
 *
 * setup_spans_adjust_y_*: y_x4 holds four 16-bit line numbers. c_0x0004 is 4
 * in every 16-bit lane, so the 32-bit add/sub moves all four lines by 4 as
 * long as no halfword carries or borrows into its neighbour.
 *
 * setup_spans_adjust_interpolants_*: step the uvrg vector and the scalar b
 * by one line (uvrg_dy / b_dy), backwards for "up" and forwards for "down".
 */
656#define setup_spans_adjust_y_up() \
657 vsub.u32 y_x4, y_x4, c_0x0004 \
658
659#define setup_spans_adjust_y_down() \
660 vadd.u32 y_x4, y_x4, c_0x0004 \
661
662#define setup_spans_adjust_interpolants_up() \
663 vsub.u32 uvrg, uvrg, uvrg_dy; \
664 sub b, b, b_dy \
665
666#define setup_spans_adjust_interpolants_down() \
667 vadd.u32 uvrg, uvrg, uvrg_dy; \
668 add b, b, b_dy \
669
670
/*
 * Vertical clipping helpers. `clip` is the number of lines skipped.
 *
 * setup_spans_clip_interpolants_increment / _decrement: move b and uvrg by
 * clip lines forwards / backwards (b_dy * clip, uvrg_dy * clip).
 *
 * setup_spans_clip_alternate_yes: advance the 64-bit alternate edge position
 * by edge_dx_dy_alt * clip; the _no variant expands to nothing.
 *
 * setup_spans_clip(direction, alternate_active): broadcast clip into v_clip,
 * apply the two helpers chosen by the arguments, then advance both NEON edge
 * positions with edges_xy += edges_dx_dy * clip. v_clip is q3, so anything
 * still held under the names left_x, height_reciprocals or heights is
 * overwritten.
 */
671#define setup_spans_clip_interpolants_increment() \
672 mla b, b_dy, clip, b; \
673 vmla.s32 uvrg, uvrg_dy, v_clip \
674
675#define setup_spans_clip_interpolants_decrement() \
676 mls b, b_dy, clip, b; \
677 vmls.s32 uvrg, uvrg_dy, v_clip \
678
679#define setup_spans_clip_alternate_yes() \
680 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
681
682#define setup_spans_clip_alternate_no() \
683
684#define setup_spans_clip(direction, alternate_active) \
685 vdup.u32 v_clip, clip; \
686 setup_spans_clip_alternate_##alternate_active(); \
687 setup_spans_clip_interpolants_##direction(); \
688 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
689
690
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index): turn the
 * two NEON edges into the stepping form used by the span loop.
 *
 * The arguments are the tokens `left` or `right`; they are pasted onto
 * edges_xy_ and edges_dx_dy_64_ and therefore simply select lane 0 (_left)
 * or lane 1 (_right) as the source for the left and right side.
 *
 * Steps: sign-extend edge_shifts and edges_dx_dy to 64 bits, shift both
 * edges_xy and the deltas left by the per-lane shift, then build
 *   left_x  = { x, x + dx },   left_dx_dy  = { 2 * dx, 2 * dx }
 *   right_x = { x, x + dx },   right_dx_dy = { 2 * dx, 2 * dx }
 * so that each q register carries two consecutive lines and one add
 * advances by two lines.
 */
691#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
692 vmovl.s32 edge_shifts_64, edge_shifts; \
693 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
694 \
695 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
696 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
697 \
698 vmov left_x_low, edges_xy_##left_index; \
699 vmov right_x_low, edges_xy_##right_index; \
700 \
701 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
702 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
703 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
704 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
705 \
706 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
707 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
708 \
709 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
710 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
711
712
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index): same as
 * the _no variant, plus conversion of the scalar alternate edge into NEON
 * stepping form.
 *   - y_mid_point = y_b in every 16-bit lane (the line at which the
 *     alternate edge takes over, see setup_spans_y_select_*);
 *   - the 64-bit edge_alt_high:edge_alt_low value is shifted left by
 *     edge_shift_alt using temp = 32 - edge_shift_alt to carry bits from the
 *     low into the high word;
 *   - edge_dx_dy_alt is widened to 64 bits and shifted the same way (the asr
 *     supplies the sign-extended high word);
 *   - alternate_x = { x, x + dx }, alternate_dx_dy = { 2 * dx, 2 * dx }.
 * r4 / r5 are reused for the widened delta only after the edge position has
 * been moved into alternate_x_low.
 */
713#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
714 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
715 \
716 vdup.u16 y_mid_point, y_b; \
717 rsb temp, edge_shift_alt, #32; \
718 \
719 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
720 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
721 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
722 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
723 \
724 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
725 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
726 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
727 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
728 \
729 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
730 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
731
732
/*
 * Alternate edge selection for one group of four lines.
 *
 * setup_spans_y_select_up / _down: build a per-line mask in alternate_select
 * that is set where the line lies past y_mid_point in the direction of
 * travel (signed 16-bit compare: y < mid when going up, y > mid when going
 * down).
 *
 * setup_spans_alternate_select_left / _right: where the mask is set, replace
 * the left (low half) or right (high half) x of left_right_x_16 with the
 * alternate edge x.
 */
733#define setup_spans_y_select_up() \
734 vclt.s16 alternate_select, y_x4, y_mid_point \
735
736#define setup_spans_y_select_down() \
737 vcgt.s16 alternate_select, y_x4, y_mid_point \
738
739
740#define setup_spans_alternate_select_left() \
741 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
742
743#define setup_spans_alternate_select_right() \
744 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
745
746
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction): emit four spans
 * for a triangle that has an alternate (third) edge.
 *
 * For each of the three edges the upper 32 bits of the 64-bit positions are
 * taken for two lines, the edge is advanced, and the next two lines are
 * taken, giving four x values per edge; these are narrowed to 16 bits. The
 * alternate x replaces the left or right x (per `alternate`) on lines
 * selected by setup_spans_y_select_<direction>.
 *
 * Interleaved with that, four uvrg vectors (16 bytes each) and four b words
 * are stored, stepping the interpolants by one line after each store.
 *
 * Edge record computation, per line:
 *   left, right clamped to [left_edge, right_edge]
 *   width      = right - left
 *   num_blocks = (width + 7) >> 3
 *   mask       = 0xFFFE << ((width + 7) & 7)
 * and one vst4.u16 writes { left, num_blocks, mask, y } for all four lines.
 * Finally y_x4 is moved by four lines.
 *
 * span_shifts shares d24 with alternate_select; the mask is consumed by the
 * vbit before span_shifts is written.
 */
747#define setup_spans_set_x4_alternate_yes(alternate, direction) \
748 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
749 vshrn.s64 left_x_32_low, left_x, #32; \
750 vshrn.s64 right_x_32_low, right_x, #32; \
751 \
752 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
753 vadd.u64 left_x, left_x, left_dx_dy; \
754 vadd.u64 right_x, right_x, right_dx_dy; \
755 \
756 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
757 vshrn.s64 left_x_32_high, left_x, #32; \
758 vshrn.s64 right_x_32_high, right_x, #32; \
759 \
760 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
761 vadd.u64 left_x, left_x, left_dx_dy; \
762 vadd.u64 right_x, right_x, right_dx_dy; \
763 \
764 vmovn.u32 alternate_x_16, alternate_x_32; \
765 setup_spans_y_select_##direction(); \
766 vmovn.u32 left_right_x_16_low, left_x_32; \
767 \
768 vmovn.u32 left_right_x_16_high, right_x_32; \
769 setup_spans_alternate_select_##alternate(); \
770 \
771 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
772 str b, [ span_b_offset ], #4; \
773 setup_spans_adjust_interpolants_##direction(); \
774 \
775 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
776 \
777 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
778 str b, [ span_b_offset ], #4; \
779 setup_spans_adjust_interpolants_##direction(); \
780 \
781 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
782 \
783 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
784 str b, [ span_b_offset ], #4; \
785 setup_spans_adjust_interpolants_##direction(); \
786 \
787 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
788 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
789 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
790 \
791 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
792 str b, [ span_b_offset ], #4; \
793 setup_spans_adjust_interpolants_##direction(); \
794 \
795 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
796 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
797 \
798 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
799 \
800 setup_spans_adjust_y_##direction() \
801
802
/*
 * setup_spans_set_x4_alternate_no(alternate, direction): emit four spans for
 * a triangle with only a left and a right edge. The `alternate` argument is
 * accepted for symmetry with the _yes variant and is not used.
 *
 * Per invocation: four left and four right x values are taken from the
 * upper 32 bits of the 64-bit edge positions (advancing the edges by four
 * lines in total) and narrowed to 16 bits; four uvrg vectors and four b
 * words are stored while the interpolants are stepped; x is clamped to
 * [left_edge, right_edge]; and one vst4.u16 writes, for each line,
 *   { left, (width + 7) >> 3, 0xFFFE << ((width + 7) & 7), y }
 * with width = right - left. Finally y_x4 is moved by four lines.
 */
803#define setup_spans_set_x4_alternate_no(alternate, direction) \
804 vshrn.s64 left_x_32_low, left_x, #32; \
805 vshrn.s64 right_x_32_low, right_x, #32; \
806 \
807 vadd.u64 left_x, left_x, left_dx_dy; \
808 vadd.u64 right_x, right_x, right_dx_dy; \
809 \
810 vshrn.s64 left_x_32_high, left_x, #32; \
811 vshrn.s64 right_x_32_high, right_x, #32; \
812 \
813 vadd.u64 left_x, left_x, left_dx_dy; \
814 vadd.u64 right_x, right_x, right_dx_dy; \
815 \
816 vmovn.u32 left_right_x_16_low, left_x_32; \
817 vmovn.u32 left_right_x_16_high, right_x_32; \
818 \
819 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
820 str b, [ span_b_offset ], #4; \
821 setup_spans_adjust_interpolants_##direction(); \
822 \
823 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
824 \
825 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
826 str b, [ span_b_offset ], #4; \
827 setup_spans_adjust_interpolants_##direction(); \
828 \
829 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
830 \
831 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
832 str b, [ span_b_offset ], #4; \
833 setup_spans_adjust_interpolants_##direction(); \
834 \
835 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
836 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
837 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
838 \
839 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
840 str b, [ span_b_offset ], #4; \
841 setup_spans_adjust_interpolants_##direction(); \
842 \
843 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
844 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
845 \
846 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
847 \
848 setup_spans_adjust_y_##direction() \
849
850
/*
 * setup_spans_alternate_adjust_yes(): move the 64-bit alternate edge
 * position back by height_minor_a lines, i.e.
 *   edge_alt -= edge_dx_dy_alt * height_minor_a   (64-bit, subs/sbc pair)
 * The scratch pair edge_adjust_low / edge_adjust_high is r11 / r12, so this
 * has to run before setup_spans_load_b places b and b_dy there.
 * The _no variant expands to nothing.
 */
851#define edge_adjust_low r11
852#define edge_adjust_high r12
853
854#define setup_spans_alternate_adjust_yes() \
855 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
856 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
857 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
858
859#define setup_spans_alternate_adjust_no() \
860
861
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active):
 * generate spans from y_a downwards (increasing y).
 *
 *   1. Optional alternate edge adjustment, then load b / b_dy.
 *   2. Bottom clip: if y_c is below viewport_end_y, height is reduced by the
 *      excess and 1 is added back.
 *   3. Top clip: clip = viewport_start_y - y_a; when positive, height and
 *      y_a are adjusted and edges and interpolants are advanced by clip
 *      lines.
 *   4. If height <= 0 nothing is emitted (branch to local label 1).
 *   5. y_x4 = { y_a, y_a + 1, y_a + 2, y_a + 3 } is assembled from two
 *      packed halfword pairs.
 *   6. Edges are converted to stepping form, the output pointers and
 *      constants are set up, and height is stored as num_spans.
 *   7. Loop (local label 2): four spans per iteration until height is used
 *      up. Spans are always written in groups of four, so up to three
 *      entries beyond num_spans may be written.
 *
 * Uses local numeric labels 0, 1 and 2.
 */
862#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
863 setup_spans_alternate_adjust_##alternate_active(); \
864 setup_spans_load_b(); \
865 \
866 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
867 subs y_c, y_c, temp; \
868 subgt height, height, y_c; \
869 addgt height, height, #1; \
870 \
871 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
872 subs clip, temp, y_a; \
873 ble 0f; \
874 \
875 sub height, height, clip; \
876 add y_a, y_a, clip; \
877 setup_spans_clip(increment, alternate_active); \
878 \
879 0: \
880 cmp height, #0; \
881 ble 1f; \
882 \
883 orr temp, y_a, y_a, lsl #16; \
884 add temp, temp, #(1 << 16); \
885 add y_a, temp, #2; \
886 add y_a, y_a, #(2 << 16); \
887 vmov.u32 y_x4, temp, y_a; \
888 \
889 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
890 right_index); \
891 setup_spans_prologue_b(); \
892 \
893 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
894 \
895 2: \
896 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
897 subs height, height, #4; \
898 bhi 2b; \
899 \
900 1: \
901
902
/*
 * Helpers used only by setup_spans_up.
 *
 * setup_spans_alternate_pre_increment_yes(): advance the 64-bit alternate
 * edge position by one line; the high word receives the sign extension of
 * edge_dx_dy_alt plus the carry. The _no variant expands to nothing.
 *
 * setup_spans_up_decrement_yes(): conditional `height -= 1`. It relies on
 * the flags left by the `subs temp, temp, y_c` that precedes it in
 * setup_spans_up, so it takes effect when viewport_start_y - y_c <= 0.
 * The _no variant expands to nothing.
 */
903#define setup_spans_alternate_pre_increment_yes() \
904 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
905 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
906
907#define setup_spans_alternate_pre_increment_no() \
908
909
910#define setup_spans_up_decrement_yes() \
911 suble height, height, #1 \
912
913#define setup_spans_up_decrement_no() \
914
915
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active):
 * generate spans from y_a - 1 upwards (decreasing y).
 *
 *   1. Optional alternate edge adjustment, load b / b_dy, y_a -= 1.
 *   2. Top clip: temp = viewport_start_y - y_c; when positive, height is
 *      reduced by temp. With an active alternate edge, height is reduced by
 *      one when temp <= 0 (setup_spans_up_decrement_yes uses the flags of
 *      the subs).
 *   3. Bottom clip: clip = y_a - viewport_end_y; when positive, height and
 *      y_a are adjusted and edges and interpolants are moved back by clip
 *      lines.
 *   4. If height <= 0 nothing is emitted (branch to local label 1).
 *   5. y_x4 = { y_a, y_a - 1, y_a - 2, y_a - 3 }.
 *   6. Both NEON edges (and the alternate edge, if active) are advanced by
 *      one line, edges are converted to stepping form, the interpolants are
 *      stepped back one line, and height is stored as num_spans.
 *   7. Loop (local label 2): four spans per iteration until height is used
 *      up; as in setup_spans_down, writes happen in groups of four.
 *
 * Uses local numeric labels 0, 1 and 2.
 */
916#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
917 setup_spans_alternate_adjust_##alternate_active(); \
918 setup_spans_load_b(); \
919 sub y_a, y_a, #1; \
920 \
921 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
922 subs temp, temp, y_c; \
923 subgt height, height, temp; \
924 setup_spans_up_decrement_##alternate_active(); \
925 \
926 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
927 subs clip, y_a, temp; \
928 ble 0f; \
929 \
930 sub height, height, clip; \
931 sub y_a, y_a, clip; \
932 setup_spans_clip(decrement, alternate_active); \
933 \
934 0: \
935 cmp height, #0; \
936 ble 1f; \
937 \
938 orr temp, y_a, y_a, lsl #16; \
939 sub temp, temp, #(1 << 16); \
940 sub y_a, temp, #2; \
941 sub y_a, y_a, #(2 << 16); \
942 vmov.u32 y_x4, temp, y_a; \
943 \
944 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
945 \
946 setup_spans_alternate_pre_increment_##alternate_active(); \
947 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
948 right_index); \
949 setup_spans_adjust_interpolants_up(); \
950 setup_spans_prologue_b(); \
951 \
952 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
953 \
954 2: \
955 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
956 subs height, height, #4; \
957 bhi 2b; \
958 \
959 1: \
960
961
/*
 * setup_spans_epilogue(): restore r4-r11 and return by popping the saved lr
 * into pc. Mirrors the stmdb in setup_spans_prologue.
 */
962#define setup_spans_epilogue() \
963 ldmia sp!, { r4 - r11, pc } \
964
965
/*
 * setup_spans_up_up(minor, major): complete function body for triangles
 * whose spans all run upwards from v_a, with v_b between v_a and v_c.
 *   height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
 *   height         = y_a - y_c
 * Lane 0 is the edge x_a -> x_c over the full height, lane 1 the edge
 * x_a -> x_b over height_minor_a; the alternate edge runs x_b -> x_c over
 * height_minor_b. `minor` / `major` are the tokens left / right: they choose
 * which lane feeds the left and right side and which side the alternate edge
 * replaces.
 *
 * setup_spans_up_left and setup_spans_up_right are the two instantiations
 * (minor edge on the left, respectively on the right). The return is part of
 * the macro (setup_spans_epilogue).
 */
966#define setup_spans_up_up(minor, major) \
967 setup_spans_prologue(); \
968 sub height_minor_a, y_a, y_b; \
969 sub height_minor_b, y_b, y_c; \
970 sub height, y_a, y_c; \
971 \
972 vdup.u32 x_starts, x_a; \
973 vmov.u32 x_ends, x_c, x_b; \
974 \
975 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
976 setup_spans_up(major, minor, minor, yes); \
977 setup_spans_epilogue() \
978
979function(setup_spans_up_left)
980 setup_spans_up_up(left, right)
981
982function(setup_spans_up_right)
983 setup_spans_up_up(right, left)
984
5d834c08 985.pool
75e28f62
E
986
/*
 * setup_spans_down_down(minor, major): complete function body for triangles
 * whose spans all run downwards from v_a, with v_b between v_a and v_c.
 *   height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
 *   height         = y_c - y_a
 * Lane 0 is the edge x_a -> x_c over the full height, lane 1 the edge
 * x_a -> x_b over height_minor_a; the alternate edge runs x_b -> x_c over
 * height_minor_b. `minor` / `major` are the tokens left / right and select
 * lanes and the side replaced by the alternate edge.
 *
 * setup_spans_down_left and setup_spans_down_right are the two
 * instantiations. The return is part of the macro (setup_spans_epilogue).
 */
987#define setup_spans_down_down(minor, major) \
988 setup_spans_prologue(); \
989 sub height_minor_a, y_b, y_a; \
990 sub height_minor_b, y_c, y_b; \
991 sub height, y_c, y_a; \
992 \
993 vdup.u32 x_starts, x_a; \
994 vmov.u32 x_ends, x_c, x_b; \
995 \
996 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
997 setup_spans_down(major, minor, minor, yes); \
998 setup_spans_epilogue() \
999
1000function(setup_spans_down_left)
1001 setup_spans_down_down(left, right)
1002
1003function(setup_spans_down_right)
1004 setup_spans_down_down(right, left)
1005
1006
/*
 * setup_spans_up_flat(): shared tail for upward triangles that need only two
 * edges. height = y_a - y_c; both edges use that height
 * (compute_edge_delta_x2); lane 0 is the left edge and lane 1 the right
 * edge; no alternate edge. Ends with the function return.
 *
 * The callers choose the edge end points before expanding the macro:
 *   setup_spans_up_a: edges x_a -> x_c (left) and x_b -> x_c (right)
 *   setup_spans_up_b: edges x_a -> x_b (left) and x_a -> x_c (right)
 */
1007#define setup_spans_up_flat() \
1008 sub height, y_a, y_c; \
1009 \
1010 compute_edge_delta_x2(); \
1011 setup_spans_up(left, right, none, no); \
1012 setup_spans_epilogue() \
1013
1014function(setup_spans_up_a)
1015 setup_spans_prologue()
1016
1017 vmov.u32 x_starts, x_a, x_b
1018 vdup.u32 x_ends, x_c
1019
1020 setup_spans_up_flat()
1021
1022function(setup_spans_up_b)
1023 setup_spans_prologue()
1024
1025 vdup.u32 x_starts, x_a
1026 vmov.u32 x_ends, x_b, x_c
1027
1028 setup_spans_up_flat()
1029
/*
 * setup_spans_down_flat(): shared tail for downward triangles that need only
 * two edges. height = y_c - y_a; both edges use that height
 * (compute_edge_delta_x2); lane 0 is the left edge and lane 1 the right
 * edge; no alternate edge. Ends with the function return.
 *
 * The callers choose the edge end points before expanding the macro:
 *   setup_spans_down_a: edges x_a -> x_c (left) and x_b -> x_c (right)
 *   setup_spans_down_b: edges x_a -> x_b (left) and x_a -> x_c (right)
 */
1030#define setup_spans_down_flat() \
1031 sub height, y_c, y_a; \
1032 \
1033 compute_edge_delta_x2(); \
1034 setup_spans_down(left, right, none, no); \
1035 setup_spans_epilogue() \
1036
1037function(setup_spans_down_a)
1038 setup_spans_prologue()
1039
1040 vmov.u32 x_starts, x_a, x_b
1041 vdup.u32 x_ends, x_c
1042
1043 setup_spans_down_flat()
1044
1045function(setup_spans_down_b)
1046 setup_spans_prologue()
1047
1048 vdup.u32 x_starts, x_a
1049 vmov.u32 x_ends, x_b, x_c
1050
1051 setup_spans_down_flat()
1052
1053
/*
 * Extra aliases for setup_spans_up_down, which keeps a second edge set
 * ("set b") for the downward half while the upward half is generated.
 *   - middle_y is r9, the register of height / height_major; it is written
 *     only after compute_edge_delta_x3 has consumed height_major.
 *   - edges_dx_dy_and_shifts (q1) is edges_dx_dy (d2) plus edge_shifts (d3);
 *     edges_dx_dy_and_shifts_b (q13) is the matching pair d26 / d27.
 *   - edges_xy_b (q11) shares its register with left_right_x_16 and q13 with
 *     the constants written by setup_spans_prologue_b, so set b must be
 *     copied out before the span loop and before setup_spans_prologue_b.
 *
 * setup_spans_up_down_load_edge_set_b(): copy set b into the working
 * registers edges_xy and edges_dx_dy / edge_shifts.
 */
1054#define middle_y r9
1055
1056#define edges_xy_b q11
1057#define edges_dx_dy_b d26
1058#define edge_shifts_b d27
1059#define edges_dx_dy_and_shifts_b q13
1060#define height_increment d20
1061
1062#define edges_dx_dy_and_shifts q1
1063
1064#define edges_xy_b_left d22
1065#define edges_xy_b_right d23
1066
1067#define setup_spans_up_down_load_edge_set_b() \
1068 vmov edges_xy, edges_xy_b; \
1069 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1070
1071
// setup_spans_up_down(psx_gpu, v_a, v_b, v_c)
//
// Span setup for triangles in which v_a lies vertically between v_b (above)
// and v_c (below):
//   height_minor_a = y_a - y_b   lines generated upwards from y_a - 1
//   height_minor_b = y_c - y_a   lines generated downwards from y_a
//   height_major   = y_c - y_b
// The upward half uses lane 0 = edge x_a -> x_b and lane 1 = edge
// x_c -> x_b, the latter pre-advanced by height_minor_b lines so that it
// starts on the y_a row. The downward half uses "edge set b": lane 0 is the
// alternate edge x_a -> x_c and lane 1 is the same long edge with its delta
// negated.
//
// Local labels: 0 = after a clip step, 1 = return, 2 = span loops,
// 3 = upward half empty, 4 = start of downward half, 5 = span count reached
// MAX_SPANS.
1072function(setup_spans_up_down)
1073 setup_spans_prologue()
1074
1075 // s32 middle_y = y_a;
1076 sub height_minor_a, y_a, y_b
1077 sub height_minor_b, y_c, y_a
1078 sub height_major, y_c, y_b
1079
1080 vmov.u32 x_starts, x_a, x_c
1081 vdup.u32 x_ends, x_b
1082
1083 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1084
 // Advance lane 1 only (lane 0 increment is 0) by height_minor_b lines.
1085 mov temp, #0
1086 vmov.u32 height_increment, temp, height_minor_b
1087 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1088
 // Build edge set b for the downward half.
1089 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1090 vmov edges_xy_b_right, edges_xy_right
1091
1092 vmov edge_shifts_b, edge_shifts
1093 vmov.u32 edge_shifts_b[0], edge_shift_alt
1094
1095 vneg.s32 edges_dx_dy_b, edges_dx_dy
1096 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1097
1098 mov middle_y, y_a
1099
 // Upward half: same clipping sequence as setup_spans_up without an
 // alternate edge, operating on height_minor_a.
1100 setup_spans_load_b()
1101 sub y_a, y_a, #1
1102
1103 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1104 subs temp, temp, y_b
1105 subgt height_minor_a, height_minor_a, temp
1106
1107 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1108 subs clip, y_a, temp
1109 ble 0f
1110
1111 sub height_minor_a, height_minor_a, clip
1112 sub y_a, y_a, clip
1113 setup_spans_clip(decrement, no)
1114
1115 0:
1116 cmp height_minor_a, #0
1117 ble 3f
1118
1119 orr temp, y_a, y_a, lsl #16
1120 sub temp, temp, #(1 << 16)
1121 sub y_a, temp, #2
1122 sub y_a, y_a, #(2 << 16)
1123 vmov.u32 y_x4, temp, y_a
1124
1125 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1126
1127 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1128
1129 setup_spans_adjust_edges_alternate_no(left, right);
1130 setup_spans_adjust_interpolants_up()
 // Set b has to leave q11 / q13 before the loop and prologue_b reuse them.
1131 setup_spans_up_down_load_edge_set_b()
1132
1133 setup_spans_prologue_b()
1134
1135
1136 2:
1137 setup_spans_set_x4_alternate_no(none, up)
1138 subs height_minor_a, height_minor_a, #4
1139 bhi 2b
1140
 // height_minor_a is now 0 or negative: rewind the three output pointers by
 // the entries written past the count (8, 16 and 4 bytes per span).
1141 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1142 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1143 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1144
 // Downward half: reload the interpolants and restart from the middle line.
1145 4:
1146 add temp, psx_gpu, #psx_gpu_uvrg_offset
1147 vld1.32 { uvrg }, [ temp ]
1148 mov y_a, middle_y
1149
1150 setup_spans_load_b()
1151
1152 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1153 subs y_c, y_c, temp
1154 subgt height_minor_b, height_minor_b, y_c
1155 addgt height_minor_b, height_minor_b, #1
1156
1157 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1158 subs clip, temp, y_a
1159 ble 0f
1160
1161 sub height_minor_b, height_minor_b, clip
1162 add y_a, y_a, clip
1163 setup_spans_clip(increment, no)
1164
1165 0:
1166 cmp height_minor_b, #0
1167 ble 1f
1168
1169 orr temp, y_a, y_a, lsl #16
1170 add temp, temp, #(1 << 16)
1171 add y_a, temp, #2
1172 add y_a, y_a, #(2 << 16)
1173 vmov.u32 y_x4, temp, y_a
1174
1175 setup_spans_adjust_edges_alternate_no(left, right)
1176
 // num_spans accumulates the spans of both halves.
1177 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1178 add temp, temp, height_minor_b
b7569147 1179
1180 cmp temp, #MAX_SPANS
1181 beq 5f
1182
75e28f62
E
1183 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1184
1185 2:
1186 setup_spans_set_x4_alternate_no(none, down)
1187 subs height_minor_b, height_minor_b, #4
1188 bhi 2b
1189
1190 1:
1191 setup_spans_epilogue()
1192
 // Upward half produced no spans: still load set b and the loop constants,
 // then continue with the downward half.
1193 3:
1194 setup_spans_up_down_load_edge_set_b()
1195 setup_spans_prologue_b()
1196 bal 4b
1197
 // Total is exactly MAX_SPANS: round height_minor_b down to a multiple of
 // four and store the reduced total. The bne below uses the flags set by
 // bics (add and strh leave them unchanged).
b7569147 1198 5:
1199 // FIXME: overflow corner case
1200 sub temp, temp, height_minor_b
1201 bics height_minor_b, #3
1202 add temp, temp, height_minor_b
1203 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1204 bne 2b
1205 bal 1b
1206
5d834c08 1207.pool
75e28f62
E
1208
/*
 * ARM core-register aliases for the setup_blocks_* routines that follow.
 * The names dropped here are redefined below with new register numbers.
 */
1209#undef span_uvrg_offset
1210#undef span_edge_data
1211#undef span_b_offset
1212#undef left_x
1213#undef b
1214
/* Primary roles, one per register (r0-r12, r14). */
1215#define psx_gpu r0
1216#define num_spans r1
1217#define span_uvrg_offset r2
1218#define span_edge_data r3
1219#define span_b_offset r4
1220#define b_dx r5
1221#define span_num_blocks r6
1222#define y r7
1223#define left_x r8
1224#define b r9
1225#define dither_offset_ptr r10
1226#define block_ptr_a r11
1227#define fb_ptr r12
1228#define num_blocks r14
1229
/*
 * Everything below re-uses a register that already has a name above
 * (r2, r3, r7, r8, r9 and r10 each carry several names), so writing one
 * alias overwrites the value held under the other names of that register.
 */
1230#define uvrg_dx_ptr r2
1231#define texture_mask_ptr r3
1232#define dither_shift r8
1233#define dither_row r10
1234
1235#define c_32 r7
1236#define b_dx4 r8
1237#define b_dx8 r9
1238#define block_ptr_b r10
1239
1240#define block_span_ptr r10
1241#define right_mask r8
1242
/* Colour scratch for the untextured, unshaded routines (r2-r5 again). */
1243#define color r2
1244#define color_r r3
1245#define color_g r4
1246#define color_b r5
/*
 * NEON register aliases for the textured / unshaded setup_blocks_*
 * routines. Many names share a Q register, or a D half of one, with other
 * names; each group comment below lists the overlaps that can be read off
 * these defines.
 */
1248#undef uvrg
1249
/* 32-bit per-pixel accumulators, four lanes each (q0-q4). */
1250#define u_block q0
1251#define v_block q1
1252#define r_block q2
1253#define g_block q3
1254#define b_block q4
1255
/* Halves of uvrg_dx4 (q5) and uvrg_dx8 (q6), then the two halves of q7. */
1256#define uv_dx4 d10
1257#define rg_dx4 d11
1258#define uv_dx8 d12
1259#define rg_dx8 d13
1260#define b_whole_8 d14
1261#define fb_mask_ptrs d15
1262
/* uv_dx8 / rg_dx8 are repeated here with the same values as above. */
1263#define uvrg_dx4 q5
1264#define uvrg_dx8 q6
1265#define uv_dx8 d12
1266#define rg_dx8 d13
1267
/* 16-bit results of narrowing the accumulators (q8-q12). */
1268#define u_whole q8
1269#define v_whole q9
1270#define r_whole q10
1271#define g_whole q11
1272#define b_whole q12
1273
1274#define u_whole_low d16
1275#define u_whole_high d17
1276#define v_whole_low d18
1277#define v_whole_high d19
1278#define r_whole_low d20
1279#define r_whole_high d21
1280#define g_whole_low d22
1281#define g_whole_high d23
1282#define b_whole_low d24
1283#define b_whole_high d25
1284
/* Both step temporaries are the same register (q13). */
1285#define dx4 q13
1286#define dx8 q13
1287
/*
 * 8-bit results. u/v live in q13 (= dx4/dx8); u_whole_8b, r_whole_8 and
 * g_whole_8 live in q12 (= b_whole).
 */
1288#define u_whole_8 d26
1289#define v_whole_8 d27
1290#define u_whole_8b d24
1291#define r_whole_8 d24
1292#define g_whole_8 d25
1293
1294#define uv_whole_8 q13
1295#define uv_whole_8b q14
1296
/* dither_offsets shares q14 with uv_whole_8b. */
1297#define dither_offsets q14
1298#define texture_mask q15
1299#define texture_mask_u d30
1300#define texture_mask_v d31
1301
1302#define dither_offsets_short d28
1303
/* Per-span setup temporaries; q8-q10 are the u/v/r "whole" registers. */
1304#define v_left_x q8
1305#define uvrg q9
1306#define block_span q10
1307
/* Halves of uvrg (q9). */
1308#define uv d18
1309#define rg d19
1310
/* draw_mask is v_block's register (q1), test_mask is u_block's (q0). */
1311#define draw_mask q1
1312#define draw_mask_edge q13
1313#define test_mask q0
1314
/* uvrg_dx is g_block's register (q3), colors is r_block's (q2). */
1315#define uvrg_dx q3
1316
1317#define colors q2
/*
 * Rearranges the 8-bit u/v texture coordinates of one block.
 *   u_whole_8b = u & texture_mask_u          (copy of u kept for the last step)
 *   u = (v << 4) | (u & 0x0f)                (vsli keeps the low 4 bits of u)
 *   v = (v & 0xf0) | (u_whole_8b >> 4)       (vsri keeps the high 4 bits of v)
 * Writes u_whole_8b (d24), which is also the register of r_whole_8.
 */
1319#define setup_blocks_texture_swizzled() \
1320 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1321 vsli.u8 u_whole_8, v_whole_8, #4; \
1322 vsri.u8 v_whole_8, u_whole_8b, #4 \
1323
/* Unswizzled variant: expands to nothing, u/v are left as they are. */
1324#define setup_blocks_texture_unswizzled() \
1326
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Expands to setup_blocks_shaded_textured_dithered_<swizzling>_indirect;
 * <swizzling> picks setup_blocks_texture_swizzled() or
 * setup_blocks_texture_unswizzled() as the u/v post-processing step.
 *
 * In:  r0 = psx_gpu. Returns straight away (before touching the stack)
 *      when num_spans is 0; otherwise r4-r11 and lr are pushed and the
 *      function returns by popping them into r4-r11 and pc.
 *
 * For every span entry (8 bytes of edge data, 16 bytes of uvrg, 4 bytes of
 * b) the code emits span_num_blocks blocks of 8 pixels each. fb_ptr starts
 * at vram_out_ptr + y * 2048 + left_x * 2 and grows by 16 per block. A
 * block takes 64 bytes, written through two pointers that both step by 32:
 *   +0  (block_ptr_a) u/v bytes interleaved by vst2.u8; for the final
 *                     block of a span they are zipped and the lanes
 *                     selected by draw_mask are cleared instead
 *   +16 (block_ptr_b) r bytes, g bytes
 *   +32 (block_ptr_a) b bytes, then fb_mask_ptrs: [0] = right_mask for the
 *                     final block of a span and 0 otherwise, [1] = fb_ptr
 *   +48 (block_ptr_b) dither offsets (bytes widened with vshll.s8 #4)
 *
 * Interpolants start at value + dx * left_x plus the per-pixel block_span
 * table; pixels 0-3 come from vshrn #16, pixels 4-7 from vaddhn with
 * 4 * dx, and the accumulators then advance by 8 * dx.
 *
 * The dither row is dither_table[y & 3] rotated right by (left_x & 3) * 8
 * bits.
 *
 * When the running block count exceeds MAX_BLOCKS (label 2) the buffer is
 * handed to flush_render_block_buffer and output restarts at the first
 * block slot with num_blocks = span_num_blocks.
 */
1327#define setup_blocks_shaded_textured_builder(swizzling) \
1328.align 3; \
1329 \
1330function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1331 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1332 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1333 \
1334 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1335 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1336 \
1337 cmp num_spans, #0; \
1338 bxeq lr; \
1339 \
1340 stmdb sp!, { r4 - r11, r14 }; \
1341 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1342 \
1343 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1344 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1345 \
1346 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1347 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1348 \
1349 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1350 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1351 \
1352 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1353 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1354 \
1355 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1356 \
1357 0: /* per-span loop */ \
1358 vmov.u8 fb_mask_ptrs, #0; \
1359 \
1360 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1361 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1362 \
1363 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1364 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1365 \
1366 cmp span_num_blocks, #0; \
1367 beq 1f; \
1368 \
1369 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1370 add num_blocks, span_num_blocks, num_blocks; \
1371 \
1372 cmp num_blocks, #MAX_BLOCKS; \
1373 bgt 2f; \
1374 \
1375 3: /* span setup; also the re-entry point after a flush */ \
1376 ldr b, [ span_b_offset ]; \
1377 add fb_ptr, fb_ptr, y, lsl #11; \
1378 \
1379 vdup.u32 v_left_x, left_x; \
1380 and y, y, #0x3; \
1381 \
1382 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1383 add fb_ptr, fb_ptr, left_x, lsl #1; \
1384 \
1385 mla b, b_dx, left_x, b; \
1386 and dither_shift, left_x, #0x03; \
1387 \
1388 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1389 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1390 \
1391 mov dither_shift, dither_shift, lsl #3; \
1392 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1393 \
1394 mov c_32, #32; \
1395 subs span_num_blocks, span_num_blocks, #1; /* Z is consumed by "beq 5f" below */ \
1396 \
1397 mov dither_row, dither_row, ror dither_shift; \
1398 mov b_dx4, b_dx, lsl #2; \
1399 \
1400 vdup.u32 dither_offsets_short, dither_row; \
1401 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1402 \
1403 vdup.u32 b_block, b; \
1404 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1405 \
1406 vdup.u32 u_block, uv[0]; \
1407 mov b_dx8, b_dx, lsl #3; \
1408 \
1409 vdup.u32 v_block, uv[1]; \
1410 vdup.u32 r_block, rg[0]; \
1411 vdup.u32 g_block, rg[1]; \
1412 \
1413 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1414 \
1415 vadd.u32 u_block, u_block, block_span; \
1416 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1417 \
1418 vadd.u32 v_block, v_block, block_span; \
1419 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1420 \
1421 vadd.u32 r_block, r_block, block_span; \
1422 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1423 \
1424 vadd.u32 g_block, g_block, block_span; \
1425 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1426 \
1427 vadd.u32 b_block, b_block, block_span; \
1428 add block_ptr_b, block_ptr_a, #16; \
1429 \
1430 vshrn.u32 u_whole_low, u_block, #16; \
1431 vshrn.u32 v_whole_low, v_block, #16; \
1432 vshrn.u32 r_whole_low, r_block, #16; \
1433 vshrn.u32 g_whole_low, g_block, #16; \
1434 \
1435 vdup.u32 dx4, uv_dx4[0]; \
1436 vshrn.u32 b_whole_low, b_block, #16; \
1437 \
1438 vaddhn.u32 u_whole_high, u_block, dx4; \
1439 vdup.u32 dx4, uv_dx4[1]; \
1440 \
1441 vaddhn.u32 v_whole_high, v_block, dx4; \
1442 vdup.u32 dx4, rg_dx4[0]; \
1443 \
1444 vaddhn.u32 r_whole_high, r_block, dx4; \
1445 vdup.u32 dx4, rg_dx4[1]; \
1446 \
1447 vaddhn.u32 g_whole_high, g_block, dx4; \
1448 vdup.u32 dx4, b_dx4; \
1449 \
1450 vaddhn.u32 b_whole_high, b_block, dx4; \
1451 vdup.u32 dx8, uv_dx8[0]; \
1452 \
1453 vadd.u32 u_block, u_block, dx8; \
1454 vdup.u32 dx8, uv_dx8[1]; \
1455 \
1456 vadd.u32 v_block, v_block, dx8; \
1457 vdup.u32 dx8, rg_dx8[0]; \
1458 \
1459 vadd.u32 r_block, r_block, dx8; \
1460 vdup.u32 dx8, rg_dx8[1]; \
1461 \
1462 vadd.u32 g_block, g_block, dx8; \
1463 vdup.u32 dx8, b_dx8; \
1464 \
1465 vadd.u32 b_block, b_block, dx8; \
1466 vmovn.u16 u_whole_8, u_whole; \
1467 \
1468 vmovn.u16 v_whole_8, v_whole; \
1469 \
1470 vmovn.u16 b_whole_8, b_whole; \
1471 pld [ fb_ptr ]; \
1472 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1473 \
1474 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1475 setup_blocks_texture_##swizzling(); \
1476 \
1477 vmovn.u16 r_whole_8, r_whole; \
1478 beq 5f; \
1479 \
1480 4: /* store one interior block while computing the next */ \
1481 vmovn.u16 g_whole_8, g_whole; \
1482 vshrn.u32 u_whole_low, u_block, #16; \
1483 \
1484 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1485 vshrn.u32 v_whole_low, v_block, #16; \
1486 \
1487 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1488 vshrn.u32 r_whole_low, r_block, #16; \
1489 \
1490 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1491 vshrn.u32 g_whole_low, g_block, #16; \
1492 \
1493 vdup.u32 dx4, uv_dx4[0]; \
1494 vshrn.u32 b_whole_low, b_block, #16; \
1495 \
1496 vaddhn.u32 u_whole_high, u_block, dx4; \
1497 vdup.u32 dx4, uv_dx4[1]; \
1498 \
1499 vaddhn.u32 v_whole_high, v_block, dx4; \
1500 vdup.u32 dx4, rg_dx4[0]; \
1501 \
1502 vaddhn.u32 r_whole_high, r_block, dx4; \
1503 vdup.u32 dx4, rg_dx4[1]; \
1504 \
1505 vaddhn.u32 g_whole_high, g_block, dx4; \
1506 vdup.u32 dx4, b_dx4; \
1507 \
1508 vaddhn.u32 b_whole_high, b_block, dx4; \
1509 vdup.u32 dx8, uv_dx8[0]; \
1510 \
1511 vadd.u32 u_block, u_block, dx8; \
1512 vdup.u32 dx8, uv_dx8[1]; \
1513 \
1514 vadd.u32 v_block, v_block, dx8; \
1515 vdup.u32 dx8, rg_dx8[0]; \
1516 \
1517 vadd.u32 r_block, r_block, dx8; \
1518 vdup.u32 dx8, rg_dx8[1]; \
1519 \
1520 vadd.u32 g_block, g_block, dx8; \
1521 vdup.u32 dx8, b_dx8; \
1522 \
1523 vadd.u32 b_block, b_block, dx8; \
1524 vmovn.u16 u_whole_8, u_whole; \
1525 \
1526 add fb_ptr, fb_ptr, #16; \
1527 vmovn.u16 v_whole_8, v_whole; \
1528 \
1529 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1530 vmovn.u16 b_whole_8, b_whole; \
1531 \
1532 pld [ fb_ptr ]; \
1533 \
1534 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1535 subs span_num_blocks, span_num_blocks, #1; \
1536 \
1537 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1538 setup_blocks_texture_##swizzling(); \
1539 \
1540 vmovn.u16 r_whole_8, r_whole; \
1541 bne 4b; \
1542 \
1543 5: /* final block of the span: apply right_mask */ \
1544 vmovn.u16 g_whole_8, g_whole; \
1545 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1546 \
1547 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1548 vdup.u8 draw_mask, right_mask; \
1549 \
1550 vmov.u32 fb_mask_ptrs[0], right_mask; \
1551 vtst.u16 draw_mask, draw_mask, test_mask; \
1552 vzip.u8 u_whole_8, v_whole_8; \
1553 \
1554 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1555 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1556 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1557 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1558 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1559 \
1560 1: /* advance to the next span entry */ \
1561 add span_uvrg_offset, span_uvrg_offset, #16; \
1562 add span_b_offset, span_b_offset, #4; \
1563 \
1564 add span_edge_data, span_edge_data, #8; \
1565 subs num_spans, num_spans, #1; \
1566 \
1567 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1568 bne 0b; \
1569 \
1570 ldmia sp!, { r4 - r11, pc }; \
1571 \
1572 2: /* block buffer full: flush it, then resume at 3 */ \
1573 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1574 vpush { texture_mask }; \
1575 vpush { uvrg_dx4 }; \
1576 \
1577 stmdb sp!, { r0 - r3, r12, r14 }; \
1578 bl flush_render_block_buffer; \
1579 ldmia sp!, { r0 - r3, r12, r14 }; \
1580 \
1581 vpop { uvrg_dx4 }; \
1582 vpop { texture_mask }; \
1583 \
1584 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1585 vmov.u8 fb_mask_ptrs, #0; \
1586 \
1587 mov num_blocks, span_num_blocks; \
1588 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1589 bal 3b \
1590
1591
1592setup_blocks_shaded_textured_builder(swizzled)
1593setup_blocks_shaded_textured_builder(unswizzled)
1594
1595
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Expands to setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 * Same overall structure as the shaded textured builder, but only u and v
 * are interpolated; r, g and b are never computed here.
 *
 * In:  r0 = psx_gpu. Returns straight away when num_spans is 0; otherwise
 *      r4-r11 and lr are pushed and popped into r4-r11 and pc on return.
 *
 * 64-byte block layout as written by this function:
 *   +0  (block_ptr_a) u/v bytes interleaved by vst2.u8; for the final
 *                     block of a span they are zipped and the lanes
 *                     selected by draw_mask are cleared instead
 *   +16               not written (block_ptr_b is stepped over it)
 *   +32 (block_ptr_a) b_whole_8, then fb_mask_ptrs: [0] = right_mask for
 *                     the final block of a span and 0 otherwise,
 *                     [1] = fb_ptr
 *   +48 (block_ptr_b) dither offsets (bytes widened with vshll.s8 #4)
 * NOTE(review): b_whole_8 (d14) is stored but never written in this
 * function, so those 8 bytes hold whatever d14 contained on entry -
 * presumably the consumer ignores them; confirm.
 *
 * Label 2 handles num_blocks exceeding MAX_BLOCKS by calling
 * flush_render_block_buffer and restarting at the first block slot.
 */
1596#define setup_blocks_unshaded_textured_builder(swizzling) \
1597.align 3; \
1598 \
1599function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1600 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1601 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1602 \
1603 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1604 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1605 \
1606 cmp num_spans, #0; \
1607 bxeq lr; \
1608 \
1609 stmdb sp!, { r4 - r11, r14 }; \
1610 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1611 \
1612 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1613 \
1614 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1615 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1616 \
1617 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1618 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1619 \
1620 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1621 \
1622 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1623 \
1624 0: /* per-span loop */ \
1625 vmov.u8 fb_mask_ptrs, #0; \
1626 \
1627 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1628 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1629 \
1630 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1631 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1632 \
1633 cmp span_num_blocks, #0; \
1634 beq 1f; \
1635 \
1636 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1637 add num_blocks, span_num_blocks, num_blocks; \
1638 \
1639 cmp num_blocks, #MAX_BLOCKS; \
1640 bgt 2f; \
1641 \
1642 3: /* span setup; also the re-entry point after a flush */ \
1643 add fb_ptr, fb_ptr, y, lsl #11; \
1644 \
1645 vdup.u32 v_left_x, left_x; \
1646 and y, y, #0x3; \
1647 \
1648 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1649 add fb_ptr, fb_ptr, left_x, lsl #1; \
1650 \
1651 and dither_shift, left_x, #0x03; \
1652 \
1653 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1654 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1655 \
1656 mov dither_shift, dither_shift, lsl #3; \
1657 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1658 \
1659 mov c_32, #32; \
1660 subs span_num_blocks, span_num_blocks, #1; /* Z is consumed by "beq 5f" below */ \
1661 \
1662 mov dither_row, dither_row, ror dither_shift; \
1663 \
1664 vdup.u32 dither_offsets_short, dither_row; \
1665 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1666 \
1667 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1668 \
1669 vdup.u32 u_block, uv[0]; \
1670 \
1671 vdup.u32 v_block, uv[1]; \
1672 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1673 \
1674 vadd.u32 u_block, u_block, block_span; \
1675 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1676 \
1677 vadd.u32 v_block, v_block, block_span; \
1678 add block_ptr_b, block_ptr_a, #16; \
1679 \
1680 vshrn.u32 u_whole_low, u_block, #16; \
1681 vshrn.u32 v_whole_low, v_block, #16; \
1682 \
1683 vdup.u32 dx4, uv_dx4[0]; \
1684 \
1685 vaddhn.u32 u_whole_high, u_block, dx4; \
1686 vdup.u32 dx4, uv_dx4[1]; \
1687 \
1688 vaddhn.u32 v_whole_high, v_block, dx4; \
1689 vdup.u32 dx8, uv_dx8[0]; \
1690 \
1691 vadd.u32 u_block, u_block, dx8; \
1692 vdup.u32 dx8, uv_dx8[1]; \
1693 \
1694 vadd.u32 v_block, v_block, dx8; \
1695 vmovn.u16 u_whole_8, u_whole; \
1696 \
1697 vmovn.u16 v_whole_8, v_whole; \
1698 \
1699 pld [ fb_ptr ]; \
1700 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1701 \
1702 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1703 setup_blocks_texture_##swizzling(); \
1704 \
1705 beq 5f; \
1706 \
1707 4: /* store one interior block while computing the next */ \
1708 vshrn.u32 u_whole_low, u_block, #16; \
1709 \
1710 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1711 vshrn.u32 v_whole_low, v_block, #16; \
1712 \
1713 add block_ptr_b, block_ptr_b, #32; /* skip the unused +16 slot */ \
1714 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1715 \
1716 vdup.u32 dx4, uv_dx4[0]; \
1717 vaddhn.u32 u_whole_high, u_block, dx4; \
1718 vdup.u32 dx4, uv_dx4[1]; \
1719 \
1720 vaddhn.u32 v_whole_high, v_block, dx4; \
1721 vdup.u32 dx8, uv_dx8[0]; \
1722 \
1723 vadd.u32 u_block, u_block, dx8; \
1724 vdup.u32 dx8, uv_dx8[1]; \
1725 \
1726 vadd.u32 v_block, v_block, dx8; \
1727 vmovn.u16 u_whole_8, u_whole; \
1728 \
1729 add fb_ptr, fb_ptr, #16; \
1730 vmovn.u16 v_whole_8, v_whole; \
1731 \
1732 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1733 pld [ fb_ptr ]; \
1734 \
1735 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1736 subs span_num_blocks, span_num_blocks, #1; \
1737 \
1738 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1739 setup_blocks_texture_##swizzling(); \
1740 \
1741 bne 4b; \
1742 \
1743 5: /* final block of the span: apply right_mask */ \
1744 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1745 \
1746 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1747 vdup.u8 draw_mask, right_mask; \
1748 \
1749 vmov.u32 fb_mask_ptrs[0], right_mask; \
1750 vtst.u16 draw_mask, draw_mask, test_mask; \
1751 vzip.u8 u_whole_8, v_whole_8; \
1752 \
1753 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1754 add block_ptr_b, block_ptr_b, #32; \
1755 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1756 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1757 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1758 \
1759 1: /* advance to the next span entry */ \
1760 add span_uvrg_offset, span_uvrg_offset, #16; \
1761 add span_edge_data, span_edge_data, #8; \
1762 subs num_spans, num_spans, #1; \
1763 \
1764 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1765 bne 0b; \
1766 \
1767 ldmia sp!, { r4 - r11, pc }; \
1768 \
1769 2: /* block buffer full: flush it, then resume at 3 */ \
1770 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1771 vpush { texture_mask }; \
1772 vpush { uvrg_dx4 }; \
1773 \
1774 stmdb sp!, { r0 - r3, r12, r14 }; \
1775 bl flush_render_block_buffer; \
1776 ldmia sp!, { r0 - r3, r12, r14 }; \
1777 \
1778 vpop { uvrg_dx4 }; \
1779 vpop { texture_mask }; \
1780 \
1781 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1782 vmov.u8 fb_mask_ptrs, #0; \
1783 \
1784 mov num_blocks, span_num_blocks; \
1785 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1786 bal 3b \
1787
1788
1789setup_blocks_unshaded_textured_builder(swizzled)
1790setup_blocks_unshaded_textured_builder(unswizzled)
1791
1792
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Fills the block buffer for a flat-coloured primitive.
 *
 * In:  r0 = psx_gpu. Returns straight away when num_spans is 0; otherwise
 *      r4-r11 and lr are pushed and popped into r4-r11 and pc on return.
 *
 * The 16-bit colour is built from psx_gpu's triangle_color word as
 * bits 3-7 | bits 11-15 << 5 | bits 19-23 << 10 and replicated into all
 * eight lanes of "colors".
 *
 * 64-byte block layout as written here:
 *   +0  (block_ptr_a) draw mask: all zero for interior blocks, and
 *                     vtst(right_mask bytes, test_mask) for the final
 *                     block of a span
 *   +16 (block_ptr_b) colors
 *   +32 (block_ptr_a) b_whole_8 and fb_mask_ptrs, with [1] = fb_ptr
 *   +48               not written (block_ptr_b is stepped over it)
 * NOTE(review): b_whole_8 (d14) and fb_mask_ptrs[0] are never written in
 * this function, so those bytes hold whatever q7 contained on entry -
 * presumably the consumer ignores them; confirm.
 *
 * Label 2 handles num_blocks exceeding MAX_BLOCKS by calling
 * flush_render_block_buffer and restarting at the first block slot.
 */
1793.align 3
1794
1795function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1796 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1797 veor.u32 draw_mask, draw_mask, draw_mask
1798
1799 cmp num_spans, #0
1800 bxeq lr
1801
1802 stmdb sp!, { r4 - r11, r14 }
1803 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1804
1805 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1806
 // Take the top 5 bits of each 8-bit channel.
1807 ubfx color_r, color, #3, #5
1808 ubfx color_g, color, #11, #5
1809 ubfx color_b, color, #19, #5
1810
1811 orr color, color_r, color_b, lsl #10
1812 orr color, color, color_g, lsl #5
1813
1814 vdup.u16 colors, color
1815
1816 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1817 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1818
 // First free slot: blocks + num_blocks * 64.
1819 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1820 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1821
 // Per-span loop.
1822 0:
1823 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1824 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1825
c1817bd9 1826 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1827
1828 cmp span_num_blocks, #0
1829 beq 1f
1830
1831 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1832 add num_blocks, span_num_blocks, num_blocks
1833
1834 cmp num_blocks, #MAX_BLOCKS
1835 bgt 2f
1836
 // Span setup; also the re-entry point after a flush.
 // fb_ptr = vram_out_ptr + y * 2048 + left_x * 2.
1837 3:
1838 add fb_ptr, fb_ptr, y, lsl #11
 // The masked y is not read again: its register (r7) is overwritten by
 // c_32 two instructions below.
1839 and y, y, #0x3
1840
1841 add fb_ptr, fb_ptr, left_x, lsl #1
1842 mov c_32, #32
1843
 // Z from this subs is consumed by "beq 5f" (single-block span).
1844 subs span_num_blocks, span_num_blocks, #1
1845
1846 add block_ptr_b, block_ptr_a, #16
1847 pld [ fb_ptr ]
1848
1849 vmov.u32 fb_mask_ptrs[1], fb_ptr
1850 beq 5f
1851
 // Interior blocks: zero draw mask, 16 bytes of framebuffer per block.
1852 4:
1853 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1854 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1855 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1856
1857 add fb_ptr, fb_ptr, #16
1858 add block_ptr_b, block_ptr_b, #32
1859
1860 pld [ fb_ptr ]
1861
1862 vmov.u32 fb_mask_ptrs[1], fb_ptr
1863 subs span_num_blocks, span_num_blocks, #1
1864
1865 bne 4b
1866
 // Final block of the span: draw mask derived from right_mask.
1867 5:
1868 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1869
1870 vdup.u8 draw_mask_edge, right_mask
1871 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1872
1873 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1874 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1875 add block_ptr_b, block_ptr_b, #32
1876 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1877
 // Advance to the next span entry (8 bytes of edge data each).
1878 1:
1879 add span_edge_data, span_edge_data, #8
1880 subs num_spans, num_spans, #1
1881
1882 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1883 bne 0b
1884
1885 ldmia sp!, { r4 - r11, pc }
1886
 // Block buffer full: flush it, keep "colors" across the call, rebuild
 // test_mask and the zero draw_mask, then resume at 3 with an empty buffer.
1887 2:
1888 vpush { colors }
1889
1890 stmdb sp!, { r0 - r3, r12, r14 }
1891 bl flush_render_block_buffer
1892 ldmia sp!, { r0 - r3, r12, r14 }
1893
1894 vpop { colors }
1895
1896 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1897 veor.u32 draw_mask, draw_mask, draw_mask
1898
1899 mov num_blocks, span_num_blocks
1900 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1901 bal 3b
1902
1903
/*
 * Extra aliases for the direct-to-framebuffer routines.
 * mask_msb_scalar shares r14 with num_blocks; msb_mask shares q15 with
 * texture_mask, and msb_mask_low/high are its two halves.
 */
1904#define mask_msb_scalar r14
1905
1906#define msb_mask q15
1907
1908#define pixels_low d16
1909
1910#define msb_mask_low d30
1911#define msb_mask_high d31
1912
1913
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Flat-colour fill that writes pixels straight to the framebuffer; it does
 * not use the block buffer or num_blocks.
 *
 * In:  r0 = psx_gpu. Returns straight away when num_spans is 0; otherwise
 *      r4-r11 and lr are pushed and popped into r4-r11 and pc on return.
 *
 * The 16-bit colour is bits 3-7 | bits 11-15 << 5 | bits 19-23 << 10 of
 * psx_gpu's triangle_color word, ORed with psx_gpu's mask_msb halfword.
 * "colors" holds it in all eight 16-bit lanes, and "color" ends up holding
 * it in both halves of the 32-bit register.
 *
 * Per span: every block except the last is a plain 16-byte store. The last
 * block is steered by the low byte of the edge entry's right_mask:
 *   right_mask == 0      -> full 16-byte store (label 5)
 *   low 4 bits all clear -> store 8 bytes, then shift the mask right by 4
 *   low 2 bits all clear -> store 4 bytes, then shift the mask right by 2
 *   low bit clear        -> store a final halfword
 */
1914.align 3
1915
1916function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1917 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1918
1919 cmp num_spans, #0
1920 bxeq lr
1921
1922 stmdb sp!, { r4 - r11, r14 }
1923
1924 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1925
1926 ubfx color_r, color, #3, #5
1927 ubfx color_g, color, #11, #5
1928
1929 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1930 ubfx color_b, color, #19, #5
1931
1932 orr color, color_r, color_b, lsl #10
1933 orr color, color, color_g, lsl #5
1934 orr color, color, mask_msb_scalar
1935
1936 vdup.u16 colors, color
1937
1938 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
3867c6ef
E
 // Two-operand form: color |= color << 16 (same pixel in both halves).
1939 orr color, color, lsl #16
1940
75e28f62
E
1941
 // Per-span loop.
1942 0:
1943 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1944 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1945
c1817bd9 1946 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1947
1948 cmp span_num_blocks, #0
1949 beq 1f
1950
1951 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1952
 // fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; Z from the subs picks
 // the single-block case (beq 3f).
1953 add fb_ptr, fb_ptr, y, lsl #11
1954 subs span_num_blocks, span_num_blocks, #1
1955
1956 add fb_ptr, fb_ptr, left_x, lsl #1
1957 beq 3f
1958
 // Full blocks: 16 bytes each, pointer written back.
1959 2:
1960 vst1.u32 { colors }, [ fb_ptr ]!
1961 subs span_num_blocks, span_num_blocks, #1
1962
1963 bne 2b
1964
 // Last block of the span.
1965 3:
1966 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
75e28f62 1967
3867c6ef
E
1968 cmp right_mask, #0x0
1969 beq 5f
1970
 // Each tst sets the flags for the conditional stores/shift that follow.
1971 tst right_mask, #0xF
1972 streq color, [ fb_ptr ], #4
1973 moveq right_mask, right_mask, lsr #4
1974 streq color, [ fb_ptr ], #4
1975
1976 tst right_mask, #0x3
1977 streq color, [ fb_ptr ], #4
1978 moveq right_mask, right_mask, lsr #2
1979
1980 tst right_mask, #0x1
1981 streqh color, [ fb_ptr ]
75e28f62
E
1982
 // Advance to the next span entry (8 bytes of edge data each).
1983 1:
1984 add span_edge_data, span_edge_data, #8
1985 subs num_spans, num_spans, #1
75e28f62
E
1986 bne 0b
1987
1988 ldmia sp!, { r4 - r11, pc }
1989
3867c6ef
E
 // right_mask == 0: the last block is stored in full.
1990 5:
1991 vst1.u32 { colors }, [ fb_ptr ]
1992 bal 1b
75e28f62
E
1993
1994
/*
 * Alias reset before the shaded untextured routines: c_64 is re-bound to
 * r7 (the register also named y and c_32), rg_dx_ptr is bound to r2, and
 * the NEON names listed below are dropped so they can be given new
 * registers.
 */
1995#undef c_64
1996
1997#define c_64 r7
1998#define rg_dx_ptr r2
1999
2000
2001#undef r_block
2002#undef g_block
2003#undef b_block
2004#undef r_whole
2005#undef g_whole
2006#undef b_whole
2007#undef r_whole_low
2008#undef r_whole_high
2009#undef g_whole_low
2010#undef g_whole_high
2011#undef b_whole_low
2012#undef b_whole_high
2013#undef r_whole_8
2014#undef g_whole_8
2015#undef b_whole_8
2016#undef dither_offsets
2017#undef rg_dx4
2018#undef rg_dx8
2019#undef dx4
2020#undef dx8
2021#undef v_left_x
2022#undef uvrg
2023#undef block_span
2024#undef rg
2025#undef draw_mask
2026#undef test_mask
2027
/*
 * NEON register aliases for the shaded untextured routines. As with the
 * earlier alias set, several names overlap; each group comment lists the
 * overlaps that can be read off these defines.
 */
/* 32-bit per-pixel accumulators (q0-q2). */
2028#define r_block q0
2029#define g_block q1
2030#define b_block q2
2031
/* 16-bit narrowed values (q3-q5) and their halves. */
2032#define r_whole q3
2033#define g_whole q4
2034#define b_whole q5
2035
2036#define r_whole_low d6
2037#define r_whole_high d7
2038#define g_whole_low d8
2039#define g_whole_high d9
2040#define b_whole_low d10
2041#define b_whole_high d11
2042
/* 8-bit g and b share one Q register (q6 = d12, d13); r sits in d14. */
2043#define gb_whole_8 q6
2044
2045#define g_whole_8 d12
2046#define b_whole_8 d13
2047
2048#define r_whole_8 d14
2049
2050#define pixels q8
2051
/* The two halves of q9. */
2052#define rg_dx4 d18
2053#define rg_dx8 d19
2054
/* Both step temporaries are the same register (q10). */
2055#define dx4 q10
2056#define dx8 q10
2057
/*
 * Per-span setup temporaries: v_left_x is r_whole_low (d6), uvrg is
 * g_whole (q4), block_span is b_whole (q5), rg is the high half of uvrg.
 */
2058#define v_left_x d6
2059#define uvrg q4
2060#define block_span q5
2061
2062#define rg d9
2063
/*
 * Constant registers, named <width>_<value>; the routines load them with
 * vmov.u8 (every byte = 1, 128, 4 and 0x7). d64_4 is the low half of
 * d128_4 (q12).
 */
2064#define d64_1 d22
2065#define d64_128 d23
2066
2067#define d128_4 q12
2068#define d128_0x7 q13
2069
2070#define d64_4 d24
2071
2072#define dither_offsets q14
2073#define draw_mask q15
2074
2075#define dither_offsets_low d28
2076
/* rg_dx is the low half of r_block (q0); test_mask shares q10 with dx4/dx8. */
2077#define rg_dx d0
2078#define test_mask q10
2080
/*
 * Dither step for the shaded untextured builders, split into two halves
 * so the builders can place other instructions between them.
 *
 * Half a: unsigned saturating add of the dither bytes to r (d14) and to
 * g/b (q6).
 */
2081#define setup_blocks_shaded_untextured_dither_a_dithered() \
2082 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2083 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2084
/*
 * Half b: unsigned saturating subtract of 4 from every byte. The indirect
 * builder adds 4 to the dither bytes when it sets up dither_offsets, so
 * this removes that bias again.
 */
2085#define setup_blocks_shaded_untextured_dither_b_dithered() \
2086 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2087 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2088
/* Undithered variants: both halves expand to nothing. */
2089#define setup_blocks_shaded_untextured_dither_a_undithered() \
2090
2091#define setup_blocks_shaded_untextured_dither_b_undithered() \
2092
2093
/*
 * setup_blocks_shaded_untextured_indirect_builder(dithering)
 *
 * Expands to setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect;
 * <dithering> picks the setup_blocks_shaded_untextured_dither_a/b_* pair.
 *
 * In:  r0 = psx_gpu. Returns straight away when num_spans is 0; otherwise
 *      r4-r11 and lr are pushed and popped into r4-r11 and pc on return.
 *
 * r and g step values come from the upper 8 bytes of psx_gpu's uvrg_dx
 * (uvrg_dx offset + 8); b uses the scalar b_dx. Per span the start values
 * are value + dx * left_x plus the per-pixel block_span table starting at
 * the r_block_span offset; pixels 0-3 come from vshrn #16, pixels 4-7 from
 * vaddhn with 4 * dx, and the accumulators then advance by 8 * dx.
 *
 * Pixel packing: r is shifted right by 3, g and b have their low 3 bits
 * cleared, and pixels = r * 1 + g * 4 + b * 128, which places the top
 * five bits of r, g and b at bits 0-4, 5-9 and 10-14.
 *
 * 64-byte block layout as written here (both pointers step by c_64):
 *   +0  (block_ptr_a) draw mask: zero for interior blocks, and
 *                     vtst(right_mask bytes, test_mask) for the final
 *                     block of a span
 *   +16 (block_ptr_b) the eight packed 16-bit pixels
 *   +44               fb_ptr (plain str relative to block_ptr_a)
 *
 * Label 2 handles num_blocks exceeding MAX_BLOCKS by calling
 * flush_render_block_buffer, reloading the byte constants and restarting
 * at the first block slot.
 */
2094#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2095.align 3; \
2096 \
2097function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2098 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2099 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2100 \
2101 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2102 \
2103 cmp num_spans, #0; \
2104 bxeq lr; \
2105 \
2106 stmdb sp!, { r4 - r11, r14 }; \
2107 vshl.u32 rg_dx4, rg_dx, #2; \
2108 \
2109 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2110 vshl.u32 rg_dx8, rg_dx, #3; \
2111 \
2112 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2113 \
2114 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2115 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2116 \
2117 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2118 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2119 \
2120 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2121 vmov.u8 d64_1, #1; \
2122 \
2123 vmov.u8 d128_4, #4; \
2124 vmov.u8 d64_128, #128; \
2125 \
2126 vmov.u8 d128_0x7, #0x7; \
2127 \
2128 0: /* per-span loop */ \
2129 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2130 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2131 \
2132 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2133 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2134 \
2135 cmp span_num_blocks, #0; \
2136 beq 1f; \
2137 \
2138 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2139 add num_blocks, span_num_blocks, num_blocks; \
2140 \
2141 cmp num_blocks, #MAX_BLOCKS; \
2142 bgt 2f; \
2143 \
2144 3: /* span setup; also the re-entry point after a flush */ \
2145 ldr b, [ span_b_offset ]; \
2146 add fb_ptr, fb_ptr, y, lsl #11; \
2147 \
2148 vdup.u32 v_left_x, left_x; \
2149 and y, y, #0x3; \
2150 \
2151 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2152 add fb_ptr, fb_ptr, left_x, lsl #1; \
2153 \
2154 mla b, b_dx, left_x, b; \
2155 and dither_shift, left_x, #0x03; \
2156 \
2157 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2158 vshr.u32 rg_dx, rg_dx4, #2; \
2159 \
2160 mov dither_shift, dither_shift, lsl #3; \
2161 vmla.u32 rg, rg_dx, v_left_x; \
2162 \
2163 mov c_64, #64; \
2164 subs span_num_blocks, span_num_blocks, #1; /* Z is consumed by "beq 5f" below */ \
2165 \
2166 mov dither_row, dither_row, ror dither_shift; \
2167 mov b_dx4, b_dx, lsl #2; \
2168 \
2169 vdup.u32 dither_offsets, dither_row; \
2170 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2171 \
2172 vdup.u32 b_block, b; \
2173 vadd.u8 dither_offsets, dither_offsets, d128_4; /* bias by 4; dither half b subtracts it again */ \
2174 \
2175 mov b_dx8, b_dx, lsl #3; \
2176 vdup.u32 r_block, rg[0]; \
2177 vdup.u32 g_block, rg[1]; \
2178 \
2179 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2180 \
2181 vadd.u32 r_block, r_block, block_span; \
2182 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2183 \
2184 vadd.u32 g_block, g_block, block_span; \
2185 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2186 \
2187 vadd.u32 b_block, b_block, block_span; \
2188 add block_ptr_b, block_ptr_a, #16; \
2189 \
2190 vshrn.u32 r_whole_low, r_block, #16; \
2191 vshrn.u32 g_whole_low, g_block, #16; \
2192 vshrn.u32 b_whole_low, b_block, #16; \
2193 vdup.u32 dx4, rg_dx4[0]; \
2194 \
2195 vaddhn.u32 r_whole_high, r_block, dx4; \
2196 vdup.u32 dx4, rg_dx4[1]; \
2197 \
2198 vaddhn.u32 g_whole_high, g_block, dx4; \
2199 vdup.u32 dx4, b_dx4; \
2200 \
2201 vaddhn.u32 b_whole_high, b_block, dx4; \
2202 vdup.u32 dx8, rg_dx8[0]; \
2203 \
2204 vadd.u32 r_block, r_block, dx8; \
2205 vdup.u32 dx8, rg_dx8[1]; \
2206 \
2207 vadd.u32 g_block, g_block, dx8; \
2208 vdup.u32 dx8, b_dx8; \
2209 \
2210 vadd.u32 b_block, b_block, dx8; \
2211 \
2212 vmovn.u16 r_whole_8, r_whole; \
2213 vmovn.u16 g_whole_8, g_whole; \
2214 vmovn.u16 b_whole_8, b_whole; \
2215 \
2216 beq 5f; \
2217 veor.u32 draw_mask, draw_mask, draw_mask; \
2218 \
2219 4: /* pack and store one interior block while computing the next */ \
2220 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2221 vshrn.u32 r_whole_low, r_block, #16; \
2222 \
2223 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2224 vshrn.u32 g_whole_low, g_block, #16; \
2225 \
2226 vshrn.u32 b_whole_low, b_block, #16; \
2227 str fb_ptr, [ block_ptr_a, #44 ]; \
2228 \
2229 vdup.u32 dx4, rg_dx4[0]; \
2230 vshr.u8 r_whole_8, r_whole_8, #3; \
2231 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2232 \
2233 vaddhn.u32 r_whole_high, r_block, dx4; \
2234 vdup.u32 dx4, rg_dx4[1]; \
2235 \
2236 vaddhn.u32 g_whole_high, g_block, dx4; \
2237 vdup.u32 dx4, b_dx4; \
2238 \
2239 vaddhn.u32 b_whole_high, b_block, dx4; \
2240 vdup.u32 dx8, rg_dx8[0]; \
2241 \
2242 vmull.u8 pixels, r_whole_8, d64_1; \
2243 vmlal.u8 pixels, g_whole_8, d64_4; \
2244 vmlal.u8 pixels, b_whole_8, d64_128; \
2245 \
2246 vadd.u32 r_block, r_block, dx8; \
2247 vdup.u32 dx8, rg_dx8[1]; \
2248 \
2249 vadd.u32 g_block, g_block, dx8; \
2250 vdup.u32 dx8, b_dx8; \
2251 \
2252 vadd.u32 b_block, b_block, dx8; \
2253 add fb_ptr, fb_ptr, #16; \
2254 \
2255 vmovn.u16 r_whole_8, r_whole; \
2256 vmovn.u16 g_whole_8, g_whole; \
2257 vmovn.u16 b_whole_8, b_whole; \
2258 \
2259 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2260 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2261 \
2262 pld [ fb_ptr ]; \
2263 \
2264 subs span_num_blocks, span_num_blocks, #1; \
2265 bne 4b; \
2266 \
2267 5: /* final block of the span: draw mask from right_mask */ \
2268 str fb_ptr, [ block_ptr_a, #44 ]; \
2269 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2270 \
2271 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2272 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2273 \
2274 vshr.u8 r_whole_8, r_whole_8, #3; \
2275 vdup.u8 draw_mask, right_mask; \
2276 \
2277 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2278 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2279 \
2280 vtst.u16 draw_mask, draw_mask, test_mask; \
2281 \
2282 vmull.u8 pixels, r_whole_8, d64_1; \
2283 vmlal.u8 pixels, g_whole_8, d64_4; \
2284 vmlal.u8 pixels, b_whole_8, d64_128; \
2285 \
2286 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2287 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2288 \
2289 1: /* advance to the next span entry */ \
2290 add span_uvrg_offset, span_uvrg_offset, #16; \
2291 add span_b_offset, span_b_offset, #4; \
2292 \
2293 add span_edge_data, span_edge_data, #8; \
2294 subs num_spans, num_spans, #1; \
2295 \
2296 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2297 bne 0b; \
2298 \
2299 ldmia sp!, { r4 - r11, pc }; \
2300 \
2301 2: /* block buffer full: flush it, then resume at 3 */ \
2302 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2303 vpush { rg_dx4 }; \
2304 \
2305 stmdb sp!, { r0 - r3, r12, r14 }; \
2306 bl flush_render_block_buffer; \
2307 ldmia sp!, { r0 - r3, r12, r14 }; \
2308 \
2309 vpop { rg_dx4 }; \
2310 \
2311 vmov.u8 d64_1, #1; \
2312 vmov.u8 d128_4, #4; \
2313 vmov.u8 d64_128, #128; \
2314 vmov.u8 d128_0x7, #0x7; \
2315 \
2316 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2317 \
2318 mov num_blocks, span_num_blocks; \
2319 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2320 bal 3b \
2321
2322
2323setup_blocks_shaded_untextured_indirect_builder(undithered)
2324setup_blocks_shaded_untextured_indirect_builder(dithered)
2325
2326
2327#undef draw_mask
2328
2329#define mask_msb_ptr r14
2330
2331#define draw_mask q0
2332#define pixels_low d16
3867c6ef 2333#define pixels_high d17
75e28f62
E
2334
2335
2336
/*
 * setup_blocks_shaded_untextured_<dithering>_unswizzled_direct
 *
 * Builder macro, instantiated below for "undithered" and "dithered".
 *
 * Walks num_spans entries of the span arrays inside psx_gpu.  For each
 * span with a non-zero block count it interpolates r, g and b across the
 * span in groups of eight pixels, packs every group to 16 bits per pixel
 * and stores it directly at fb_ptr (no block buffer is written).
 *
 * Packing as done by the code: r is shifted right by 3, g and b have
 * their low three bits cleared, and the three channels are accumulated
 * onto msb_mask with the multipliers held in d64_1, d64_4 and d64_128.
 * NOTE(review): d64_4 is presumably the low half of d128_4 (set to 4
 * here); its definition is outside this block - confirm.
 *
 * Loop 2 stores full groups of eight pixels; the last group of a span is
 * stored through the jump table after label 3 (one to eight pixels).
 *
 * The dither_a / dither_b helper macros and all register aliases are
 * defined elsewhere in the file.
 */
2337#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2338.align 3; \
2339 \
2340function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
2341 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2342 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2343 \
2344 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2345 \
 /* nothing to do when there are no spans; return before touching the stack */ \
2346 cmp num_spans, #0; \
2347 bxeq lr; \
2348 \
2349 stmdb sp!, { r4 - r11, r14 }; \
2350 vshl.u32 rg_dx4, rg_dx, #2; \
2351 \
2352 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2353 vshl.u32 rg_dx8, rg_dx, #3; \
2354 \
2355 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2356 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2357 \
2358 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2359 vmov.u8 d64_1, #1; \
2360 \
2361 vmov.u8 d128_4, #4; \
2362 vmov.u8 d64_128, #128; \
2363 \
2364 vmov.u8 d128_0x7, #0x7; \
2365 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
2366 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
2367 \
 /* 0: start of the per-span loop */ \
2368 0: \
2369 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2370 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2371 \
2372 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2373 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2374 \
 /* spans with zero blocks are skipped */ \
2375 cmp span_num_blocks, #0; \
2376 beq 1f; \
2377 \
 /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2 */ \
2378 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2379 add fb_ptr, fb_ptr, y, lsl #11; \
2380 \
2381 ldr b, [ span_b_offset ]; \
2382 vdup.u32 v_left_x, left_x; \
2383 and y, y, #0x3; \
2384 \
2385 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2386 add fb_ptr, fb_ptr, left_x, lsl #1; \
2387 \
2388 mla b, b_dx, left_x, b; \
2389 and dither_shift, left_x, #0x03; \
2390 \
2391 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2392 vshr.u32 rg_dx, rg_dx4, #2; \
2393 \
2394 mov dither_shift, dither_shift, lsl #3; \
2395 vmla.u32 rg, rg_dx, v_left_x; \
2396 \
 /* flags set here are consumed by the "beq 3f" further down; nothing */ \
 /* in between updates the condition flags */ \
2397 subs span_num_blocks, span_num_blocks, #1; \
2398 \
2399 mov dither_row, dither_row, ror dither_shift; \
2400 mov b_dx4, b_dx, lsl #2; \
2401 \
2402 vdup.u32 dither_offsets, dither_row; \
2403 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2404 \
2405 vdup.u32 b_block, b; \
2406 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2407 \
2408 mov b_dx8, b_dx, lsl #3; \
2409 vdup.u32 r_block, rg[0]; \
2410 vdup.u32 g_block, rg[1]; \
2411 \
2412 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2413 \
2414 vadd.u32 r_block, r_block, block_span; \
2415 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2416 \
2417 vadd.u32 g_block, g_block, block_span; \
2418 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2419 \
2420 vadd.u32 b_block, b_block, block_span; \
 /* NOTE(review): block_ptr_a is not set up anywhere in this macro and */ \
 /* block_ptr_b is not read afterwards - looks like a leftover; confirm */ \
2421 add block_ptr_b, block_ptr_a, #16; \
2422 \
2423 vshrn.u32 r_whole_low, r_block, #16; \
2424 vshrn.u32 g_whole_low, g_block, #16; \
2425 vshrn.u32 b_whole_low, b_block, #16; \
2426 vdup.u32 dx4, rg_dx4[0]; \
2427 \
2428 vaddhn.u32 r_whole_high, r_block, dx4; \
2429 vdup.u32 dx4, rg_dx4[1]; \
2430 \
2431 vaddhn.u32 g_whole_high, g_block, dx4; \
2432 vdup.u32 dx4, b_dx4; \
2433 \
2434 vaddhn.u32 b_whole_high, b_block, dx4; \
2435 vdup.u32 dx8, rg_dx8[0]; \
2436 \
2437 vadd.u32 r_block, r_block, dx8; \
2438 vdup.u32 dx8, rg_dx8[1]; \
2439 \
2440 vadd.u32 g_block, g_block, dx8; \
2441 vdup.u32 dx8, b_dx8; \
2442 \
2443 vadd.u32 b_block, b_block, dx8; \
2444 \
2445 vmovn.u16 r_whole_8, r_whole; \
2446 vmovn.u16 g_whole_8, g_whole; \
2447 vmovn.u16 b_whole_8, b_whole; \
2448 \
 /* only one block in this span: go straight to the partial store */ \
2449 beq 3f; \
2450 \
 /* 2: full 8-pixel groups; packs the current group while computing */ \
 /* the channel values of the next one */ \
2451 2: \
2452 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2453 vshrn.u32 r_whole_low, r_block, #16; \
2454 \
2455 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2456 vshrn.u32 g_whole_low, g_block, #16; \
2457 \
2458 vshrn.u32 b_whole_low, b_block, #16; \
2459 \
2460 vdup.u32 dx4, rg_dx4[0]; \
2461 vshr.u8 r_whole_8, r_whole_8, #3; \
2462 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2463 \
2464 vaddhn.u32 r_whole_high, r_block, dx4; \
2465 vdup.u32 dx4, rg_dx4[1]; \
2466 \
2467 vmov pixels, msb_mask; \
2468 vaddhn.u32 g_whole_high, g_block, dx4; \
2469 vdup.u32 dx4, b_dx4; \
2470 \
2471 vaddhn.u32 b_whole_high, b_block, dx4; \
2472 vdup.u32 dx8, rg_dx8[0]; \
2473 \
2474 vmlal.u8 pixels, r_whole_8, d64_1; \
2475 vmlal.u8 pixels, g_whole_8, d64_4; \
2476 vmlal.u8 pixels, b_whole_8, d64_128; \
2477 \
2478 vadd.u32 r_block, r_block, dx8; \
2479 vdup.u32 dx8, rg_dx8[1]; \
2480 \
2481 vadd.u32 g_block, g_block, dx8; \
2482 vdup.u32 dx8, b_dx8; \
2483 \
2484 vadd.u32 b_block, b_block, dx8; \
2485 \
2486 vmovn.u16 r_whole_8, r_whole; \
2487 vmovn.u16 g_whole_8, g_whole; \
2488 vmovn.u16 b_whole_8, b_whole; \
2489 \
2490 vst1.u32 { pixels }, [ fb_ptr ]!; \
2491 subs span_num_blocks, span_num_blocks, #1; \
2492 bne 2b; \
2493 \
 /* 3: last group of the span; only part of it may be stored */ \
2494 3: \
2495 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2496 \
3867c6ef 2497 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
75e28f62
E
2498 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2499 \
2500 vshr.u8 r_whole_8, r_whole_8, #3; \
 /* rbit + clz below = number of trailing zero bits in right_mask */ \
3867c6ef 2501 rbit right_mask, right_mask; \
75e28f62
E
2502 vmov pixels, msb_mask; \
2503 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2504 clz right_mask, right_mask; \
75e28f62
E
2505 \
2506 vmlal.u8 pixels, r_whole_8, d64_1; \
2507 vmlal.u8 pixels, g_whole_8, d64_4; \
2508 vmlal.u8 pixels, b_whole_8, d64_128; \
2509 \
3867c6ef
E
 /* Jump table indexed by the trailing-zero count.  In ARM state pc */ \
 /* reads as the address of this ldr plus 8 (the second nop), so an */ \
 /* index of 1 picks the first .word (4f) and 8 picks the last (11f). */ \
 /* NOTE(review): an index of 0 would load a nop encoding as the */ \
 /* target; presumably right_mask always has bit 0 clear - confirm. */ \
2510 ldr pc, [ pc, right_mask, lsl #2 ]; \
2511 nop; \
2512 nop; \
2513 .word 4f; \
2514 .word 5f; \
2515 .word 6f; \
2516 .word 7f; \
2517 .word 8f; \
2518 .word 9f; \
2519 .word 10f; \
2520 .word 11f; \
2521 \
 /* 4 .. 11: store 1, 2, 3, 4, 5, 6, 7 or 8 pixels respectively */ \
75e28f62 2522 4: \
3867c6ef
E
2523 vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \
2524 bal 1f; \
2525 \
2526 5: \
2527 vst1.u32 { pixels_low[0] }, [ fb_ptr ]; \
2528 bal 1f; \
2529 \
2530 6: \
2531 vst1.u32 { pixels_low[0] }, [ fb_ptr ]!; \
2532 vst1.u16 { pixels_low[2] }, [ fb_ptr ]; \
2533 bal 1f; \
2534 \
2535 7: \
2536 vst1.u32 { pixels_low }, [ fb_ptr ]; \
2537 bal 1f; \
2538 \
2539 8: \
2540 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2541 vst1.u16 { pixels_high[0] }, [ fb_ptr ]; \
2542 bal 1f; \
2543 \
2544 9: \
2545 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2546 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2547 bal 1f; \
2548 \
2549 10: \
2550 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2551 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2552 vst1.u16 { pixels_high[2] }, [ fb_ptr ]; \
2553 bal 1f; \
2554 \
2555 11: \
2556 vst1.u32 { pixels }, [ fb_ptr ]; \
2557 bal 1f; \
75e28f62
E
2558 \
 /* 1: advance to the next span (16 bytes of uvrg, 4 of b, 8 of edge data) */ \
2559 1: \
2560 add span_uvrg_offset, span_uvrg_offset, #16; \
2561 add span_b_offset, span_b_offset, #4; \
2562 \
2563 add span_edge_data, span_edge_data, #8; \
2564 subs num_spans, num_spans, #1; \
2565 \
2566 bne 0b; \
2567 \
2568 ldmia sp!, { r4 - r11, pc } \
2569
/* Emit the undithered and dithered variants of the direct span renderer. */
2570setup_blocks_shaded_untextured_direct_builder(undithered)
2571setup_blocks_shaded_untextured_direct_builder(dithered)
2572
2573
2574#undef psx_gpu
2575#undef num_blocks
2576#undef triangle
2577#undef c_64
2578
2579#define psx_gpu r0
2580#define block_ptr r1
2581#define num_blocks r2
2582#define uv_01 r3
2583#define uv_23 r4
2584#define uv_45 r5
2585#define uv_67 r6
2586#define uv_0 r7
2587#define uv_1 r3
2588#define uv_2 r8
2589#define uv_3 r4
2590#define uv_4 r9
2591#define uv_5 r5
2592#define uv_6 r10
2593#define uv_7 r6
2594#define texture_ptr r11
2595
2596#define pixel_0 r7
2597#define pixel_1 r3
2598#define pixel_2 r8
2599#define pixel_3 r4
2600#define pixel_4 r9
2601#define pixel_5 r5
2602#define pixel_6 r10
2603#define pixel_7 r6
2604
2605#define pixels_a r7
2606#define pixels_b r9
2607#define pixels_c r8
2608#define pixels_d r10
2609
2610#define c_64 r0
2611
2612#define clut_ptr r12
2613#define current_texture_mask r5
2614#define dirty_textures_mask r6
2615
2616#define texels d0
2617
2618#define clut_low_a d2
2619#define clut_low_b d3
2620#define clut_high_a d4
2621#define clut_high_b d5
2622
2623#define clut_a q1
2624#define clut_b q2
2625
2626#define texels_low d6
2627#define texels_high d7
2628
2629.align 3
2630
/* texture_blocks_untextured: no-op, returns to the caller immediately. */
2631function(texture_blocks_untextured)
2632 bx lr
2633
2634
2635.align 3
2636
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight 16-bit texture
 * offsets from the start of the block, fetches one byte per offset from
 * the texture page, translates the eight bytes through the 16-entry CLUT
 * and writes the eight resulting 16-bit texels back over the offsets.
 *
 * The 32 CLUT bytes are loaded once and split with vuzp.u8 so that clut_a
 * holds the low byte and clut_b the high byte of every entry; vtbl.8 then
 * looks both halves up and vst2.u8 re-interleaves them.
 * NOTE(review): this assumes the fetched bytes are already CLUT indices
 * in the range 0..15 (vtbl yields 0 for larger indices) - confirm against
 * update_texture_4bpp_cache.
 *
 * If the current texture is flagged dirty, update_texture_4bpp_cache is
 * called first (label 1).  psx_gpu is still in r0 at that point because
 * the branch is taken before c_64 (also r0) is written.
 *
 * r0 is overwritten with the constant 64 (c_64) for the main loop.
 */
2637function(texture_blocks_4bpp)
2638 stmdb sp!, { r3 - r11, r14 }
2639 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2640
2641 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2642 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2643
2644 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2645 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]
2646
2647 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2648 vuzp.u8 clut_a, clut_b
2649
2650 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
2651 tst dirty_textures_mask, current_texture_mask
2652
2653 bne 1f
2654 mov c_64, #64
2655
26560:
2657 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2658
     /* uxtah: texture_ptr + one zero-extended 16-bit offset each */
2659 uxtah uv_0, texture_ptr, uv_01
2660 uxtah uv_1, texture_ptr, uv_01, ror #16
2661
2662 uxtah uv_2, texture_ptr, uv_23
2663 uxtah uv_3, texture_ptr, uv_23, ror #16
2664
2665 uxtah uv_4, texture_ptr, uv_45
2666 ldrb pixel_0, [ uv_0 ]
2667
2668 uxtah uv_5, texture_ptr, uv_45, ror #16
2669 ldrb pixel_1, [ uv_1 ]
2670
2671 uxtah uv_6, texture_ptr, uv_67
2672 ldrb pixel_2, [ uv_2 ]
2673
2674 uxtah uv_7, texture_ptr, uv_67, ror #16
2675 ldrb pixel_3, [ uv_3 ]
2676
2677 ldrb pixel_4, [ uv_4 ]
     /* flags from this subs feed the bne at the end of the loop */
2678 subs num_blocks, num_blocks, #1
2679
2680 ldrb pixel_5, [ uv_5 ]
2681 orr pixels_a, pixel_0, pixel_1, lsl #8
2682
2683 ldrb pixel_6, [ uv_6 ]
2684 orr pixels_b, pixel_4, pixel_5, lsl #8
2685
2686 ldrb pixel_7, [ uv_7 ]
2687 orr pixels_a, pixels_a, pixel_2, lsl #16
2688
2689 orr pixels_b, pixels_b, pixel_6, lsl #16
2690 orr pixels_a, pixels_a, pixel_3, lsl #24
2691
2692 orr pixels_b, pixels_b, pixel_7, lsl #24
2693 vmov.u32 texels, pixels_a, pixels_b
2694
2695 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2696 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2697
2698 vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
2699 bne 0b
2700
2701 ldmia sp!, { r3 - r11, pc }
2702
27031:
2704 stmdb sp!, { r1 - r2 }
     /*
      * The de-interleaved CLUT lives in d2-d5, which the procedure call
      * standard lets the callee overwrite; keep it across the call.
      * 32 bytes, so sp stays 8-byte aligned for the call.
      */
     vpush { clut_low_a, clut_low_b, clut_high_a, clut_high_b }
2705 bl update_texture_4bpp_cache
     vpop { clut_low_a, clut_low_b, clut_high_a, clut_high_b }
2706
2707 mov c_64, #64
2708 ldmia sp!, { r1 - r2 }
2709 bal 0b
2710
2711
2712.align 3
2713
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight 16-bit texture
 * offsets from the start of the block, fetches one index byte per offset
 * from the texture page, loads the 16-bit CLUT entry at
 * clut_ptr + index * 2 and writes the eight texels back over the offsets.
 *
 * If the current texture is flagged dirty, update_texture_8bpp_cache is
 * called first (label 1) with psx_gpu still in r0.
 */
2714function(texture_blocks_8bpp)
2715 stmdb sp!, { r3 - r11, r14 }
2716 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2717
2718 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2719 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2720
2721 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2722 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2723
2724 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
2725 tst dirty_textures_mask, current_texture_mask
2726
2727 bne 1f
2728 nop
2729
27300:
2731 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2732
     /* uxtah: texture_ptr + one zero-extended 16-bit offset each */
2733 uxtah uv_0, texture_ptr, uv_01
2734 uxtah uv_1, texture_ptr, uv_01, ror #16
2735
2736 uxtah uv_2, texture_ptr, uv_23
2737 uxtah uv_3, texture_ptr, uv_23, ror #16
2738
2739 uxtah uv_4, texture_ptr, uv_45
2740 ldrb pixel_0, [ uv_0 ]
2741
2742 uxtah uv_5, texture_ptr, uv_45, ror #16
2743 ldrb pixel_1, [ uv_1 ]
2744
2745 uxtah uv_6, texture_ptr, uv_67
2746 ldrb pixel_2, [ uv_2 ]
2747
2748 uxtah uv_7, texture_ptr, uv_67, ror #16
2749 ldrb pixel_3, [ uv_3 ]
2750
     /* index * 2 = byte offset of the 16-bit CLUT entry */
2751 ldrb pixel_4, [ uv_4 ]
2752 add pixel_0, pixel_0, pixel_0
2753
2754 ldrb pixel_5, [ uv_5 ]
2755 add pixel_1, pixel_1, pixel_1
2756
2757 ldrb pixel_6, [ uv_6 ]
2758 add pixel_2, pixel_2, pixel_2
2759
2760 ldrb pixel_7, [ uv_7 ]
2761 add pixel_3, pixel_3, pixel_3
2762
2763 ldrh pixel_0, [ clut_ptr, pixel_0 ]
2764 add pixel_4, pixel_4, pixel_4
2765
2766 ldrh pixel_1, [ clut_ptr, pixel_1 ]
2767 add pixel_5, pixel_5, pixel_5
2768
2769 ldrh pixel_2, [ clut_ptr, pixel_2 ]
2770 add pixel_6, pixel_6, pixel_6
2771
2772 ldrh pixel_3, [ clut_ptr, pixel_3 ]
2773 add pixel_7, pixel_7, pixel_7
2774
2775 ldrh pixel_4, [ clut_ptr, pixel_4 ]
2776 orr pixels_a, pixel_0, pixel_1, lsl #16
2777
2778 ldrh pixel_5, [ clut_ptr, pixel_5 ]
2779 orr pixels_c, pixel_2, pixel_3, lsl #16
2780
2781 ldrh pixel_6, [ clut_ptr, pixel_6 ]
     /* flags from this subs feed the bne at the end of the loop */
2782 subs num_blocks, num_blocks, #1
2783
2784 ldrh pixel_7, [ clut_ptr, pixel_7 ]
2785 orr pixels_b, pixel_4, pixel_5, lsl #16
2786
2787 orr pixels_d, pixel_6, pixel_7, lsl #16
2788 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
2789
2790 add block_ptr, block_ptr, #64
2791 bne 0b
2792
2793 ldmia sp!, { r3 - r11, pc }
2794
27951:
     /*
      * Save the live caller-saved registers (block_ptr, num_blocks,
      * clut_ptr) across the call.  r0 is pushed as well so that an even
      * number of words goes on the stack and sp stays 8-byte aligned at
      * the call, as the procedure call standard requires; the main loop
      * does not read r0, so restoring it is harmless.
      */
2796 stmdb sp!, { r0 - r2, r12 }
2797
2798 bl update_texture_8bpp_cache
2799
2800 ldmia sp!, { r0 - r2, r12 }
2801 bal 0b
2802
2803
2804#undef uv_0
2805#undef uv_1
2806#undef uv_2
2807#undef uv_3
2808#undef uv_4
2809#undef uv_5
2810#undef uv_6
2811#undef uv_7
2812
2813#undef pixel_0
2814#undef pixel_1
2815#undef pixel_2
2816#undef pixel_3
2817#undef pixel_4
2818#undef pixel_5
2819#undef pixel_6
2820#undef pixel_7
2821
2822#undef texture_ptr
2823
2824#undef pixels_a
2825#undef pixels_b
2826#undef pixels_c
2827#undef pixels_d
2828
2829#define psx_gpu r0
2830#define block_ptr r1
2831#define num_blocks r2
2832
2833#define uv_0 r3
2834#define uv_1 r4
2835#define u_0 r3
2836#define u_1 r4
2837#define v_0 r5
2838#define v_1 r6
2839
2840#define uv_2 r5
2841#define uv_3 r6
2842#define u_2 r5
2843#define u_3 r6
2844#define v_2 r7
2845#define v_3 r8
2846
2847#define uv_4 r7
2848#define uv_5 r8
2849#define u_4 r7
2850#define u_5 r8
2851#define v_4 r9
2852#define v_5 r10
2853
2854#define uv_6 r9
2855#define uv_7 r10
2856#define u_6 r9
2857#define u_7 r10
2858#define v_6 r11
2859#define v_7 r0
2860
2861#define pixel_0 r3
2862#define pixel_1 r4
2863#define pixel_2 r5
2864#define pixel_3 r6
2865#define pixel_4 r7
2866#define pixel_5 r8
2867#define pixel_6 r9
2868#define pixel_7 r10
2869
2870#define pixels_a r3
2871#define pixels_b r5
2872#define pixels_c r7
2873#define pixels_d r9
2874
2875#define texture_ptr r12
2876
2877
2878.align 3
2879
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight 16-bit uv values from
 * the start of the block, splits each into u (low byte) and v (high
 * byte), and loads the 16-bit texel at
 *     texture_ptr + ((u + ((uv & 0xFF00) << 2)) * 2)
 * i.e. texture_ptr + v * 2048 + u * 2.  The eight texels are written back
 * over the uv values.
 *
 * Registers are reused aggressively: uv_N, u_N and pixel_N share a
 * register, v_N borrows the register of a later uv, and v_7 is r0, so
 * psx_gpu is overwritten inside the loop (it is not read again after the
 * two loads in the prologue).
 */
2880function(texture_blocks_16bpp)
2881 stmdb sp!, { r3 - r11, r14 }
2882 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2883
2884 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2885 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2886
28870:
2888 ldrh uv_0, [ block_ptr ]
     /* flags from this subs feed the bne at the end of the loop; no */
     /* instruction in between updates the condition flags */
2889 subs num_blocks, num_blocks, #1
2890
2891 ldrh uv_1, [ block_ptr, #2 ]
2892
2893 and v_0, uv_0, #0xFF00
2894 and v_1, uv_1, #0xFF00
2895
2896 and u_0, uv_0, #0xFF
2897 and u_1, uv_1, #0xFF
2898
2899 add uv_0, u_0, v_0, lsl #2
2900 ldrh uv_2, [ block_ptr, #4 ]
2901
2902 add uv_1, u_1, v_1, lsl #2
2903 ldrh uv_3, [ block_ptr, #6 ]
2904
     /* double the texel index to get a byte offset */
2905 add uv_0, uv_0, uv_0
2906 add uv_1, uv_1, uv_1
2907
2908 and v_2, uv_2, #0xFF00
2909 and v_3, uv_3, #0xFF00
2910
2911 and u_2, uv_2, #0xFF
2912 and u_3, uv_3, #0xFF
2913
2914 add uv_2, u_2, v_2, lsl #2
2915 ldrh uv_4, [ block_ptr, #8 ]
2916
2917 add uv_3, u_3, v_3, lsl #2
2918 ldrh uv_5, [ block_ptr, #10 ]
2919
2920 add uv_2, uv_2, uv_2
2921 add uv_3, uv_3, uv_3
2922
2923 and v_4, uv_4, #0xFF00
2924 and v_5, uv_5, #0xFF00
2925
2926 and u_4, uv_4, #0xFF
2927 and u_5, uv_5, #0xFF
2928
2929 add uv_4, u_4, v_4, lsl #2
2930 ldrh uv_6, [ block_ptr, #12 ]
2931
2932 add uv_5, u_5, v_5, lsl #2
2933 ldrh uv_7, [ block_ptr, #14 ]
2934
2935 add uv_4, uv_4, uv_4
2936 ldrh pixel_0, [ texture_ptr, uv_0 ]
2937
2938 add uv_5, uv_5, uv_5
2939 ldrh pixel_1, [ texture_ptr, uv_1 ]
2940
2941 and v_6, uv_6, #0xFF00
2942 ldrh pixel_2, [ texture_ptr, uv_2 ]
2943
     /* v_7 is r0: psx_gpu is gone from here on */
2944 and v_7, uv_7, #0xFF00
2945 ldrh pixel_3, [ texture_ptr, uv_3 ]
2946
2947 and u_6, uv_6, #0xFF
2948 ldrh pixel_4, [ texture_ptr, uv_4 ]
2949
2950 and u_7, uv_7, #0xFF
2951 ldrh pixel_5, [ texture_ptr, uv_5 ]
2952
2953 add uv_6, u_6, v_6, lsl #2
2954 add uv_7, u_7, v_7, lsl #2
2955
2956 add uv_6, uv_6, uv_6
2957 add uv_7, uv_7, uv_7
2958
     /* pack two 16-bit texels per word, low texel first */
2959 orr pixels_a, pixel_0, pixel_1, lsl #16
2960 orr pixels_b, pixel_2, pixel_3, lsl #16
2961
2962 ldrh pixel_6, [ texture_ptr, uv_6 ]
2963 orr pixels_c, pixel_4, pixel_5, lsl #16
2964
2965 ldrh pixel_7, [ texture_ptr, uv_7 ]
2966 orr pixels_d, pixel_6, pixel_7, lsl #16
2967
2968 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2969 add block_ptr, block_ptr, #64
2970
2971 bne 0b
2972
2973 ldmia sp!, { r3 - r11, pc }
2974
2975
2976#undef num_blocks
2977
2978#undef test_mask
2979#undef texels
2980#undef pixels_b
2981#undef pixels
2982#undef d64_1
2983#undef d64_4
2984#undef d64_128
2985#undef draw_mask
2986#undef msb_mask
2987#undef msb_mask_low
2988#undef msb_mask_high
2989#undef fb_pixels
2990
2991#undef c_32
2992#undef fb_ptr
2993#undef mask_msb_ptr
2994
2995#define psx_gpu r0
2996#define num_blocks r1
2997#define color_ptr r2
3867c6ef
E
2998#define colors_scalar r2
2999#define colors_scalar_compare r3
75e28f62
E
3000#define mask_msb_ptr r2
3001
3002#define block_ptr_load_a r0
3003#define block_ptr_store r3
3004#define block_ptr_load_b r12
3005#define c_32 r2
3006
3007#define c_48 r4
3008#define fb_ptr r14
3009#define draw_mask_bits_scalar r5
3010
3011#define d128_0x07 q0
3012#define d128_0x1F q1
3013#define d128_0x8000 q2
3014#define test_mask q3
3015#define texels q4
3016#define colors_rg q5
3017#define colors_b_dm_bits q6
3018#define texels_rg q7
3019#define pixels_r q8
3020#define pixels_g q9
3021#define pixels_b q10
3022#define pixels q11
3023#define zero_mask q4
3024#define draw_mask q12
3025#define msb_mask q13
3026
3027#define fb_pixels q8
3028
3029#define pixels_gb_low q9
3030
3031#define colors_r d10
3032#define colors_g d11
3033#define colors_b d12
3034#define draw_mask_bits d13
3035#define texels_r d14
3036#define texels_g d15
3037#define pixels_r_low d16
3038#define pixels_g_low d18
3039#define pixels_b_low d19
3040#define msb_mask_low d26
3041#define msb_mask_high d27
3042
3043#define d64_1 d28
3044#define d64_4 d29
3045#define d64_128 d30
3046#define texels_b d31
3047
/*
 * Fragment macros for shade_blocks_textured_modulated_builder.  The
 * builder selects one variant of each fragment by token pasting its
 * shading (shaded / unshaded), dithering (dithered / undithered) and
 * target (direct / indirect) arguments.  Variants with an empty body
 * expand to nothing.
 */

/* indirect target: stride constant and store pointer into the block buffer */
3048#define shade_blocks_textured_modulated_prologue_indirect() \
3049 mov c_48, #48; \
3050 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3051
/* direct target: broadcast the 16-bit mask_msb value to all msb_mask lanes */
3052#define shade_blocks_textured_modulated_prologue_direct() \
3053 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3054 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \
3055
75e28f62 3056
3867c6ef
E
/* shaded: no per-function colour setup (colours are loaded per block) */
3057#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3058
/*
 * undithered + unshaded: if the triangle colour word equals 0x00808080,
 * branch to the unmodulated shader for the same target instead.  This
 * runs before anything is pushed, so the branch acts as a tail call.
 */
3059#define shade_blocks_textured_false_modulation_check_undithered(target) \
3060 ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ]; \
3061 movw colors_scalar_compare, #0x8080; \
3062 \
3063 movt colors_scalar_compare, #0x80; \
3064 cmp colors_scalar, colors_scalar_compare; \
3065 beq shade_blocks_textured_unmodulated_##target \
3066
/* dithered: no shortcut */
3067#define shade_blocks_textured_false_modulation_check_dithered(target) \
3068
/*
 * unshaded: load the 32-bit triangle colour and broadcast its bytes 0, 1
 * and 2 into colors_r, colors_g and colors_b.  colors_r is written last
 * because it is also the source of the other two.
 */
3069#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3070 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62
E
3071 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
3072 vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
3073 vdup.u8 colors_g, colors_r[1]; \
3074 vdup.u8 colors_b, colors_r[2]; \
3075 vdup.u8 colors_r, colors_r[0] \
3076
3077
/*
 * dithered: preload the accumulator with the 16 bytes at block_ptr_load_b
 * (the same address for all three channels; the "last" variant also
 * advances the pointer by c_32).
 * undithered: nothing to load; only the pointer advance is kept.
 */
3078#define shade_blocks_textured_modulated_load_dithered(target) \
3079 vld1.u32 { target }, [ block_ptr_load_b, :128 ] \
3080
3081#define shade_blocks_textured_modulated_load_last_dithered(target) \
3082 vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \
3083
3084#define shade_blocks_textured_modulated_load_undithered(target) \
3085
3086#define shade_blocks_textured_modulated_load_last_undithered(target) \
3087 add block_ptr_load_b, block_ptr_load_b, #32 \
3088
/* dithered accumulates texel * colour onto the preloaded value; */
/* undithered just multiplies */
3089#define shade_blocks_textured_modulate_dithered(channel) \
3090 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3091
3092#define shade_blocks_textured_modulate_undithered(channel) \
3093 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3094
3095
/* indirect: write the draw mask to the block buffer (offset is unused) */
3096#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
3097 vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \
3098
/*
 * direct: fetch the framebuffer pointer stored relative to
 * block_ptr_load_b, read the existing pixels and copy them into pixels
 * wherever draw_mask bits are set (vbit), so masked pixels keep their
 * framebuffer value.
 */
3099#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
3100 ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
3101 vld1.u32 { fb_pixels }, [ fb_ptr ]; \
3102 vbit.u16 pixels, fb_pixels, draw_mask \
3103
3104#define shade_blocks_textured_modulated_store_pixels_indirect() \
3105 vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \
3106
3107#define shade_blocks_textured_modulated_store_pixels_direct() \
3108 vst1.u32 { pixels }, [ fb_ptr ] \
3109
3110
/* shaded: per-block r/g colours come from the block; unshaded only */
/* advances the pointer by the same 32 bytes */
3111#define shade_blocks_textured_modulated_load_rg_shaded() \
3112 vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \
3113
3114#define shade_blocks_textured_modulated_load_rg_unshaded() \
3115 add block_ptr_load_b, block_ptr_load_b, #32 \
3116
/* shaded: load b colours plus the draw mask bits as one 16-byte vector; */
/* unshaded: load only the word at offset 8 into a core register */
3117#define shade_blocks_textured_modulated_load_bdm_shaded() \
3118 vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \
3119
3120#define shade_blocks_textured_modulated_load_bdm_unshaded() \
3121 ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
3122 add block_ptr_load_a, block_ptr_load_a, #32 \
3123
/* broadcast the low 16 draw mask bits to every lane of draw_mask */
3124#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3125 vdup.u16 draw_mask, draw_mask_bits[0] \
3126
3127#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3128 vdup.u16 draw_mask, draw_mask_bits_scalar \
3129
3130
/* msb_mask is only OR-ed in when writing straight to the framebuffer */
3131#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3132
3133#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3134 vorr.u16 pixels, pixels, msb_mask \
3136
/*
 * shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 *
 * Builder macro, instantiated below for all eight combinations.
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks it splits the eight 16-bit texels into
 * 5-bit r (bits 0-4), g (bits 5-9) and b (bits 10-14) channels,
 * multiplies each channel by the matching colour byte (vmull, or vmlal
 * onto a preloaded value in the dithered variants), narrows the products
 * with a saturating shift right by 4, and repacks them as
 * (r >> 3) * 1 + (g & ~7) * 4 + (b & ~7) * 128 on top of the texel's
 * bit 15 (plus msb_mask for the direct target).
 *
 * Lanes whose texel is zero are added to the draw mask (zero_mask).
 *
 * The loop is software pipelined: the top of loop 0 finishes packing and
 * storing the previous block while the next block's texels are being
 * loaded; label 1 drains the last block.
 *
 * NOTE(review): q4-q7 (d8-d15) are written here without being saved;
 * the procedure call standard treats d8-d15 as callee-saved - confirm
 * that callers do not rely on them.
 */
3137#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3138.align 3; \
3139 \
3140function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
 /* may branch away to the unmodulated shader before anything is pushed */ \
3867c6ef 3141 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62
E
3142 stmdb sp!, { r4 - r5, lr }; \
3143 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3144 \
3145 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
3146 \
3147 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3148 \
 /* block_ptr_load_a is r0: psx_gpu is no longer available after this */ \
3149 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
3150 mov c_32, #32; \
3151 \
3152 add block_ptr_load_b, block_ptr_load_a, #16; \
3153 vmov.u8 d64_1, #1; \
3154 vmov.u8 d64_4, #4; \
3155 vmov.u8 d64_128, #128; \
3156 \
3157 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3158 vmov.u8 d128_0x07, #0x07; \
3159 \
3160 shade_blocks_textured_modulated_load_rg_##shading(); \
3161 vmov.u8 d128_0x1F, #0x1F; \
3162 \
3163 shade_blocks_textured_modulated_load_bdm_##shading(); \
3164 vmov.u16 d128_0x8000, #0x8000; \
3165 \
3166 vmovn.u16 texels_r, texels; \
3167 vshrn.u16 texels_g, texels, #5; \
3168 \
3169 vshrn.u16 texels_b, texels, #7; \
3170 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3171 \
3172 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3173 vtst.u16 draw_mask, draw_mask, test_mask; \
3174 \
3175 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3176 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3177 \
3178 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3179 vshr.u8 texels_b, texels_b, #3; \
3180 \
3181 shade_blocks_textured_modulate_##dithering(r); \
3182 shade_blocks_textured_modulate_##dithering(g); \
3183 shade_blocks_textured_modulate_##dithering(b); \
3184 \
 /* zero_mask shares q4 with texels, so texels is dead after the vceq */ \
3185 vand.u16 pixels, texels, d128_0x8000; \
3186 vceq.u16 zero_mask, texels, #0; \
3187 \
3188 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3189 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3190 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3191 \
3192 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3193 vorr.u16 draw_mask, draw_mask, zero_mask; \
3194 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3195 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3196 \
 /* a single block skips the loop and is finished at label 1 */ \
3197 subs num_blocks, num_blocks, #1; \
3198 beq 1f; \
3199 \
3200 .align 3; \
3201 \
3202 0: \
3203 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3204 shade_blocks_textured_modulated_load_rg_##shading(); \
3205 vshrn.u16 texels_g, texels, #5; \
3206 \
3207 shade_blocks_textured_modulated_load_bdm_##shading(); \
3208 vshrn.u16 texels_b, texels, #7; \
3209 \
59d15d23 3210 pld [ block_ptr_load_a ]; \
75e28f62
E
3211 vmovn.u16 texels_r, texels; \
 /* finish packing the previous block's pixels */ \
3212 vmlal.u8 pixels, pixels_r_low, d64_1; \
3213 \
3214 vmlal.u8 pixels, pixels_g_low, d64_4; \
3215 vmlal.u8 pixels, pixels_b_low, d64_128; \
3216 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3217 \
3218 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3219 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3220 \
3221 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3222 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3223 \
3224 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3225 vtst.u16 draw_mask, draw_mask, test_mask; \
3226 \
3227 shade_blocks_textured_modulated_store_pixels_##target(); \
3228 vshr.u8 texels_b, texels_b, #3; \
3229 \
3230 shade_blocks_textured_modulate_##dithering(r); \
3231 shade_blocks_textured_modulate_##dithering(g); \
3232 shade_blocks_textured_modulate_##dithering(b); \
3233 \
3234 vand.u16 pixels, texels, d128_0x8000; \
3235 vceq.u16 zero_mask, texels, #0; \
3236 \
 /* flags from this subs feed the bne at the end of the loop; the NEON */ \
 /* instructions in between do not update the condition flags */ \
3237 subs num_blocks, num_blocks, #1; \
3238 \
3239 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3240 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3241 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3242 \
3243 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3244 vorr.u16 draw_mask, draw_mask, zero_mask; \
3245 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3246 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3247 \
3248 bne 0b; \
3249 \
 /* 1: pack and store the final block */ \
3250 1: \
3251 vmlal.u8 pixels, pixels_r_low, d64_1; \
3252 vmlal.u8 pixels, pixels_g_low, d64_4; \
3253 vmlal.u8 pixels, pixels_b_low, d64_128; \
3254 \
3255 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3256 shade_blocks_textured_modulated_store_pixels_##target(); \
3257 \
3258 ldmia sp!, { r4 - r5, pc } \
3259
3260
/* Emit every shading / dithering combination for both targets. */
3261shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3262shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3263shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3264shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3265
3266shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3267shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3268shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3269shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3270
3271
3272#undef c_64
3273#undef fb_ptr
3274#undef color_ptr
3275
3276#undef color_r
3277#undef color_g
3278#undef color_b
3279
3280#undef test_mask
3281#undef pixels
3282#undef draw_mask
3283#undef zero_mask
3284#undef fb_pixels
3285#undef msb_mask
3286#undef msb_mask_low
3287#undef msb_mask_high
3288
3289#define psx_gpu r0
3290#define num_blocks r1
3291#define mask_msb_ptr r2
3292#define color_ptr r3
3293
3294#define block_ptr_load r0
3295#define draw_mask_store_ptr r3
3296#define draw_mask_bits_ptr r12
3297#define draw_mask_ptr r12
3298#define pixel_store_ptr r14
3299
3300#define fb_ptr_cmp r4
3301
3302#define fb_ptr r3
3303#define fb_ptr_next r14
3304
3305#define c_64 r2
3306
3307#define test_mask q0
3308#define pixels q1
3309#define draw_mask q2
3310#define zero_mask q3
3311#define draw_mask_combined q4
3312#define fb_pixels q5
3313#define fb_pixels_next q6
3314#define msb_mask q7
3315
3316#define draw_mask_low d4
3317#define draw_mask_high d5
3318#define msb_mask_low d14
3319#define msb_mask_high d15
3320
3321.align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart in the block buffer):
 *   - the 16 bytes of texels at block offset 0 are copied to offset 16;
 *   - the 16-bit draw mask bits at offset 40 are broadcast, tested
 *     against test_mask, OR-ed with a "texel == 0" mask and the result
 *     is written to offset 0.
 * The draw mask store of a block is issued one iteration late, after
 * its texels have been read; label 1 writes the last one.
 *
 * r14 doubles as pixel_store_ptr, so the return address has to be kept
 * on the stack.  It is pushed properly (sp is moved) rather than parked
 * below sp, where a signal frame could overwrite it.  r4 is not used by
 * this function and is pushed only to keep sp 8-byte aligned.
 *
 * NOTE(review): q4 (draw_mask_combined, d8-d9) is written without being
 * saved; the procedure call standard treats d8-d15 as callee-saved -
 * confirm that callers do not rely on it.
 */
3322function(shade_blocks_textured_unmodulated_indirect)
3323 stmdb sp!, { r4, r14 }
3324 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3325
3326 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3327 add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3328
3329 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3330 add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3331
3332 mov c_64, #64
     /* block_ptr_load is r0: psx_gpu is no longer available after this */
3333 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3334
3335 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3336 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3337 [ draw_mask_bits_ptr, :16 ], c_64
3338 vceq.u16 zero_mask, pixels, #0
3339
3340 vtst.u16 draw_mask, draw_mask, test_mask
3341 vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
3342
3343 subs num_blocks, num_blocks, #1
3344 beq 1f
3345
3346 0:
3347 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
     /* combine the previous block's masks before they are overwritten */
3348 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3349
3350 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3351 [ draw_mask_bits_ptr, :16 ], c_64
3352 vceq.u16 zero_mask, pixels, #0
3353
3354 vtst.u16 draw_mask, draw_mask, test_mask
3355 vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
3356
3357 vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
3358 subs num_blocks, num_blocks, #1
3359
3360 bne 0b
3361
3362 1:
3363 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3364 vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
3365
3366 ldmia sp!, { r4, pc }
3367
3368
3369.align 3
3370
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart in the block buffer) the
 * eight texels at block offset 0 are OR-ed with msb_mask and merged into
 * the framebuffer at the pointer stored at block offset 44.  A lane is
 * written only where the combined mask (draw mask bits at offset 40
 * tested against test_mask, OR "texel == 0") is clear; vbif keeps the
 * framebuffer value elsewhere.
 *
 * The framebuffer read for the next block is normally issued before the
 * store for the current one.  When the two 16-byte areas are within 14
 * bytes of each other (either direction) the order is swapped at label 4
 * so the read sees the data just stored.
 *
 * NOTE(review): the last block (label 1) is merged without OR-ing
 * msb_mask into pixels, unlike the blocks handled in loop 0 - confirm
 * whether that is intended.
 * NOTE(review): q4-q7 (d8-d15) are written without being saved; the
 * procedure call standard treats d8-d15 as callee-saved - confirm that
 * callers do not rely on them.
 */
3371function(shade_blocks_textured_unmodulated_direct)
3372 stmdb sp!, { r4, r14 }
3373 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3374
3375 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3376 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3377
3378 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3379 mov c_64, #64
3380
3381 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
     /* block_ptr_load is r0: psx_gpu is no longer available after this */
3382 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3383
3384 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3385 [ draw_mask_bits_ptr, :16 ], c_64
3386 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3387
3388 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3389 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3390 vceq.u16 zero_mask, pixels, #0
3391 vtst.u16 draw_mask, draw_mask, test_mask
3392
3393 subs num_blocks, num_blocks, #1
3394 beq 1f
3395
3396 0:
3397 mov fb_ptr, fb_ptr_next
3398 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3399
3400 vorr.u16 pixels, pixels, msb_mask
3401
3402 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3403 vmov fb_pixels, fb_pixels_next
3404
3405 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3406 [ draw_mask_bits_ptr, :16 ], c_64
     /* take pixels where the combined mask is clear, keep fb_pixels elsewhere */
3407 vbif.u16 fb_pixels, pixels, draw_mask_combined
3408
     /* unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14 */
75e28f62 3409 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
8438c3c7 3410 pld [ fb_ptr_next, #64 ]
3411
75e28f62 3412 add fb_ptr_cmp, fb_ptr_cmp, #14
8438c3c7 3413 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3414
75e28f62
E
3415 cmp fb_ptr_cmp, #28
3416 bls 4f
3417
     /* no overlap: read the next framebuffer pixels before storing */
3418 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3419 vceq.u16 zero_mask, pixels, #0
3420
3421 vst1.u16 { fb_pixels }, [ fb_ptr ]
3422 vtst.u16 draw_mask, draw_mask, test_mask
3423
3424 3:
3425 subs num_blocks, num_blocks, #1
3426 bne 0b
3427
3428 1:
3429 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3430 vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3431
3432 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3433
3434 ldmia sp!, { r4, pc }
3435
     /* overlap: store first, then read the next framebuffer pixels */
3436 4:
3437 vst1.u16 { fb_pixels }, [ fb_ptr ]
3438 vceq.u16 zero_mask, pixels, #0
3439
3440 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3441 vtst.u16 draw_mask, draw_mask, test_mask
3442
3443 bal 3b
3444
3445
// Intentionally empty: returns to the caller without touching any block.
function(shade_blocks_unshaded_untextured_indirect)
  bx lr
3448
3449.align 3
3450
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  psx_gpu = pointer to the GPU state structure.
 *
 * Writes one fixed 8-pixel colour vector to every queued block.  The vector
 * is read once from the first block (block + 16) and ORed with the halfword
 * at psx_gpu + psx_gpu_mask_msb_offset.  For each block (64-byte stride) the
 * 128-bit draw mask at block + 0 selects the lanes that keep their
 * framebuffer value; the destination address comes from block + 44.
 *
 * At least one block must be queued: the first one is loaded before the
 * count is tested.  Path 4 handles framebuffer spans that lie within 14
 * bytes of each other by storing before the next load.
 */
function(shade_blocks_unshaded_untextured_direct)
  stmdb     sp!, { r4, r14 }
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
  add       color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  add       block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
  vld1.u16  { pixels }, [ color_ptr, :128 ]

  mov       c_64, #64
  vld1.u16  { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  // The colour never changes, so the mask bit is merged in only once.
  vorr.u16  pixels, pixels, msb_mask
  subs      num_blocks, num_blocks, #1

  ldr       fb_ptr_next, [ block_ptr_load ], #64

  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]
  beq       1f

 0:
  vmov      fb_pixels, fb_pixels_next
  mov       fb_ptr, fb_ptr_next
  ldr       fb_ptr_next, [ block_ptr_load ], #64

  // BIF: take the colour only where the draw mask bit is clear.
  vbif.u16  fb_pixels, pixels, draw_mask
  vld1.u16  { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  // Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes.
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr
  add       fb_ptr_cmp, fb_ptr_cmp, #14
  cmp       fb_ptr_cmp, #28
  bls       4f

  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]
  vst1.u16  { fb_pixels }, [ fb_ptr ]

 3:
  subs      num_blocks, num_blocks, #1
  bne       0b

 1:
  vbif.u16  fb_pixels_next, pixels, draw_mask
  vst1.u16  { fb_pixels_next }, [ fb_ptr_next ]

  ldmia     sp!, { r4, pc }

 4:
  // Overlapping spans: the store has to land before the next load.
  vst1.u16  { fb_pixels }, [ fb_ptr ]
  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]
  b         3b
3505
3506
/*
 * Register aliases for the blend_blocks_* routines that follow.
 * Several names deliberately share a register; the routines rely on the
 * earlier value being dead before the later name is written.
 */

#undef draw_mask_ptr
#undef c_64
#undef fb_ptr
#undef fb_ptr_next
#undef fb_ptr_cmp

/* Core (integer) registers. */
#define psx_gpu                 r0
#define num_blocks              r1
#define msb_mask_ptr            r2
#define pixel_ptr               r3
#define draw_mask_ptr           r0
#define c_64                    r2
#define fb_ptr                  r12
#define fb_ptr_next             r14
#define fb_ptr_cmp              r4

#undef msb_mask
#undef draw_mask
#undef pixels
#undef fb_pixels
#undef d128_0x8000
#undef msb_mask_low
#undef msb_mask_high
#undef draw_mask_next
#undef pixels_g
#undef blend_pixels
#undef fb_pixels_next

/* NEON quad registers: names used by the average blend. */
#define msb_mask                q0
#define draw_mask               q1
#define pixels                  q2
#define fb_pixels               q3
#define blend_pixels            q4
#define pixels_no_msb           q5
#define blend_mask              q6
#define fb_pixels_no_msb        q7
#define d128_0x8000             q8
#define d128_0x0421             q9
#define fb_pixels_next          q10
#define blend_pixels_next       q11
#define pixels_next             q12
#define draw_mask_next          q13
#define write_mask              q14

/* NEON quad registers: names used by the add / subtract / add-fourth blends
   (they overlay the registers listed above). */
#define pixels_rb               q5
#define pixels_mg               q7
#define pixels_g                q7
#define d128_0x7C1F             q8
#define d128_0x03E0             q9
#define fb_pixels_rb            q10
#define fb_pixels_g             q11
#define fb_pixels_masked        q11
#define d128_0x83E0             q15
#define pixels_fourth           q7
#define d128_0x1C07             q12
#define d128_0x00E0             q13
#define d128_0x80E0             q13

/* The two halves of msb_mask (q0), needed by the broadcasting load. */
#define msb_mask_low            d0
#define msb_mask_high           d1
3567
/*
 * Fragments pasted into blend_blocks_average_builder via token pasting on
 * its (texturing, mask_evaluate) arguments.  An empty definition means that
 * variant needs no instruction at that point.
 */

/* textured: lanes whose source pixel has bit 15 set are the blended ones;
   the rest take the source pixel unchanged. */
#define blend_blocks_average_set_blend_mask_textured(source) \
  vclt.s16  blend_mask, source, #0

#define blend_blocks_average_set_stp_bit_textured() \
  vorr.u16  blend_pixels, #0x8000

#define blend_blocks_average_combine_textured(source) \
  vbif.u16  blend_pixels, source, blend_mask

/* untextured: every lane is blended, nothing to select. */
#define blend_blocks_average_set_blend_mask_untextured(source)
#define blend_blocks_average_set_stp_bit_untextured()
#define blend_blocks_average_combine_untextured(source)

/* mask_evaluate = on: lanes whose framebuffer pixel has bit 15 set are added
   to the draw mask. */
#define blend_blocks_average_mask_set_on() \
  vclt.s16  write_mask, fb_pixels_next, #0

#define blend_blocks_average_mask_copy_on() \
  vorr.u16  draw_mask, draw_mask_next, write_mask

#define blend_blocks_average_mask_copy_b_on() \
  vorr.u16  draw_mask_next, draw_mask_next, write_mask

/* mask_evaluate = off: the stored draw mask is used as is. */
#define blend_blocks_average_mask_set_off()

#define blend_blocks_average_mask_copy_off() \
  vmov      draw_mask, draw_mask_next

#define blend_blocks_average_mask_copy_b_off()

/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride from psx_gpu + psx_gpu_blocks_offset):
 *   block +  0 : 128-bit draw mask (set lane = keep framebuffer value)
 *   block + 16 : eight 16-bit source pixels
 *   block + 44 : framebuffer address of the eight destination pixels
 *
 * Blend per lane:
 *   ((fb & 0x7FFF) + (src & 0x7FFF) - ((src ^ fb) & 0x0421)) >> 1
 * Subtracting the XOR of the lowest bit of each 5-bit field makes every
 * field sum even, so the halving shift cannot leak a bit into the
 * neighbouring field.  The result is ORed with the mask-msb halfword.
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other by storing block N before block N+1 is
 * read.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  vmov.u16  d128_0x8000, #0x8000; \
  vld1.u32  { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  /* 0x0421 cannot be encoded as one immediate: build it in two steps. */ \
  vmov.u16  d128_0x0421, #0x0400; \
  vld1.u32  { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16  d128_0x0421, #0x0021; \
  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vmov      pixels, pixels_next; \
  vld1.u32  { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32  { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov      fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]; \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16  fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16  { fb_pixels }, [ fb_ptr ]; \
 \
 3: \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbif.u16  fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16  { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbif.u16  fb_pixels, blend_pixels, draw_mask; \
  vst1.u16  { fb_pixels }, [ fb_ptr ]; \
 \
  vld1.u16  { fb_pixels_next }, [ fb_ptr_next ]; \
  /* Refresh write_mask from the pixels just read, as the non-overlapping */ \
  /* path does; without this the next block reuses a stale mask.          */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  b         3b

blend_blocks_average_builder(textured, off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured, on)
blend_blocks_average_builder(untextured, on)
3705
3706
/*
 * Mask fragments shared by the add and add-fourth builders.
 * on : lanes whose framebuffer pixel has bit 15 set are added to draw_mask.
 * off: draw_mask is used exactly as loaded.
 */
#define blend_blocks_add_mask_set_on() \
  vclt.s16  write_mask, fb_pixels, #0

#define blend_blocks_add_mask_copy_on() \
  vorr.u16  draw_mask, draw_mask, write_mask

#define blend_blocks_add_mask_set_off()
#define blend_blocks_add_mask_copy_off()

3717
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride): draw mask at block + 0, eight source pixels
 * at block + 16, framebuffer address at block + 44.
 *
 * Lanes whose source pixel has bit 15 set are blended: the framebuffer
 * pixel is split into red/blue (0x7C1F) and green (0x03E0) fields, the
 * source fields are added and each field is clamped.  For the other lanes
 * the framebuffer contribution is masked to zero, so the source pixel passes
 * through.  The source is ORed with the mask-msb halfword first, and bit 15
 * travels with the green term (mask 0x83E0).
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other.
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Field masks are assembled from encodable immediates. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vmov.u16  d128_0x83E0, #0x8000; \
  vorr.u16  d128_0x03E0, #0x00E0; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16  pixels, pixels, msb_mask; \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16  pixels_mg, pixels, d128_0x83E0; \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_mg; \
  /* Red and blue sit in different bytes, so a byte-wise min clamps both. */ \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  mov       fb_ptr, fb_ptr_next; \
 \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
 \
  vorr.u16  pixels, pixels, msb_mask; \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16  pixels_mg, pixels, d128_0x83E0; \
 \
  /* One subtraction is enough: neither pointer changes before the cmp. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld       [ fb_ptr_next, #64 ]; \
 \
  /* BIT: lanes with the draw mask set get the old framebuffer value back. */ \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16  fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  b         3b

3821
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride): draw mask at block + 0, eight source pixels
 * at block + 16, framebuffer address at block + 44.  Every lane is blended:
 * the red/blue (0x7C1F) and green (0x03E0) fields of source and framebuffer
 * are added and clamped per field, then the mask-msb halfword is ORed in.
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other.
 */
#define blend_blocks_add_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Field masks are assembled from encodable immediates. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
 \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels, d128_0x03E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  /* Red and blue sit in different bytes, so a byte-wise min clamps both. */ \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  mov       fb_ptr, fb_ptr_next; \
 \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vand.u16  pixels_g, pixels, d128_0x03E0; \
 \
  /* BIT: lanes with the draw mask set get the old framebuffer value back. */ \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b         3b

3914
3915blend_blocks_add_textured_builder(off)
3916blend_blocks_add_textured_builder(on)
3917blend_blocks_add_untextured_builder(off)
3918blend_blocks_add_untextured_builder(on)
3919
/*
 * Fragments pasted into blend_blocks_subtract_builder via token pasting on
 * its (texturing, mask_evaluate) arguments.  An empty definition means that
 * variant needs no instruction at that point.
 */

/* textured: lanes whose source pixel has bit 15 set are blended (and get
   bit 15 forced on); the others receive the source pixel ORed with the
   mask-msb halfword. */
#define blend_blocks_subtract_set_blend_mask_textured() \
  vclt.s16  blend_mask, pixels_next, #0

#define blend_blocks_subtract_combine_textured() \
  vbif.u16  blend_pixels, pixels, blend_mask

#define blend_blocks_subtract_set_stb_textured() \
  vorr.u16  blend_pixels, #0x8000

#define blend_blocks_subtract_msb_mask_textured() \
  vorr.u16  pixels, pixels_next, msb_mask

/* untextured: every lane is blended; only the mask-msb halfword is ORed in. */
#define blend_blocks_subtract_set_blend_mask_untextured()
#define blend_blocks_subtract_combine_untextured()

#define blend_blocks_subtract_set_stb_untextured() \
  vorr.u16  blend_pixels, blend_pixels, msb_mask

#define blend_blocks_subtract_msb_mask_untextured()


/* mask_evaluate = on: lanes whose framebuffer pixel has bit 15 set are added
   to the draw mask; off: the stored draw mask is used as is. */
#define blend_blocks_subtract_mask_set_on() \
  vclt.s16  write_mask, fb_pixels, #0

#define blend_blocks_subtract_mask_copy_on() \
  vorr.u16  draw_mask, draw_mask_next, write_mask

#define blend_blocks_subtract_mask_set_off()

#define blend_blocks_subtract_mask_copy_off() \
  vmov      draw_mask, draw_mask_next


3953
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride): draw mask at block + 0, eight source pixels
 * at block + 16, framebuffer address at block + 44.  The red/blue (0x7C1F)
 * and green (0x03E0) fields of the source are subtracted from those of the
 * framebuffer pixel with unsigned saturation, so each field bottoms out at
 * zero.
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other.
 */
#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Field masks are assembled from encodable immediates. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
 \
  vld1.u32  { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32  { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels_next, d128_0x7C1F; \
 \
  vand.u16  pixels_g, pixels_next, d128_0x03E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  /* Red and blue sit in different bytes: byte-wise saturation covers both. */ \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32  { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  /* Must run before pixels_next is reloaded: it snapshots block N. */ \
  blend_blocks_subtract_msb_mask_##texturing(); \
 \
  vld1.u32  { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16  pixels_rb, pixels_next, d128_0x7C1F; \
  blend_blocks_subtract_set_stb_##texturing(); \
  vand.u16  pixels_g, pixels_next, d128_0x03E0; \
  /* combine still uses the blend mask of block N; only then is the mask */ \
  /* recomputed for block N+1.                                           */ \
  blend_blocks_subtract_combine_##texturing(); \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
 3: \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
 \
  blend_blocks_subtract_msb_mask_##texturing(); \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  blend_blocks_subtract_set_stb_##texturing(); \
  blend_blocks_subtract_combine_##texturing(); \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  b         3b


blend_blocks_subtract_builder(textured, off)
blend_blocks_subtract_builder(textured, on)
blend_blocks_subtract_builder(untextured, off)
blend_blocks_subtract_builder(untextured, on)
4051
4052
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride): draw mask at block + 0, eight source pixels
 * at block + 16, framebuffer address at block + 44.
 *
 * The source pixel is shifted right by two and masked with 0x1C07 / 0x00E0,
 * which leaves a quarter of each colour field in place.  That quarter is
 * added to the framebuffer fields and clamped per field.  Lanes whose source
 * pixel has bit 15 clear are not blended and receive the source pixel
 * instead.  Every written lane is ORed with the mask-msb halfword.
 *
 * pixels_fourth and pixels_g share a register, so pixels_rb has to be
 * extracted before pixels_g overwrites the shifted value.
 *
 * NOTE(review): unlike the other textured blend modes in this file, blended
 * lanes do not get bit 15 forced on here - confirm this is intended.
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other.
 */
#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Field masks are assembled from encodable immediates. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vmov.u16  d128_0x1C07, #0x1C00; \
  vmov.u16  d128_0x00E0, #0x00E0; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
  vorr.u16  d128_0x1C07, #0x0007; \
 \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16  pixels_fourth, pixels, #2; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  /* Red and blue sit in different bytes, so a byte-wise min clamps both. */ \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  /* BIF: unblended lanes take the source pixel of block N. */ \
  vbif.u16  blend_pixels, pixels, blend_mask; \
 \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vshr.s16  pixels_fourth, pixels, #2; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  /* BIT: lanes with the draw mask set get the old framebuffer value back. */ \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  /* Same order as the loop body: re-insert the unblended source pixels  */ \
  /* first and OR the mask bit in afterwards, so those lanes get it too. */ \
  vbif.u16  blend_pixels, pixels, blend_mask; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b         3b

4153
d1c75d1e 4154
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>.
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Per block (64-byte stride): draw mask at block + 0, eight source pixels
 * at block + 16, framebuffer address at block + 44.  The source pixel is
 * shifted right by two and masked with 0x1C07 / 0x00E0 (a quarter of each
 * colour field), added to the framebuffer fields and clamped per field; the
 * mask-msb halfword is then ORed in.  Every lane is blended.
 *
 * pixels_fourth and pixels_g share a register, so pixels_rb has to be
 * extracted before pixels_g overwrites the shifted value.
 *
 * At least one block must be queued.  Path 2 handles framebuffer spans that
 * lie within 14 bytes of each other.
 */
#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Field masks are assembled from encodable immediates. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vmov.u16  d128_0x1C07, #0x1C00; \
  vmov.u16  d128_0x00E0, #0x00E0; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
  vorr.u16  d128_0x1C07, #0x0007; \
 \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16  pixels_fourth, pixels, #2; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  /* Red and blue sit in different bytes, so a byte-wise min clamps both. */ \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
 0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vshr.s16  pixels_fourth, pixels, #2; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  /* BIT: lanes with the draw mask set get the old framebuffer value back. */ \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
 1: \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: commit block N before sampling block N+1. */ \
  vst1.u16  { blend_pixels }, [ fb_ptr ]; \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16  { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b         3b


blend_blocks_add_fourth_textured_builder(off)
blend_blocks_add_fourth_textured_builder(on)
blend_blocks_add_fourth_untextured_builder(off)
blend_blocks_add_fourth_untextured_builder(on)
4256
4257// TODO: Optimize this more. Need a scene that actually uses it for
4258// confirmation..
4259
4260.align 3
4261
/*
 * blend_blocks_textured_unblended_on
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.  r0 is reused as
 *      draw_mask_ptr once the last psx_gpu-relative address has been formed.
 *
 * Copies the eight source pixels of every queued block (block + 16, 64-byte
 * stride) to the framebuffer address stored at block + 44, ORed with the
 * mask-msb halfword.  A lane is left untouched when its draw-mask bit
 * (block + 0) is set or when the framebuffer pixel already has bit 15 set.
 *
 * No pipelining across blocks: each block is stored before the next
 * framebuffer load.  At least one block must be queued.
 */
function(blend_blocks_textured_unblended_on)
  stmdb     sp!, { r4, r14 }
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh      num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov       c_64, #64

  ldr       fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16  { fb_pixels }, [ fb_ptr ]
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16  write_mask, fb_pixels, #0
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64

  subs      num_blocks, num_blocks, #1
  beq       1f

 0:
  vorr.u16  pixels, pixels, msb_mask
  vorr.u16  draw_mask, draw_mask, write_mask
  // BIF: take the source pixel only where the combined mask bit is clear.
  vbif.u16  fb_pixels, pixels, draw_mask
  vst1.u16  { fb_pixels }, [ fb_ptr ]

  ldr       fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16  { fb_pixels }, [ fb_ptr ]
  vld1.u32  { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16  write_mask, fb_pixels, #0
  vld1.u32  { pixels }, [ pixel_ptr, :128 ], c_64

  subs      num_blocks, num_blocks, #1
  bne       0b

 1:
  vorr.u16  pixels, pixels, msb_mask
  vorr.u16  draw_mask, draw_mask, write_mask
  vbif.u16  fb_pixels, pixels, draw_mask
  vst1.u16  { fb_pixels }, [ fb_ptr ]

  ldmia     sp!, { r4, pc }
4304
4305
// Intentionally empty: returns to the caller without touching any block.
function(blend_blocks_textured_unblended_off)
  bx lr
4308
4309
/*
 * warmup
 *
 * In:  r0 = number of loads to perform (returns at once when zero)
 *      r1 = start address; the :128 qualifier requires 16-byte alignment
 *
 * Performs r0 NEON loads, advancing r1 by 64 bytes after each one.  The
 * loaded values are discarded; u_whole_8 / v_whole_8 are used only as
 * scratch destinations.
 */
function(warmup)
  mov       r3, #64
  cmp       r0, #0
  bxeq      lr

 0:
  vld1.u32  { u_whole_8, v_whole_8 }, [ r1, :128 ], r3

  subs      r0, r0, #1
  bne       0b

  bx        lr
4322
#undef vram_ptr
#undef color
#undef width
#undef height
#undef pitch

#define vram_ptr                r0
#define color                   r1
#define width                   r2
#define height                  r3

/* pitch reuses the register of color; color is consumed by the vdup first. */
#define pitch                   r1

#define num_width               r12

#undef colors_a
#undef colors_b

#define colors_a                q0
#define colors_b                q1

.align 3

/*
 * render_block_fill_body
 *
 * In:  vram_ptr (r0) = first destination halfword; the :256 qualifier
 *                      requires 32-byte alignment
 *      color    (r1) = 16-bit fill value (low halfword is broadcast)
 *      width    (r2) = pixels per row
 *      height   (r3) = number of rows
 *
 * Each store writes 16 halfwords and the row counter drops by 16 until it
 * reaches exactly zero, so width has to be a positive multiple of 16 and
 * height has to be non-zero.  After a row, vram_ptr is advanced by
 * 2048 - 2 * width bytes, which makes rows 2048 bytes apart.
 */
function(render_block_fill_body)
  vdup.u16  colors_a, color
  mov       pitch, #2048

  vmov      colors_b, colors_a
  sub       pitch, pitch, width, lsl #1

  mov       num_width, width

 0:
  vst1.u32  { colors_a, colors_b }, [ vram_ptr, :256 ]!

  subs      num_width, num_width, #16
  bne       0b

  // End of row: skip the remainder of the 2048-byte line, reset the counter.
  add       vram_ptr, vram_ptr, pitch
  mov       num_width, width

  subs      height, height, #1
  bne       0b

  bx        lr
4368
75e28f62
E
4369
/*
 * Register aliases for the setup_sprite_* code below.
 * Several names intentionally share one physical register (for example
 * x / texture_offset_base on r1, v / num_blocks / left_block_mask /
 * dirty_textures_mask on r4); the code relies on the earlier value being
 * dead before the later alias is written.
 */
4370#undef x
4371#undef y
4372#undef width
4373#undef height
4374#undef fb_ptr
4375#undef texture_mask
4376#undef num_blocks
4377#undef temp
4378#undef dirty_textures_mask
4379#undef clut_ptr
4380#undef current_texture_mask
4381

/* Argument / geometry phase (function entry). */
4382#define psx_gpu r0
4383#define x r1
4384#define y r2
4385#define u r3
4386#define v r4
4387#define width r5
4388#define height r6
4389#define offset_u r8
4390#define offset_v r9
4391#define offset_u_right r10
4392#define width_rounded r11
4393#define height_rounded r12
4394

/* Tile / column loop phase. */
4395#define texture_offset_base r1
4396#define tile_width r2
4397#define tile_height r3
4398#define num_blocks r4
4399#define block r5
4400#define sub_tile_height r6
4401#define fb_ptr r7
4402#define texture_mask r8
4403#define column_data r9
4404#define texture_offset r10
4405#define tiles_remaining r11
4406#define fb_ptr_advance_column r12
4407#define texture_block_ptr r14
4408

/* Temporaries used while the masks and the dispatch index are computed. */
4409#define texture_page_ptr r3
4410#define left_block_mask r4
4411#define right_block_mask r5
4412#define texture_mask_rev r10
4413#define control_mask r11
4414

/* Temporaries used by the setup_sprite_tiled_initialize_* macros. */
4415#define dirty_textures_mask r4
4416#define clut_ptr r5
4417#define current_texture_mask r6
4418
4419
4420#undef texels
4421#undef clut_low_a
4422#undef clut_low_b
4423#undef clut_high_a
4424#undef clut_high_b
4425#undef clut_a
4426#undef clut_b
4427#undef texels_low
4428#undef texels_high
4429

/* NEON aliases. draw_masks_fb_ptrs (q1) is the d2/d3 pair. */
4430#define texels d0
4431#define draw_masks_fb_ptrs q1
4432
4433#define draw_mask_fb_ptr_left d2
4434#define draw_mask_fb_ptr_right d3
4435

/* 4x variants: the left pair reuses d2/d3, the right pair is d10/d11 (q5). */
59d15d23 4436#define draw_mask_fb_ptr_left_a d2
4437#define draw_mask_fb_ptr_left_b d3
4438#define draw_mask_fb_ptr_right_a d10
4439#define draw_mask_fb_ptr_right_b d11
4440#define draw_masks_fb_ptrs2 q5
4441
75e28f62
E
4442#define clut_low_a d4
4443#define clut_low_b d5
4444#define clut_high_a d6
4445#define clut_high_b d7
4446
4447#define block_masks d8
4448#define block_masks_shifted d9
4449

/* clut_a = { clut_low_a, clut_low_b }, clut_b = { clut_high_a, clut_high_b } */
4450#define clut_a q2
4451#define clut_b q3
4452
59d15d23 4453#define texels_low d12
4454#define texels_high d13
75e28f62 4455

/* texels_wide (q7) is the d14/d15 pair. */
59d15d23 4456#define texels_wide_low d14
4457#define texels_wide_high d15
4458#define texels_wide q7
75e28f62
E
4459
4460
/*
 * setup_sprite_flush_blocks: calls flush_render_block_buffer while keeping
 * the state of the sprite setup loops alive.
 * q1 - q5 and r0 - r3, r12, r14 are saved around the call and restored
 * afterwards; `block` is then reset to the first entry of the block buffer
 * (psx_gpu + psx_gpu_blocks_offset) before returning through lr.
 * NOTE(review): 80 + 24 = 104 bytes are pushed before the bl. Together with
 * the 36-byte frame created by the setup_sprite_* entry points that reach
 * this helper, sp would not be 8-byte aligned at the call if it was aligned
 * on entry - confirm whether flush_render_block_buffer depends on that.
 */
59d15d23 4461setup_sprite_flush_blocks:
4462 vpush { q1 - q5 }
75e28f62
E
4463
4464 stmdb sp!, { r0 - r3, r12, r14 }
4465 bl flush_render_block_buffer
4466 ldmia sp!, { r0 - r3, r12, r14 }
4467
59d15d23 4468 vpop { q1 - q5 }
75e28f62
E
4469
4470 add block, psx_gpu, #psx_gpu_blocks_offset
75e28f62
E
4471 bx lr
4472
4473
/*
 * Calls update_texture_4bpp_cache with r0 - r3 preserved. The saved lr is
 * popped straight into pc, so the ldmia also performs the return.
 */
4474setup_sprite_update_texture_4bpp_cache:
4475 stmdb sp!, { r0 - r3, r14 }
4476 bl update_texture_4bpp_cache
4477 ldmia sp!, { r0 - r3, pc }
4478
4479
/*
 * Calls update_texture_8bpp_cache with r0 - r3 preserved. The saved lr is
 * popped straight into pc, so the ldmia also performs the return.
 */
4480setup_sprite_update_texture_8bpp_cache:
4481 stmdb sp!, { r0 - r3, r14 }
4482 bl update_texture_8bpp_cache
4483 ldmia sp!, { r0 - r3, pc }
4484
4485
/*
 * 4bpp prologue:
 *  - loads 32 bytes from clut_ptr into clut_a/clut_b and de-interleaves them
 *    bytewise (vuzp.u8), leaving the even bytes in clut_a (d4/d5) and the odd
 *    bytes in clut_b (d6/d7) as the two vtbl lookup tables;
 *  - calls setup_sprite_update_texture_4bpp_cache when current_texture_mask
 *    and the 4bpp dirty mask have a bit in common.
 * The NEON instructions do not touch the flags, so blne still sees the
 * result of the tst.
 */
4486#define setup_sprite_tiled_initialize_4bpp() \
4487 ldr dirty_textures_mask, \
4488 [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
4489 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4490 \
4491 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4492 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4493 \
4494 tst current_texture_mask, dirty_textures_mask; \
4495 vuzp.u8 clut_a, clut_b; \
4496 \
4497 blne setup_sprite_update_texture_4bpp_cache \
4498
/*
 * 8bpp prologue: calls setup_sprite_update_texture_8bpp_cache when
 * current_texture_mask and the 8bpp dirty mask have a bit in common.
 * No CLUT is loaded here.
 */
4499#define setup_sprite_tiled_initialize_8bpp() \
4500 ldr dirty_textures_mask, \
4501 [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
4502 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4503 \
4504 tst current_texture_mask, dirty_textures_mask; \
4505 blne setup_sprite_update_texture_8bpp_cache \
4506
4507
75e28f62
E
/*
 * setup_sprite_block_count_<type>() expands to the flexible second operand
 * that gives the number of blocks a tile emits: sub_tile_height for a half
 * (one block wide) tile, sub_tile_height * 2 for a full tile.
 */
4508#define setup_sprite_block_count_single() \
4509 sub_tile_height \
4510
4511#define setup_sprite_block_count_double() \
4512 sub_tile_height, lsl #1 \
4513

/*
 * Reserves room for the next tile: num_blocks += tile block count. If the
 * sum is greater than MAX_BLOCKS, num_blocks is restarted at just this
 * tile's count and setup_sprite_flush_blocks is called (which also rewinds
 * `block`). movgt leaves the flags alone, so blgt tests the same compare.
 */
4514#define setup_sprite_tile_add_blocks(type) \
4515 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4516 cmp num_blocks, #MAX_BLOCKS; \
4517 \
59d15d23 4518 movgt num_blocks, setup_sprite_block_count_##type(); \
4519 blgt setup_sprite_flush_blocks \
75e28f62
E
4520
4521
/*
 * Emits one tile of sub_tile_height rows, two blocks (left and right) per
 * row, for 4bpp textures. Per block:
 *  - 8 texel bytes are read at texture_page_ptr +
 *    (texture_offset [+ 8 for the right block]) & texture_mask;
 *  - vtbl looks each byte up in the clut_low / clut_high tables and vst2
 *    interleaves the two result bytes, storing 16 bytes at block + 0;
 *  - the draw-mask register, with fb_ptr inserted in its upper word, is
 *    stored at block + 40;
 *  - block advances by 64 in total (40 + 24).
 * Per row fb_ptr moves by 16 and then 2048 - 16 bytes (net 2048) and
 * texture_offset by 0x10; after the last row texture_offset gets another
 * 0xF00 and num_blocks is written back to psx_gpu.
 * The edge argument is unused. Uses local label 4; sub_tile_height is
 * counted down to zero and must be non-zero on entry.
 */
4522#define setup_sprite_tile_full_4bpp(edge) \
4523 setup_sprite_tile_add_blocks(double); \
4524 \
4525 4: \
4526 and texture_block_ptr, texture_offset, texture_mask; \
4527 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4528 \
4529 pld [ fb_ptr ]; \
4530 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4531 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4532 \
4533 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4534 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4535 \
4536 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4537 add texture_block_ptr, texture_offset, #8; \
4538 \
4539 and texture_block_ptr, texture_block_ptr, texture_mask; \
4540 add block, block, #40; \
4541 \
4542 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4543 add fb_ptr, fb_ptr, #16; \
4544 \
4545 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4546 add block, block, #24; \
4547 \
4548 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4549 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4550 \
4551 pld [ fb_ptr ]; \
4552 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4553 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4554 \
4555 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4556 add block, block, #40; \
4557 \
4558 add texture_offset, texture_offset, #0x10; \
4559 add fb_ptr, fb_ptr, #(2048 - 16); \
4560 \
4561 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4562 add block, block, #24; \
4563 \
4564 subs sub_tile_height, sub_tile_height, #1; \
4565 bne 4b; \
4566 \
4567 add texture_offset, texture_offset, #0xF00; \
4568 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4569
4570
/*
 * Emits one tile of sub_tile_height rows, a single block per row, for 4bpp
 * textures. `edge` (left / right) selects which draw-mask register
 * (draw_mask_fb_ptr_<edge>) is stored with the block. Per row:
 *  - 8 texel bytes are read at texture_page_ptr +
 *    (texture_offset & texture_mask);
 *  - vtbl looks each byte up in the clut_low / clut_high tables and vst2
 *    interleaves the two result bytes, storing 16 bytes at block + 0;
 *  - the draw-mask register, with fb_ptr inserted in its upper word, is
 *    stored at block + 40; block advances by 64 in total;
 *  - texture_offset += 0x10, fb_ptr += 2048.
 * After the last row texture_offset gets another 0xF00 and num_blocks is
 * written back to psx_gpu. Uses local label 4; sub_tile_height must be
 * non-zero on entry.
 *
 * A second "add texture_block_ptr, texture_page_ptr, texture_block_ptr"
 * that used to follow the vst2 has been dropped: texture_block_ptr is
 * recomputed at the top of every iteration and is not read in between, so
 * the instruction had no effect.
 */
4571#define setup_sprite_tile_half_4bpp(edge) \
4572 setup_sprite_tile_add_blocks(single); \
4573 \
4574 4: \
4575 and texture_block_ptr, texture_offset, texture_mask; \
4576 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4577 \
4578 pld [ fb_ptr ]; \
4579 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4580 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4581 \
4582 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4583 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4584 \
4585 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4586 add block, block, #40; \
4587 \
4589 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4590 \
4591 add block, block, #24; \
4592 add texture_offset, texture_offset, #0x10; \
4593 \
4594 add fb_ptr, fb_ptr, #2048; \
4595 subs sub_tile_height, sub_tile_height, #1; \
4596 \
4597 bne 4b; \
4598 \
4599 add texture_offset, texture_offset, #0xF00; \
4600 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4601
4602
/*
 * Emits one tile of sub_tile_height rows, two blocks per row, for 8bpp
 * textures. No palette lookup happens here: the 8 raw texel bytes are
 * copied into the block.
 * `block` is biased by +16 for the duration of the loop, so per block the
 * texel bytes land at block + 16 and the draw-mask / fb_ptr pair at
 * block + 40 (relative to the unbiased block start); each block still
 * advances the pointer by 64 (24 + 40). The bias is removed again after
 * the loop.
 * Per row fb_ptr moves by 16 and then 2048 - 16 bytes and texture_offset by
 * 0x10; after the last row texture_offset gets another 0xF00 and
 * num_blocks is written back to psx_gpu. The edge argument is unused.
 */
4603#define setup_sprite_tile_full_8bpp(edge) \
4604 setup_sprite_tile_add_blocks(double); \
4605 add block, block, #16; \
4606 \
4607 4: \
4608 and texture_block_ptr, texture_offset, texture_mask; \
4609 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4610 \
4611 pld [ fb_ptr ]; \
4612 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4613 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4614 \
4615 add texture_block_ptr, texture_offset, #8; \
4616 vst1.u32 { texels }, [ block, :64 ]; \
4617 \
4618 and texture_block_ptr, texture_block_ptr, texture_mask; \
4619 add block, block, #24; \
4620 \
4621 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4622 \
4623 add fb_ptr, fb_ptr, #16; \
4624 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4625 \
4626 add block, block, #40; \
4627 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4628 pld [ fb_ptr ]; \
4629 \
4630 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4631 vst1.u32 { texels }, [ block, :64 ]; \
4632 add block, block, #24; \
4633 \
4634 add texture_offset, texture_offset, #0x10; \
4635 add fb_ptr, fb_ptr, #(2048 - 16); \
4636 \
4637 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4638 add block, block, #40; \
4639 \
4640 subs sub_tile_height, sub_tile_height, #1; \
4641 bne 4b; \
4642 \
4643 sub block, block, #16; \
4644 add texture_offset, texture_offset, #0xF00; \
4645 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4646
4647
/*
 * Emits one tile of sub_tile_height rows, a single block per row, for 8bpp
 * textures. `edge` (left / right) selects the draw-mask register stored
 * with the block. `block` is biased by +16 during the loop: the 8 raw texel
 * bytes go to block + 16 and the draw-mask / fb_ptr pair to block + 40
 * (relative to the unbiased block start); the pointer advances 64 per row.
 * Per row texture_offset += 0x10 and fb_ptr += 2048; after the last row the
 * bias is removed, texture_offset gets another 0xF00 and num_blocks is
 * written back to psx_gpu.
 */
4648#define setup_sprite_tile_half_8bpp(edge) \
4649 setup_sprite_tile_add_blocks(single); \
4650 add block, block, #16; \
4651 \
4652 4: \
4653 and texture_block_ptr, texture_offset, texture_mask; \
4654 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4655 pld [ fb_ptr ]; \
4656 \
4657 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4658 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4659 \
4660 vst1.u32 { texels }, [ block, :64 ]; \
4661 add block, block, #24; \
4662 \
4663 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4664 add block, block, #40; \
4665 \
4666 add texture_offset, texture_offset, #0x10; \
4667 add fb_ptr, fb_ptr, #2048; \
4668 \
4669 subs sub_tile_height, sub_tile_height, #1; \
4670 bne 4b; \
4671 \
4672 sub block, block, #16; \
4673 add texture_offset, texture_offset, #0xF00; \
4674 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4675
4676
/*
 * Per-column fix-ups applied before and after the tiles of one column.
 * Every pre_adjust variant initialises texture_offset from
 * texture_offset_base. When only the right block of a column is drawn
 * (half / right), the texture offset additionally starts 8 texel bytes in
 * and fb_ptr is moved 16 bytes (8 pixels) to the right for the duration of
 * the column; post_adjust undoes the fb_ptr move. The half / left and full
 * post_adjust variants expand to nothing.
 */
4677#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4678 add texture_offset, texture_offset_base, #8; \
4679 add fb_ptr, fb_ptr, #16 \
4680
4681#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4682 mov texture_offset, texture_offset_base \
4683
4684#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4685 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4686
4687#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4688 mov texture_offset, texture_offset_base \
4689
4690#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4691 sub fb_ptr, fb_ptr, #16 \
4692
4693#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4694
4695#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4696 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4697
4698#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4699
4700
/*
 * Emit all tiles of one column.
 *   edge_mode    = full | half   (selects the tile macro)
 *   edge         = left | right | none (forwarded to the tile macro)
 *   texture_mode = 4bpp | 8bpp
 *   x4mode       = empty or _4x  (pasted onto the macro names)
 *
 * single: the column is one tile; column_data holds its row count.
 *
 * multi: column_data is packed as
 *   bits  0-7   rows in the first tile
 *   bits  8-15  rows in the last tile
 *   bits 16+    number of tiles following the first one
 * The first tile is emitted, then (count - 1) tiles of 16 rows in the loop
 * at label 3, then the last tile at label 2.
 */
59d15d23 4701#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
4702 x4mode) \
75e28f62 4703 mov sub_tile_height, column_data; \
59d15d23 4704 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4705 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4706 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62 4707
59d15d23 4708#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
4709 x4mode) \
75e28f62
E
4710 and sub_tile_height, column_data, #0xFF; \
4711 mov tiles_remaining, column_data, lsr #16; \
59d15d23 4712 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4713 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4714 \
4715 subs tiles_remaining, tiles_remaining, #1; \
4716 beq 2f; \
4717 \
4718 3: \
4719 mov sub_tile_height, #16; \
59d15d23 4720 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4721 subs tiles_remaining, tiles_remaining, #1; \
4722 bne 3b; \
4723 \
4724 2: \
4725 uxtb sub_tile_height, column_data, ror #8; \
59d15d23 4726 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4727 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62
E
4728
4729
/*
 * Build column_data for the column_height macros and load
 * texture_page_ptr from psx_gpu.
 *
 * single: column_data = height.
 *
 * multi:  column_data =   (16 - offset_v)                   rows, first tile
 *                       | ((height_rounded & 0xF) + 1) << 8  rows, last tile
 *                       | (tile_height - 1) << 16            tiles after first
 * height_rounded and tile_height are modified in the process.
 */
4730#define setup_sprite_column_data_single() \
4731 mov column_data, height; \
4732 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \
4733
4734#define setup_sprite_column_data_multi() \
4735 and height_rounded, height_rounded, #0xF; \
4736 rsb column_data, offset_v, #16; \
4737 \
4738 add height_rounded, height_rounded, #1; \
4739 sub tile_height, tile_height, #1; \
4740 \
4741 orr column_data, column_data, tile_height, lsl #16; \
4742 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
4743 \
4744 orr column_data, column_data, height_rounded, lsl #8 \
4745
/*
 * Load the per-block draw masks from block_masks (word 0 = left column
 * mask, word 1 = right column mask, as packed by the vmov in the builder).
 * Each vdup.u8 replicates one mask byte across a d register; the tile
 * macros later overwrite the upper word of that register with fb_ptr.
 *   left  column: bytes 0 / 1 -> draw_mask_fb_ptr_left / _right
 *   right column: bytes 4 / 5 -> draw_mask_fb_ptr_left / _right
 * The _advance_column variant also computes
 *   fb_ptr_advance_column = 32 - (height << 11),
 * the amount added to fb_ptr after a column has been emitted.
 */
59d15d23 4746#define setup_sprite_setup_left_draw_mask_fb_ptr() \
4747 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4748 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4749
4750#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
4751 mov fb_ptr_advance_column, #32; \
4752 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4753 \
4754 sub fb_ptr_advance_column, height, lsl #11; \
4755 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4756
4757#define setup_sprite_setup_right_draw_mask_fb_ptr() \
4758 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4759 vdup.u8 draw_mask_fb_ptr_right, block_masks[5] \
4760
/*
 * Defines the label
 *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
 * which handles sprites that are a single tile column wide.
 * The left and right column masks both apply to the one column, so the two
 * words of block_masks are ORed together (vext.32 swaps them, vorr merges)
 * before the draw-mask registers are set up. After the column has been
 * emitted the code returns to the builder's caller by popping
 * { r4 - r11, pc }.
 */
4761#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
4762 edge, x4mode) \
4763 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
75e28f62
E
4764 setup_sprite_column_data_##multi_height(); \
4765 vext.32 block_masks_shifted, block_masks, block_masks, #1; \
4766 vorr.u32 block_masks, block_masks, block_masks_shifted; \
59d15d23 4767 setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
75e28f62 4768 \
59d15d23 4769 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
75e28f62
E
4770 ldmia sp!, { r4 - r11, pc } \
4771
/*
 * Steps texture_offset_base to the next tile column: adds 0x100 and, when
 * that leaves bits 8-11 all zero (the field wrapped), subtracts 0x1000 so
 * the carry does not propagate into bit 12 and above.
 */
4772#define setup_sprite_tiled_advance_column() \
4773 add texture_offset_base, texture_offset_base, #0x100; \
4774 tst texture_offset_base, #0xF00; \
4775 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4776
/*
 * Defines the label
 *   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * which handles sprites that span two or more tile columns.
 *  1. The left column is emitted with the left masks; when left_mode is
 *     half only its right block is drawn (edge = right).
 *  2. tile_width - 2 interior columns follow (loop at label 0), always as
 *     full tiles, with both draw-mask register pairs set to zero. The loop
 *     is skipped when tile_width is exactly 2.
 *  3. The right column is emitted with the right masks; when right_mode is
 *     half only its left block is drawn (edge = left).
 * Between columns fb_ptr is advanced by fb_ptr_advance_column and
 * texture_offset_base by setup_sprite_tiled_advance_column(). The code
 * finally returns to the builder's caller by popping { r4 - r11, pc }.
 */
4777#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
59d15d23 4778 right_mode, x4mode) \
4779 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
75e28f62 4780 setup_sprite_column_data_##multi_height(); \
75e28f62 4781 \
59d15d23 4782 setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
75e28f62 4783 \
59d15d23 4784 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
75e28f62
E
4785 \
4786 subs tile_width, tile_width, #2; \
4787 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4788 \
75e28f62
E
4789 beq 1f; \
4790 \
59d15d23 4791 vmov.u8 draw_masks_fb_ptrs, #0; \
4792 vmov.u8 draw_masks_fb_ptrs2, #0; \
4793 \
75e28f62
E
4794 0: \
4795 setup_sprite_tiled_advance_column(); \
59d15d23 4796 setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
75e28f62
E
4797 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4798 subs tile_width, tile_width, #1; \
4799 bne 0b; \
4800 \
4801 1: \
59d15d23 4802 setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
75e28f62
E
4803 \
4804 setup_sprite_tiled_advance_column(); \
59d15d23 4805 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
75e28f62
E
4806 ldmia sp!, { r4 - r11, pc } \
4807
4808
/*
 * Non-4x hooks used by setup_sprite_tiled_builder.
 *  - offset_u_adjust: nothing to do.
 *  - get_left/right_block_mask: reduce the mask to the byte that describes
 *    the block nearest the sprite edge (bits 0-7 of the left mask, bits
 *    8-15 of the right mask).
 *  - compare_*: sets Z when that byte is 0xFF; the builder then treats the
 *    corresponding edge column as a half tile.
 */
59d15d23 4809#define setup_sprite_offset_u_adjust() \
4810
4811#define setup_sprite_get_left_block_mask() \
4812 and left_block_mask, left_block_mask, #0xFF \
4813
4814#define setup_sprite_compare_left_block_mask() \
4815 cmp left_block_mask, #0xFF \
4816
4817#define setup_sprite_get_right_block_mask() \
4818 uxtb right_block_mask, right_block_mask, ror #8 \
4819
4820#define setup_sprite_compare_right_block_mask() \
4821 cmp right_block_mask, #0xFF \
4822
4823
4824
4825/* 4x stuff */
/*
 * 4x hooks used by setup_sprite_tiled_builder.
 * fb_ptr2 is a scratch frame buffer pointer that borrows column_data's
 * register; the 4x tile macros save and restore column_data around its use.
 */
4826#define fb_ptr2 column_data
4827

/*
 * Doubles the horizontal texel offsets: fb_ptr is moved left by another
 * offset_u pixels (2 bytes each), offset_u becomes offset_u * 2 and
 * offset_u_right becomes offset_u_right * 2 + 1.
 */
4828#define setup_sprite_offset_u_adjust_4x() \
4829 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4830 lsl offset_u_right, #1; \
4831 lsl offset_u, #1; \
4832 add offset_u_right, #1 \
4833

/*
 * Same role as the non-4x helpers, but the edge masks are 16 bits wide:
 * bits 0-15 of the left mask and bits 16-31 of the right mask are
 * sign-extended, so a fully set half-word compares equal to 0xFFFFFFFF.
 */
4834#define setup_sprite_get_left_block_mask_4x() \
4835 sxth left_block_mask, left_block_mask \
4836
4837#define setup_sprite_compare_left_block_mask_4x() \
4838 cmp left_block_mask, #0xFFFFFFFF \
4839
4840#define setup_sprite_get_right_block_mask_4x() \
4841 sxth right_block_mask, right_block_mask, ror #16 \
4842
4843#define setup_sprite_compare_right_block_mask_4x() \
4844 cmp right_block_mask, #0xFFFFFFFF \
4845
4846
/*
 * Horizontal 2x expansion: the source d register is copied into both
 * halves of texels_wide and the two copies are zipped, so every 16-bit
 * (widen_texels_16bpp) or 8-bit (widen_texels_8bpp) element of texels_
 * appears twice in a row in texels_wide_low:texels_wide_high.
 */
4847#define widen_texels_16bpp(texels_) \
4848 vmov texels_wide_low, texels_; \
4849 vmov texels_wide_high, texels_; \
4850 vzip.16 texels_wide_low, texels_wide_high \
4851
4852#define widen_texels_8bpp(texels_) \
4853 vmov texels_wide_low, texels_; \
4854 vmov texels_wide_high, texels_; \
4855 vzip.8 texels_wide_low, texels_wide_high \
4856
/*
 * Writes one block and advances block_ by 64:
 *   texels_ (16 bytes) at block_ + 0, then draw_mask_fb_ptr_ - with fb_ptr_
 *   inserted in its upper word - at block_ + 40.
 */
4857#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4858 vst1.u32 { texels_ }, [ block_, :128 ]; \
4859 add block_, block_, #40; \
4860 \
4861 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4862 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4863 add block_, block_, #24 \
4864
/* assumes 16-byte offset already added to block_ */
/* With that bias the 8 texel bytes land at block + 16 and the draw mask /
 * fb_ptr pair at block + 40; block_ again advances by 64 (24 + 40). */
4866#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4867 vst1.u32 { texels_ }, [ block_, :64 ]; \
4868 add block_, block_, #24; \
4869 \
4870 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4871 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4872 add block_, block_, #40 \
4873
/*
 * Turn one source block (8 texels) into four output blocks: the texels are
 * doubled horizontally (two blocks side by side, at fb_ptr and
 * fb_ptr + 16 bytes) and each of those is emitted twice, the second copy
 * 2048 bytes further on (one frame buffer row below, given the 2048-byte
 * row stride used throughout this file).
 *   fb_ptr_tmp          = scratch register for the derived addresses
 *   draw_mask_fb_ptr_a_ = mask register for the two left-hand blocks
 *   draw_mask_fb_ptr_b_ = mask register for the two right-hand blocks
 * The 16bpp variant widens texels_low, then texels_high; the 8bpp variant
 * widens `texels` once and writes the low and high halves of the result.
 * fb_ptr itself is not modified.
 */
4874#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4875 draw_mask_fb_ptr_b_) \
4876 widen_texels_16bpp(texels_low); \
4877 add fb_ptr_tmp, fb_ptr, #1024*2; \
4878 \
4879 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
4880 \
4881 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4882 widen_texels_16bpp(texels_high); \
4883 \
4884 add fb_ptr_tmp, fb_ptr, #8*2; \
4885 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4886 \
4887 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4888 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4889
4890#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4891 draw_mask_fb_ptr_b_) \
4892 widen_texels_8bpp(texels); \
4893 add fb_ptr_tmp, fb_ptr, #1024*2; \
4894 \
4895 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
4896 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4897 \
4898 add fb_ptr_tmp, fb_ptr, #8*2; \
4899 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4900 \
4901 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4902 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4903
4904
/*
 * 4x prologues. The 4bpp one only loads and de-interleaves the CLUT (even
 * bytes -> clut_a, odd bytes -> clut_b); the 8bpp one is empty. Unlike the
 * non-4x versions, neither checks the dirty texture masks.
 * NOTE(review): presumably the texture cache is refreshed elsewhere before
 * the _4x entry points run - confirm against the callers.
 */
4905#define setup_sprite_tiled_initialize_4bpp_4x() \
4906 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4907 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4908 \
4909 vuzp.u8 clut_a, clut_b \
4910
4911#define setup_sprite_tiled_initialize_8bpp_4x() \
4912
4913

/*
 * Block counts for the 4x tiles: every source block becomes four output
 * blocks, hence sub_tile_height * 4 for half tiles and * 8 for full tiles.
 */
4914#define setup_sprite_block_count_single_4x() \
4915 sub_tile_height, lsl #2 \
4916
4917#define setup_sprite_block_count_double_4x() \
4918 sub_tile_height, lsl #(1+2) \
4919
/*
 * 4x version of setup_sprite_tile_full_4bpp: per source row both 8-texel
 * halves are palette-expanded (vtbl on the low / high CLUT tables, vzip.8
 * to form 16-bit pixels) and handed to do_texture_block_16bpp_4x, which
 * emits four blocks each. The left half uses the draw_mask_fb_ptr_left_a/_b
 * registers, the right half (32 bytes further right in the frame buffer)
 * the draw_mask_fb_ptr_right_a/_b registers.
 * Per source row fb_ptr advances by 16*2 + (2048 - 16)*2 = 4096 bytes and
 * texture_offset by 0x10.
 * column_data is saved on the stack for the duration of the loop because
 * its register doubles as fb_ptr2; 8 bytes are reserved for the 4-byte
 * value.
 */
4920#define setup_sprite_tile_full_4bpp_4x(edge) \
4921 setup_sprite_tile_add_blocks(double_4x); \
4922 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4923 \
4924 4: \
4925 and texture_block_ptr, texture_offset, texture_mask; \
4926 pld [ fb_ptr ]; \
4927 \
4928 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4929 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4930 \
4931 add texture_block_ptr, texture_offset, #8; \
4932 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4933 \
4934 and texture_block_ptr, texture_block_ptr, texture_mask; \
4935 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4936 \
4937 vzip.8 texels_low, texels_high; \
4938 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
4939 draw_mask_fb_ptr_left_b); \
4940 \
4941 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
8438c3c7 4942 pld [ fb_ptr, #2048 ]; \
59d15d23 4943 \
4944 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
8438c3c7 4945 add fb_ptr, fb_ptr, #16*2; \
59d15d23 4946 \
8438c3c7 4947 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
59d15d23 4948 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4949 \
4950 vzip.8 texels_low, texels_high; \
4951 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
4952 draw_mask_fb_ptr_right_b); \
4953 \
4954 add texture_offset, texture_offset, #0x10; \
4955 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
4956 \
4957 subs sub_tile_height, sub_tile_height, #1; \
4958 bne 4b; \
4959 \
4960 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4961 add texture_offset, texture_offset, #0xF00; \
4962 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4963
4964
/*
 * 4x version of setup_sprite_tile_half_4bpp: per source row one 8-texel
 * block is palette-expanded (vtbl on the low / high CLUT tables, vzip.8 to
 * form 16-bit pixels) and handed to do_texture_block_16bpp_4x, which emits
 * four blocks using the draw_mask_fb_ptr_<edge>_a / _b registers.
 * Per source row fb_ptr advances by 2048 * 2 bytes and texture_offset by
 * 0x10; after the last row texture_offset gets another 0xF00 and
 * num_blocks is written back to psx_gpu.
 * column_data is saved on the stack for the duration of the loop because
 * its register doubles as fb_ptr2; 8 bytes are reserved for the 4-byte
 * value.
 *
 * A second "add texture_block_ptr, texture_page_ptr, texture_block_ptr"
 * that used to follow the texel load has been dropped: texture_block_ptr is
 * recomputed at the top of every iteration and is not read in between, so
 * the instruction had no effect.
 */
4965#define setup_sprite_tile_half_4bpp_4x(edge) \
4966 setup_sprite_tile_add_blocks(single_4x); \
4967 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4968 \
4969 4: \
4970 and texture_block_ptr, texture_offset, texture_mask; \
4971 pld [ fb_ptr ]; \
4972 \
4973 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4974 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4975 \
4977 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4978 \
4979 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4980 add texture_offset, texture_offset, #0x10; \
4981 \
4982 vzip.8 texels_low, texels_high; \
4983 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
4984 draw_mask_fb_ptr_##edge##_b); \
4985 \
8438c3c7 4986 pld [ fb_ptr, #2048 ]; \
59d15d23 4987 add fb_ptr, fb_ptr, #2048 * 2; \
59d15d23 4988 \
8438c3c7 4989 subs sub_tile_height, sub_tile_height, #1; \
59d15d23 4990 bne 4b; \
4991 \
4992 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4993 add texture_offset, texture_offset, #0xF00; \
4994 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4995
4996
/*
 * 4x version of setup_sprite_tile_full_8bpp: per source row both 8-texel
 * halves are loaded raw and handed to do_texture_block_8bpp_4x, which
 * emits four blocks each (left half with the draw_mask_fb_ptr_left_a/_b
 * registers, right half with draw_mask_fb_ptr_right_a/_b).
 * `block` is biased by +16 while the loop runs, as write_block_8bpp
 * expects, and un-biased afterwards. column_data is saved on the stack
 * because its register doubles as fb_ptr2.
 * Per source row fb_ptr advances by 16*2 + (2048 - 16)*2 = 4096 bytes and
 * texture_offset by 0x10.
 */
4997#define setup_sprite_tile_full_8bpp_4x(edge) \
4998 setup_sprite_tile_add_blocks(double_4x); \
4999 add block, block, #16; \
5000 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5001 \
5002 4: \
5003 and texture_block_ptr, texture_offset, texture_mask; \
5004 pld [ fb_ptr ]; \
5005 \
5006 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5007 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
5008 \
5009 add texture_block_ptr, texture_offset, #8; \
5010 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
5011 draw_mask_fb_ptr_left_b); \
5012 \
8438c3c7 5013 pld [ fb_ptr, #2048 ]; \
59d15d23 5014 and texture_block_ptr, texture_block_ptr, texture_mask; \
5015 \
5016 add fb_ptr, fb_ptr, #16*2; \
5017 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5018 \
5019 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
59d15d23 5020 \
5021 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
5022 draw_mask_fb_ptr_right_b); \
5023 \
5024 add texture_offset, texture_offset, #0x10; \
5025 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
5026 \
5027 subs sub_tile_height, sub_tile_height, #1; \
5028 bne 4b; \
5029 \
5030 sub block, block, #16; \
5031 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5032 add texture_offset, texture_offset, #0xF00; \
5033 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5034
5035
/*
 * 4x version of setup_sprite_tile_half_8bpp: per source row one 8-texel
 * block is loaded raw and handed to do_texture_block_8bpp_4x, which emits
 * four blocks using the draw_mask_fb_ptr_<edge>_a / _b registers.
 * `block` is biased by +16 while the loop runs and un-biased afterwards;
 * column_data is saved on the stack because its register doubles as
 * fb_ptr2. Per source row fb_ptr advances by 2048 * 2 bytes and
 * texture_offset by 0x10.
 */
5036#define setup_sprite_tile_half_8bpp_4x(edge) \
5037 setup_sprite_tile_add_blocks(single_4x); \
5038 add block, block, #16; \
5039 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5040 \
5041 4: \
5042 and texture_block_ptr, texture_offset, texture_mask; \
5043 pld [ fb_ptr ]; \
5044 \
5045 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5046 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
5047 \
8438c3c7 5048 pld [ fb_ptr, #2048 ]; \
59d15d23 5049 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5050 draw_mask_fb_ptr_##edge##_b); \
5051 \
5052 add texture_offset, texture_offset, #0x10; \
5053 add fb_ptr, fb_ptr, #2048 * 2; \
5054 \
5055 subs sub_tile_height, sub_tile_height, #1; \
5056 bne 4b; \
5057 \
5058 sub block, block, #16; \
5059 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5060 add texture_offset, texture_offset, #0xF00; \
5061 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5062
5063
/*
 * 4x counterparts of the per-column fix-ups. They differ from the non-4x
 * versions only in the frame buffer displacement for a right-half column:
 * 16 * 2 bytes instead of 16. The texture offset displacement (+8) is
 * unchanged.
 */
5064#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
5065 add texture_offset, texture_offset_base, #8; \
5066 add fb_ptr, fb_ptr, #16 * 2 \
5067
5068#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
5069 mov texture_offset, texture_offset_base \
5070
5071#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
5072 setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() \
5073
5074#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
5075 mov texture_offset, texture_offset_base \
5076
5077#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
5078 sub fb_ptr, fb_ptr, #16 * 2 \
5079
5080#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() \
5081
5082#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
5083 setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() \
5084
5085#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge) \
5086
5087
/*
 * 4x draw-mask setup. Each column now consists of four blocks per row
 * pair, so four mask bytes are replicated: bytes 0-3 of block_masks for
 * the left column, bytes 4-7 for the right column, into
 * draw_mask_fb_ptr_left_a / left_b / right_a / right_b.
 * The _advance_column variant also computes
 *   fb_ptr_advance_column = 32 * 2 - (height << 12).
 */
5088#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
5089 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5090 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5091 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5092 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5093
5094#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
5095 mov fb_ptr_advance_column, #32 * 2; \
5096 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5097 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5098 sub fb_ptr_advance_column, height, lsl #11 + 1; \
5099 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5100 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5101
5102#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
5103 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
5104 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
5105 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
5106 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7] \
5107
5108
75e28f62
E
5109// r0: psx_gpu
5110// r1: x
5111// r2: y
5112// r3: u
5113// [ sp ]: v
5114// [ sp + 4 ]: width
5115// [ sp + 8 ]: height
5116// [ sp + 12 ]: color (unused)
5117

/*
 * setup_sprite_tiled_builder(texture_mode, x4mode) generates
 *  - the 14 specialised column routines (every combination of single/multi
 *    width, single/multi height and full/half edge columns that the table
 *    below refers to), and
 *  - the entry point setup_sprite_<texture_mode><x4mode>.
 *
 * The entry point pushes r4 - r11 and lr (36 bytes, which is why the stack
 * arguments are read from sp + 36 / 40 / 44), then computes:
 *   offset_u / offset_v  = u & 0xF / v & 0xF (position inside the first tile)
 *   fb_ptr               = vram_out_ptr + y * 2048 + (x - offset_u) * 2
 *   tile_width / _height = (offset + size + 15) >> 4
 *   texture_offset_base  = swizzled texture address assembled with bfi
 *                          (see the VH-UH-VL-00 notes below)
 *   texture_mask         = swizzled the same way from the 16-bit value at
 *                          psx_gpu_texture_mask_width_offset
 *   block_masks          = { ~(0xFFFFFFFF << offset_u),
 *                            0xFFFFFFFE << offset_u_right }
 *   block                = blocks + num_blocks * 64
 * and finally dispatches through the .word table with
 *   control_mask bit 0 = tile_width  == 1
 *                bit 1 = tile_height == 1
 *                bit 2 = left edge mask compares as fully set (half tile)
 *                bit 3 = right edge mask compares as fully set (half tile)
 * "ldr pc, [ pc, control_mask, lsl #2 ]" reads pc as the address of the ldr
 * plus 8, i.e. the first .word after the nop.
 * The selected routine returns to the caller itself (ldmia ..., pc).
 *
 * NOTE(review): entry 13 is a null pointer and there is no entry 15, so
 * the combinations "single width + both edge masks fully set" are assumed
 * never to occur - confirm that callers cannot produce them.
 */
59d15d23 5118#define setup_sprite_tiled_builder(texture_mode, x4mode) \
5119 \
5120setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
5121 x4mode); \
5122setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
5123 x4mode); \
5124setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
5125 x4mode); \
5126setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
5127 x4mode); \
5128setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
5129 x4mode); \
5130setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
5131 x4mode); \
5132setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
5133 x4mode); \
5134setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
5135 x4mode); \
5136setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
5137 x4mode); \
5138setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
5139 x4mode); \
5140setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
5141 x4mode); \
5142setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
5143 x4mode); \
5144setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
5145 x4mode); \
5146setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
5147 x4mode); \
75e28f62
E
5148 \
5149.align 4; \
5150 \
59d15d23 5151function(setup_sprite_##texture_mode##x4mode) \
75e28f62 5152 stmdb sp!, { r4 - r11, r14 }; \
59d15d23 5153 setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
75e28f62
E
5154 \
5155 ldr v, [ sp, #36 ]; \
5156 and offset_u, u, #0xF; \
5157 \
5158 ldr width, [ sp, #40 ]; \
c1817bd9 5159 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
5160 \
5161 ldr height, [ sp, #44 ]; \
5162 add fb_ptr, fb_ptr, y, lsl #11; \
5163 \
5164 add fb_ptr, fb_ptr, x, lsl #1; \
5165 and offset_v, v, #0xF; \
5166 \
5167 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
5168 add width_rounded, offset_u, width; \
5169 \
5170 add height_rounded, offset_v, height; \
5171 add width_rounded, width_rounded, #15; \
5172 \
5173 add height_rounded, height_rounded, #15; \
5174 mov tile_width, width_rounded, lsr #4; \
5175 \
5176 /* texture_offset_base = VH-VL-00-00 */\
5177 mov texture_offset_base, v, lsl #8; \
5178 and offset_u_right, width_rounded, #0xF; \
5179 \
5180 /* texture_offset_base = VH-UH-UL-00 */\
5181 bfi texture_offset_base, u, #4, #8; \
59d15d23 5182 mov right_block_mask, #0xFFFFFFFE; \
5183 \
5184 setup_sprite_offset_u_adjust##x4mode(); \
75e28f62
E
5185 \
5186 /* texture_offset_base = VH-UH-VL-00 */\
5187 bfi texture_offset_base, v, #4, #4; \
59d15d23 5188 mov left_block_mask, #0xFFFFFFFF; \
75e28f62
E
5189 \
5190 mov tile_height, height_rounded, lsr #4; \
5191 mvn left_block_mask, left_block_mask, lsl offset_u; \
5192 \
5193 /* texture_mask = HH-HL-WH-WL */\
5194 ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
5195 mov right_block_mask, right_block_mask, lsl offset_u_right; \
5196 \
5197 /* texture_mask_rev = WH-WL-HH-HL */\
5198 rev16 texture_mask_rev, texture_mask; \
5199 vmov block_masks, left_block_mask, right_block_mask; \
5200 \
5201 /* texture_mask = HH-HL-HL-WL */\
5202 bfi texture_mask, texture_mask_rev, #4, #4; \
5203 /* texture_mask_rev = 00-00-00-WH */\
5204 mov texture_mask_rev, texture_mask_rev, lsr #12; \
5205 \
5206 /* texture_mask = HH-WH-HL-WL */\
5207 bfi texture_mask, texture_mask_rev, #8, #4; \
59d15d23 5208 setup_sprite_get_left_block_mask##x4mode(); \
75e28f62
E
5209 \
5210 mov control_mask, #0; \
59d15d23 5211 setup_sprite_compare_left_block_mask##x4mode(); \
75e28f62 5212 \
59d15d23 5213 setup_sprite_get_right_block_mask##x4mode(); \
75e28f62
E
5214 orreq control_mask, control_mask, #0x4; \
5215 \
5216 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
59d15d23 5217 setup_sprite_compare_right_block_mask##x4mode(); \
75e28f62
E
5218 \
5219 orreq control_mask, control_mask, #0x8; \
5220 cmp tile_width, #1; \
5221 \
5222 add block, psx_gpu, #psx_gpu_blocks_offset; \
5223 orreq control_mask, control_mask, #0x1; \
5224 \
5225 cmp tile_height, #1; \
5226 add block, block, num_blocks, lsl #6; \
5227 \
5228 orreq control_mask, control_mask, #0x2; \
5229 ldr pc, [ pc, control_mask, lsl #2 ]; \
5230 nop; \
5231 \
59d15d23 5232 .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \
5233 .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \
5234 .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \
5235 .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \
5236 .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \
5237 .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \
5238 .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \
5239 .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \
5240 .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \
5241 .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \
5242 .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \
5243 .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \
5244 .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \
75e28f62 5245 .word 0x00000000; \
59d15d23 5246 .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \
5247
5248
/*
 * Instantiate the sprite setup code: setup_sprite_4bpp / setup_sprite_8bpp
 * (empty x4mode suffix) and setup_sprite_4bpp_4x / setup_sprite_8bpp_4x.
 * The two-register draw mask aliases are dropped before the _4x
 * instantiations, which use the _a / _b aliases instead.
 */
5249setup_sprite_tiled_builder(4bpp,);
5250setup_sprite_tiled_builder(8bpp,);
75e28f62 5251

59d15d23 5252#undef draw_mask_fb_ptr_left
5253#undef draw_mask_fb_ptr_right
75e28f62 5254

59d15d23 5255setup_sprite_tiled_builder(4bpp, _4x);
5256setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5257
5258
// Register aliases for texture_sprite_blocks_8bpp.
// block_ptr shares r0 with psx_gpu (psx_gpu is dead once block_ptr is set up),
// and every texels_NM pair shares the register of its even-numbered texel.
5259#undef block_ptr
5260#undef num_blocks
5261#undef clut_ptr
5262
5263#define psx_gpu r0
5264#define block_ptr r0
5265#define num_blocks r1
5266#define clut_ptr r2
5267#define texel_shift_mask r3
5268#define block_pixels_a r4
5269#define block_pixels_b r5
5270#define texel_0 r6
5271#define texel_2 r7
5272#define texel_4 r8
5273#define texel_6 r9
5274#define texel_1 r10
5275#define texel_3 r11
5276#define texel_5 r12
5277#define texel_7 r14
5278#define texels_01 r6
5279#define texels_23 r7
5280#define texels_45 r8
5281#define texels_67 r9
5282
// texture_sprite_blocks_8bpp: in r0 = psx_gpu.
// Walks the block array at psx_gpu + psx_gpu_blocks_offset (64 bytes per
// block, psx_gpu's num_blocks entries). For each block the 8 byte-sized texel
// indices stored at block+16..23 are looked up as halfwords in clut_ptr and
// the 8 results are written to block+0..15.
// The count is only tested at the bottom of the loop, so at least one block
// is always processed.
5283function(texture_sprite_blocks_8bpp)
5284 stmdb sp!, { r4 - r11, r14 }
// 0xFF << 1 isolates one index that has already been scaled by 2
// (byte offset of a halfword CLUT entry).
5285 movw texel_shift_mask, #(0xFF << 1)
5286
5287 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5288 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
5289
// From here on r0 is block_ptr, no longer psx_gpu.
5290 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
// Indices 0-3 of the first block.
5291 ldr block_pixels_a, [ block_ptr, #16 ]
5292
5293 0:
// texel_N = index N * 2: index N sits at bit 8*N, so shifting by 8*N - 1
// (lsl #1 for N = 0) places it on bits 8:1 before masking.
5294 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
// Indices 4-7 of the current block.
5295 ldr block_pixels_b, [ block_ptr, #20 ]
5296
5297 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5298 ldrh texel_0, [ clut_ptr, texel_0 ]
5299
5300 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5301 ldrh texel_1, [ clut_ptr, texel_1 ]
5302
5303 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
// block_pixels_a is dead now: preload indices 0-3 of the next block. This is
// also issued on the final iteration, where it reads one block past the last.
5304 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
5305
5306 ldrh texel_2, [ clut_ptr, texel_2 ]
5307 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5308
5309 ldrh texel_3, [ clut_ptr, texel_3 ]
5310 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5311
5312 ldrh texel_4, [ clut_ptr, texel_4 ]
5313 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5314
5315 ldrh texel_5, [ clut_ptr, texel_5 ]
5316 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5317
// Pack two CLUT entries per word, lower-numbered texel in the low halfword.
5318 ldrh texel_6, [ clut_ptr, texel_6 ]
5319 orr texels_01, texel_0, texel_1, lsl #16
5320
5321 ldrh texel_7, [ clut_ptr, texel_7 ]
5322 orr texels_23, texel_2, texel_3, lsl #16
5323
5324 orr texels_45, texel_4, texel_5, lsl #16
5325 str texels_01, [ block_ptr, #0 ]
5326
5327 orr texels_67, texel_6, texel_7, lsl #16
5328 str texels_23, [ block_ptr, #4 ]
5329
// The flags from this subs are consumed by the bne below; the stores and the
// add in between do not touch them.
5330 subs num_blocks, num_blocks, #1
5331 str texels_45, [ block_ptr, #8 ]
5332
5333 str texels_67, [ block_ptr, #12 ]
5334 add block_ptr, block_ptr, #64
5335
5336 bne 0b
5337
5338 ldmia sp!, { r4 - r11, pc }
5339
5340
// Register aliases used by setup_sprites_16bpp_flush, setup_sprite_16bpp and
// setup_sprite_16bpp_4x. Many names share a core register (for example x and
// texture_offset_base are both r1); an earlier name is only meaningful until
// the routine writes the later one.
5341#undef width_rounded
5342#undef texture_mask
5343#undef num_blocks
5344#undef texture_offset
59d15d23 5345#undef texels_low
5346#undef texels_high
5347#undef texels_wide_low
5348#undef texels_wide_high
5349#undef texels_wide
5350#undef fb_ptr2
75e28f62
E
5351
// Incoming values (r0-r3 on entry, v/width/height loaded from the stack) and
// the geometry derived from them.
5352#define psx_gpu r0
5353#define x r1
5354#define y r2
5355#define u r3
5356#define v r4
5357#define width r5
5358#define height r6
5359#define left_offset r8
5360#define width_rounded r9
5361#define right_width r10
59d15d23 5362
75e28f62
E
5363#define block_width r11
5364
// Working registers of the block emission loops.
5365#define texture_offset_base r1
5366#define texture_mask r2
5367#define texture_page_ptr r3
5368#define num_blocks r4
5369#define block r5
5370#define fb_ptr r7
5371#define texture_offset r8
5372#define blocks_remaining r9
59d15d23 5373#define fb_ptr2 r10
75e28f62
E
5374#define fb_ptr_pitch r12
5375#define texture_block_ptr r14
5376
// Temporaries that only live during mask setup.
5377#define texture_mask_width r2
5378#define texture_mask_height r3
5379#define left_mask_bits r4
5380#define right_mask_bits r5
5381
5382
5383#undef block_masks
5384#undef block_masks_shifted
5385#undef texels
5386
// NEON aliases. texels (q2) is the same storage as texels_low/texels_high
// (d4/d5), and draw_mask_fb_ptr / draw_mask_fb_ptr_a are both d2.
5387#define block_masks d0
5388#define block_masks_shifted d1
5389#define draw_mask_fb_ptr d2
5390#define texels q2
5391
59d15d23 5392#define draw_mask_fb_ptr_a d2
5393#define draw_mask_fb_ptr_b d3
5394#define texels_low d4
5395#define texels_high d5
5396#define texels_wide_low d6
5397#define texels_wide_high d7
5398#define texels_wide q3
75e28f62 5399
75e28f62 5400
// setup_sprites_16bpp_flush: local helper reached with bl/blgt from the sprite
// setup loops after num_blocks has been bumped past MAX_BLOCKS.
// Calls flush_render_block_buffer with r0 = psx_gpu, preserving d0-d3 and
// r0-r3, r12, lr around the call, then restarts the queue: block points at the
// first entry of psx_gpu's block array and num_blocks is set to block_width,
// i.e. the number of blocks the caller is about to emit.
// NOTE(review): callers keep live state in r4-r11; this relies on
// flush_render_block_buffer preserving them (presumably an AAPCS C function -
// confirm).
// NOTE(review): the callers visible in this file have 36 bytes pushed when
// they get here; with the 32 + 24 bytes pushed below, sp would be 4 mod 8 at
// the bl if they were entered with an 8-byte aligned sp - confirm the callee
// tolerates that.
59d15d23 5401setup_sprites_16bpp_flush:
5402 vpush { d0 - d3 }
75e28f62
E
5403
5404 stmdb sp!, { r0 - r3, r12, r14 }
5405 bl flush_render_block_buffer
5406 ldmia sp!, { r0 - r3, r12, r14 }
5407
59d15d23 5408 vpop { d0 - d3 }
75e28f62
E
5409
// Restart at the head of the block array, already accounting for the blocks
// the caller is about to write.
5410 add block, psx_gpu, #psx_gpu_blocks_offset
5411 mov num_blocks, block_width
5412
5413 bx lr
5414
// setup_sprite_16bpp
// In: r0 = psx_gpu, r1 = x, r2 = y, r3 = u; v, width and height are read from
// the caller's stack (sp+36/40/44 once the nine registers below are pushed).
// Emits one 64-byte block per 8 texels of every sprite row:
//   block+0  : 16 bytes of texels copied from the texture page
//   block+40 : 32-bit draw mask word (one mask byte replicated)
//   block+44 : framebuffer pointer for the block
// psx_gpu's num_blocks is written back after every row; when a row would push
// it above MAX_BLOCKS, setup_sprites_16bpp_flush is called first.
5415function(setup_sprite_16bpp)
5416 stmdb sp!, { r4 - r11, r14 }
c1817bd9 5417 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
5418
// fb_ptr = vram_out_ptr + y * 2048 + x * 2, interleaved with the argument
// loads. Each incoming alias (x, y, u, v, width) is consumed before the
// working alias sharing its register is written.
5419 ldr v, [ sp, #36 ]
5420 add fb_ptr, fb_ptr, y, lsl #11
5421
5422 ldr width, [ sp, #40 ]
5423 add fb_ptr, fb_ptr, x, lsl #1
5424
5425 ldr height, [ sp, #44 ]
// left_offset = u & 7: texels to skip in the first 8-texel block.
5426 and left_offset, u, #0x7
5427
// texture_offset_base = u * 2 + v * 2048 (byte offset inside the page).
5428 add texture_offset_base, u, u
5429 add width_rounded, width, #7
5430
5431 add texture_offset_base, v, lsl #11
5432 mov left_mask_bits, #0xFF
5433
5434 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
// width_rounded = width + 7 + left_offset.
5435 add width_rounded, width_rounded, left_offset
5436
5437 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
// Step fb_ptr back by left_offset pixels so it lines up with the 8-texel
// aligned block start.
5438 sub fb_ptr, fb_ptr, left_offset, lsl #1
5439
// texture_mask = mask_width * 2 + mask_height * 2048, the same layout as
// texture_offset_base.
5440 add texture_mask, texture_mask_width, texture_mask_width
5441 mov right_mask_bits, #0xFE
5442
5443 and right_width, width_rounded, #0x7
// left_mask_bits = ~(0xFF << left_offset): the low left_offset bits are set.
5444 mvn left_mask_bits, left_mask_bits, lsl left_offset
5445
5446 add texture_mask, texture_mask_height, lsl #11
// block_width = number of 8-texel blocks per row.
5447 mov block_width, width_rounded, lsr #3
5448
// right_mask_bits = 0xFE << right_width.
5449 mov right_mask_bits, right_mask_bits, lsl right_width
5450 movw fb_ptr_pitch, #(2048 + 16)
5451
// fb_ptr_pitch = 2048 + 16 - block_width * 16: fb_ptr advances 16 bytes after
// every block of a row except the last, so adding the pitch lands exactly one
// 2048-byte line further down.
5452 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
// block_masks = { left_mask_bits, right_mask_bits } as two 32-bit lanes; r4/r5
// are free for num_blocks/block afterwards.
5453 vmov block_masks, left_mask_bits, right_mask_bits
5454
5455 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5456 add block, psx_gpu, #psx_gpu_blocks_offset
5457
// Align the texture offset down to a 16-byte (8-texel) boundary.
6ea0f7bf 5458 bic texture_offset_base, texture_offset_base, #0xF
75e28f62
E
// The flags from this cmp are consumed by the bne 0f below.
5459 cmp block_width, #1
5460
5461 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
// block = first free entry (64 bytes per block).
5462 add block, block, num_blocks, lsl #6
5463
5464 bne 0f
5465
// --- block_width == 1: a single block per row carries both edge masks. ---
// Swap the two lanes, OR them together and replicate the resulting low byte.
5466 vext.32 block_masks_shifted, block_masks, block_masks, #1
5467 vorr.u32 block_masks, block_masks, block_masks_shifted
5468 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5469
5470 1:
5471 add num_blocks, num_blocks, #1
5472 cmp num_blocks, #MAX_BLOCKS
59d15d23 5473 blgt setup_sprites_16bpp_flush
75e28f62
E
5474
// Wrap the offset with texture_mask, then fetch 8 texels from the page.
5475 and texture_block_ptr, texture_offset_base, texture_mask
// The flags from this subs are consumed by the bne 1b at the bottom.
5476 subs height, height, #1
5477
5478 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5479 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5480
5481 vst1.u32 { texels }, [ block, :128 ]
5482 add block, block, #40
5483
// Upper word of d2 = fb_ptr; the lower word keeps the replicated mask bytes.
5484 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5485 pld [ fb_ptr ]
5486
5487 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5488
5489 add block, block, #24
// Next texture row and next framebuffer line.
5490 add texture_offset_base, texture_offset_base, #2048
5491 add fb_ptr, fb_ptr, #2048
5492 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5493 bne 1b
5494
5495 ldmia sp!, { r4 - r11, pc }
5496
// --- block_width >= 2: left edge block, interior blocks, right edge block. ---
5497 0:
5498 add num_blocks, num_blocks, block_width
5499 mov texture_offset, texture_offset_base
5500
5501 cmp num_blocks, #MAX_BLOCKS
59d15d23 5502 blgt setup_sprites_16bpp_flush
75e28f62
E
5503
5504 add texture_offset_base, texture_offset_base, #2048
5505 and texture_block_ptr, texture_offset, texture_mask
5506
5507 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5508 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5509
5510 vst1.u32 { texels }, [ block, :128 ]
5511 add block, block, #40
5512
// Left edge: replicate byte 0 of block_masks (low byte of left_mask_bits).
5513 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5514 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5515 pld [ fb_ptr ]
5516
5517 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// blocks_remaining = interior block count; zero skips straight to the right
// edge (the beq 2f below uses these flags).
5518 subs blocks_remaining, block_width, #2
5519
5520 add texture_offset, texture_offset, #16
5521 add fb_ptr, fb_ptr, #16
5522
// Interior blocks use an all-zero mask.
5523 vmov.u8 draw_mask_fb_ptr, #0
5524
5525 add block, block, #24
5526 beq 2f
5527
5528 1:
5529 and texture_block_ptr, texture_offset, texture_mask
5530 subs blocks_remaining, blocks_remaining, #1
5531
5532 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5533 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5534
5535 vst1.u32 { texels }, [ block, :128 ]
5536 add block, block, #40
5537
5538 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5539 pld [ fb_ptr ]
5540
5541 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5542
5543 add texture_offset, texture_offset, #16
5544 add fb_ptr, fb_ptr, #16
5545
5546 add block, block, #24
5547 bne 1b
5548
5549 2:
5550 and texture_block_ptr, texture_offset, texture_mask
5551 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5552
5553 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
// Right edge: replicate byte 4 of block_masks (low byte of right_mask_bits).
5554 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5555
5556 vst1.u32 { texels }, [ block, :128 ]
5557 add block, block, #40
5558
5559 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5560 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5561
5562 add block, block, #24
5563 subs height, height, #1
5564
// No +16 after the last block: fb_ptr_pitch accounts for that.
5565 add fb_ptr, fb_ptr, fb_ptr_pitch
5566 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5567
5568 bne 0b
5569
5570 ldmia sp!, { r4 - r11, pc }
5571
5572
59d15d23 5573// 4x version
5574// FIXME: duplicate code with normal version :(
// setup_sprite_16bpp_4x takes the same arguments as setup_sprite_16bpp
// (r0 = psx_gpu, r1 = x, r2 = y, r3 = u; v, width, height at sp+36/40/44 after
// the push) but doubles every framebuffer distance: 4 bytes per source pixel
// horizontally and 2 * 2048 bytes per source row. The edge masks are 16 bits
// wide and split into two bytes (draw_mask_fb_ptr_a / _b).
// Block emission itself is done by the do_texture_block_16bpp_4x macro, which
// is defined outside this section.
// NOTE(review): block_width is multiplied by 4 before it is added to
// num_blocks, which implies the macro emits four blocks per call and must
// leave the condition flags alone - confirm against the macro.
5575#undef draw_mask_fb_ptr
5576
5577function(setup_sprite_16bpp_4x)
5578 stmdb sp!, { r4 - r11, r14 }
5579 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5580
5581 ldr v, [ sp, #36 ]
5582 add fb_ptr, fb_ptr, y, lsl #11
5583
5584 ldr width, [ sp, #40 ]
5585 add fb_ptr, fb_ptr, x, lsl #1
5586
5587 ldr height, [ sp, #44 ]
5588 and left_offset, u, #0x7
5589
// texture_offset_base = u * 2 + v * 2048 (byte offset inside the page).
5590 add texture_offset_base, u, u
5591 add width_rounded, width, #7
5592
5593 add texture_offset_base, v, lsl #11
5594 movw left_mask_bits, #0xFFFF
5595
5596 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5597 add width_rounded, width_rounded, left_offset
5598
// left_offset is doubled here, after width_rounded has used the original
// value: two mask bits and four framebuffer bytes per skipped source texel.
5599 lsl left_offset, #1
5600
5601 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
5602 sub fb_ptr, fb_ptr, left_offset, lsl #1
5603
5604 add texture_mask, texture_mask_width, texture_mask_width
5605 movw right_mask_bits, #0xFFFC
5606
5607 and right_width, width_rounded, #0x7
// left_mask_bits = ~(0xFFFF << (2 * (u & 7))).
5608 mvn left_mask_bits, left_mask_bits, lsl left_offset
5609
5610 lsl right_width, #1
5611
5612 add texture_mask, texture_mask_height, lsl #11
// block_width = source blocks (8 texels each) per row, for now.
5613 mov block_width, width_rounded, lsr #3
5614
// right_mask_bits = 0xFFFC << (2 * right_width).
5615 mov right_mask_bits, right_mask_bits, lsl right_width
5616 movw fb_ptr_pitch, #(2048 + 16) * 2
5617
// fb_ptr_pitch = 2 * (2048 + 16) - block_width * 32: fb_ptr advances 32 bytes
// after every source block of a row except the last.
5618 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5619 vmov block_masks, left_mask_bits, right_mask_bits
5620
5621 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5622 add block, psx_gpu, #psx_gpu_blocks_offset
5623
5624 bic texture_offset_base, texture_offset_base, #0xF
// Compare the source block count; the bne 0f below uses these flags (the lsl
// in between does not update them).
5625 cmp block_width, #1
5626
5627 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5628 add block, block, num_blocks, lsl #6
5629
// From here on block_width counts queue entries: 4 per source block.
5630 lsl block_width, #2
5631 bne 0f
5632
// --- one source block per row: OR the left and right masks together. ---
5633 vext.32 block_masks_shifted, block_masks, block_masks, #1
5634 vorr.u32 block_masks, block_masks, block_masks_shifted
5635 vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5636 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5637
5638 1:
5639 add num_blocks, num_blocks, block_width
5640 cmp num_blocks, #MAX_BLOCKS
5641 blgt setup_sprites_16bpp_flush
5642
5643 and texture_block_ptr, texture_offset_base, texture_mask
// The flags from this subs are consumed by the bne 1b at the bottom.
5644 subs height, height, #1
5645
5646 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5647 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5648
5649 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5650
// One texture row down, two framebuffer lines down.
5651 add texture_offset_base, texture_offset_base, #2048
5652 add fb_ptr, fb_ptr, #2048*2
5653 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5654 bne 1b
5655
5656 ldmia sp!, { r4 - r11, pc }
5657
// --- two or more source blocks per row: left edge, interior, right edge. ---
5658 0:
5659 add num_blocks, num_blocks, block_width
5660 mov texture_offset, texture_offset_base
5661
5662 vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5663 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5664
5665 cmp num_blocks, #MAX_BLOCKS
5666 blgt setup_sprites_16bpp_flush
5667
5668 add texture_offset_base, texture_offset_base, #2048
5669 and texture_block_ptr, texture_offset, texture_mask
5670
5671 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5672 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5673
5674 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5675
// blocks_remaining = interior queue entries (block_width is already x4, so two
// edge source blocks are 8 entries); zero skips to the right edge at 2:.
5676 subs blocks_remaining, block_width, #2*4
5677 add texture_offset, texture_offset, #16
5678
// Interior blocks use all-zero masks.
5679 vmov.u8 draw_mask_fb_ptr_a, #0
5680 vmov.u8 draw_mask_fb_ptr_b, #0
5681
5682 add fb_ptr, fb_ptr, #16*2
5683 beq 2f
5684
5685 1:
5686 and texture_block_ptr, texture_offset, texture_mask
5687 subs blocks_remaining, blocks_remaining, #4
5688
5689 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5690 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5691
5692 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5693 add texture_offset, texture_offset, #16
5694
5695 add fb_ptr, fb_ptr, #16*2
5696 bgt 1b
5697
5698 2:
5699 vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5700 vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5701
5702 and texture_block_ptr, texture_offset, texture_mask
5703 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5704
5705 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5706
5707 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5708 subs height, height, #1
5709
5710 add fb_ptr, fb_ptr, fb_ptr_pitch
5711 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5712
5713 bne 0b
5714
5715 ldmia sp!, { r4 - r11, pc }
5716
5717
// Register aliases for setup_sprite_untextured. Shared registers:
// right_width/block (r5), right_mask_bits/blocks_remaining (r6),
// color/color_b (r8), x/color_r (r1), y/color_g (r2).
f0931e56 5718#undef width
5719#undef right_width
5720#undef right_mask_bits
5721#undef color
5722#undef height
5723#undef blocks_remaining
5724#undef colors
5725#undef right_mask
5726#undef test_mask
5727#undef draw_mask
5728
5729#define psx_gpu r0
5730#define x r1
5731#define y r2
5732#define width r3
5733#define right_width r5
5734#define right_mask_bits r6
5735#define fb_ptr r7
5736#define color r8
5737#define height r9
5738#define fb_ptr_pitch r12
5739
5740// referenced by setup_sprites_16bpp_flush
5741#define num_blocks r4
5742#define block r5
5743#define block_width r11
5744
5745#define color_r r1
5746#define color_g r2
5747#define color_b r8
5748#define blocks_remaining r6
5749
// NEON aliases: test_mask and draw_mask are both q2 (test_mask is consumed
// before draw_mask is written).
5750#define colors q0
5751#define right_mask q1
5752#define test_mask q2
5753#define draw_mask q2
5754#define draw_mask_bits_fb_ptr d6
5755
5756
5757.align 3
5758
// setup_sprite_untextured
// In: r0 = psx_gpu, r1 = x, r2 = y; width, height and color are read from the
// caller's stack (sp+40/44/48 once nine registers are pushed).
// If none of RENDER_STATE_MASK_EVALUATE, RENDER_FLAGS_MODULATE_TEXELS and
// RENDER_FLAGS_BLEND is set in the halfword at psx_gpu_render_state_offset,
// and RENDER_INTERLACE_ENABLED is clear in the render mode byte, control
// tail-branches to setup_sprite_untextured_simple with registers and stack
// untouched. Otherwise one 64-byte block is queued per 8 pixels of every row:
//   block+0  : 16-byte draw mask (zero for interior blocks, right_mask for the
//              last block of a row)
//   block+16 : 8 copies of the 15-bit colour
//   block+40 : 8 bytes from d6, whose upper word is fb_ptr
5759function(setup_sprite_untextured)
5760 ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
5761 tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
5762 | RENDER_FLAGS_BLEND)
d5c08ed3 5763 ldreqb r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
5764 tsteq r12, #RENDER_INTERLACE_ENABLED
f0931e56 5765 beq setup_sprite_untextured_simple
5766
5767 stmdb sp!, { r4 - r11, r14 }
5768
5769 ldr width, [ sp, #40 ]
5770 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5771
5772 ldr height, [ sp, #44 ]
// fb_ptr = vram_out_ptr + y * 2048 + x * 2.
5773 add fb_ptr, fb_ptr, y, lsl #11
5774
5775 add fb_ptr, fb_ptr, x, lsl #1
// right_width = ((width - 1) & 7) + 1, i.e. 1..8 pixels in the last block.
5776 sub right_width, width, #1
5777
5778 ldr color, [ sp, #48 ]
5779 and right_width, #7
5780
// block_width = (width + 7) / 8 blocks per row.
5781 add block_width, width, #7
5782 add right_width, #1
5783
5784 lsr block_width, #3
5785 mov right_mask_bits, #0xff
5786
// fb_ptr_pitch = 2048 - (block_width - 1) * 16: fb_ptr advances 16 bytes after
// every block of a row except the last.
5787 sub fb_ptr_pitch, block_width, #1
5788 lsl right_mask_bits, right_width
5789
5790 lsl fb_ptr_pitch, #3+1
// Take the top 5 bits of each 8-bit channel (bits 7:3, 15:11, 23:19).
5791 ubfx color_r, color, #3, #5
5792
5793 rsb fb_ptr_pitch, #1024*2
5794 ubfx color_g, color, #11, #5
5795
// test_mask = the 16 bytes at offset 0 of psx_gpu.
// NOTE(review): presumably one distinct bit per 16-bit lane - confirm against
// the psx_gpu structure layout.
5796 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
5797 ubfx color_b, color, #19, #5
5798
5799 vdup.u16 right_mask, right_mask_bits
// color = r | g << 5 | b << 10.
5800 orr color, color_r, color_b, lsl #10
5801
5802 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5803 orr color, color, color_g, lsl #5
5804
// Each lane of right_mask becomes all ones where
// (0xff << right_width) & test_mask is non-zero, and zero elsewhere.
5805 vtst.u16 right_mask, right_mask, test_mask
5806 add block, psx_gpu, #psx_gpu_blocks_offset
5807
5808 vdup.u16 colors, color
5809 add block, block, num_blocks, lsl #6
5810
5811
5812setup_sprite_untextured_height_loop:
5813 add num_blocks, block_width
5814 sub blocks_remaining, block_width, #1
5815
// The flush helper preserves d0-d3, i.e. colors and right_mask.
5816 cmp num_blocks, #MAX_BLOCKS
5817 blgt setup_sprites_16bpp_flush
5818
// A single block per row goes straight to the right-edge block. In that case
// the two zeroing moves below are skipped, so the low word of d6 stored at
// block+40 is whatever it held before.
5819 cmp blocks_remaining, #0
5820 ble 1f
5821
5822 vmov.u8 draw_mask, #0 /* zero_mask */
5823 vmov.u8 draw_mask_bits_fb_ptr, #0
5824
5825 0:
5826 vst1.u32 { draw_mask }, [ block, :128 ]!
// The flags from this subs are consumed by the bgt 0b below.
5827 subs blocks_remaining, #1
5828
5829 vst1.u32 { colors }, [ block, :128 ]
5830 add block, block, #24
5831
5832 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5833 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5834
5835 add block, block, #24
5836 add fb_ptr, #8*2
5837 bgt 0b
5838
5839 1:
5840 vst1.u32 { right_mask }, [ block, :128 ]!
// The flags from this subs are consumed by the bgt at the bottom of the loop.
5841 subs height, #1
5842
5843 vst1.u32 { colors }, [ block, :128 ]
5844 add block, block, #24
5845
5846 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5847 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5848
5849 add block, block, #24
5850 add fb_ptr, fb_ptr_pitch
5851
5852 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5853 bgt setup_sprite_untextured_height_loop
5854
5855 ldmia sp!, { r4 - r11, pc }
5856
5857
5858
75e28f62
E
// Register aliases for update_texture_4bpp_cache.
5859#undef texture_page_ptr
5860#undef vram_ptr
5861#undef dirty_textures_mask
5862#undef current_texture_mask
5863
5864#define psx_gpu r0
5865#define current_texture_page r1
5866#define texture_page_ptr r2
5867#define vram_ptr_a r3
5868#define current_texture_page_x r12
5869#define current_texture_page_y r4
5870#define dirty_textures_mask r5
5871#define tile_y r6
5872#define tile_x r7
5873#define sub_y r8
5874#define current_texture_mask r9
5875#define c_4096 r10
5876#define vram_ptr_b r11
5877
// NEON aliases. texel_block_a/b are the two halves of q0. The _ab and _cd
// results reuse the registers of _b (q2) and _c (q3).
// texel_block_expanded_d is q4, which overlaps d8-d9; AAPCS treats d8-d15 as
// callee-saved, so any routine using this alias has to preserve q4.
5878#define texel_block_a d0
5879#define texel_block_b d1
5880#define texel_block_expanded_a q1
5881#define texel_block_expanded_b q2
5882#define texel_block_expanded_ab q2
5883#define texel_block_expanded_c q3
5884#define texel_block_expanded_d q4
5885#define texel_block_expanded_cd q3
5886
// update_texture_4bpp_cache
// In: r0 = psx_gpu.
//
// Expands the 4bpp texels of psx_gpu's current texture page into one byte per
// texel, written sequentially to the buffer at psx_gpu_texture_page_base_offset,
// and clears the current_texture_mask bits from the 4bpp dirty mask.
//
// Source: vram_ptr + page_y * (256 * 2048) + page_x * 128 bytes, with
// page_x = page & 0xF, page_y = page >> 4 and VRAM rows 2048 bytes apart.
// The page is walked as 16 x 16 tiles of 16 x 16 texels. Each tile is read as
// 16 rows of 8 bytes (two rows per inner iteration, through vram_ptr_a and
// vram_ptr_b) and stored as 256 consecutive bytes.
//
// All core and NEON registers used are restored on exit. q4 (d8-d9) is used as
// texel_block_expanded_d and is callee-saved under AAPCS, so it is saved along
// with q0-q3.
function(update_texture_4bpp_cache)
  stmdb sp!, { r4 - r11, r14 }
  vpush { q0 - q4 }

  ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
  ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]

  and current_texture_page_x, current_texture_page, #0xF
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  mov current_texture_page_y, current_texture_page, lsr #4
  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  // page_y << 19 = 256 rows of 2048 bytes
  add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
  mov tile_y, #16

  // page_x << 7 = 128 bytes per page column
  add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
  // this page is being refreshed now: drop its bits from the dirty mask
  bic dirty_textures_mask, current_texture_mask

  mov tile_x, #16
  str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  mov sub_y, #8
  movw c_4096, #4096

  // vram_ptr_b walks the odd rows, vram_ptr_a the even ones
  add vram_ptr_b, vram_ptr_a, #2048

 0:
  // 16 texels (8 bytes) from each of two consecutive rows; both pointers then
  // skip two rows (4096 bytes)
  vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
  vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096

  // For a source byte 0xXY: _a/_c lanes = 0x00XY, _b/_d lanes = 0x0XY0
  vmovl.u8 texel_block_expanded_a, texel_block_a
  vshll.u8 texel_block_expanded_b, texel_block_a, #4
  vmovl.u8 texel_block_expanded_c, texel_block_b
  vshll.u8 texel_block_expanded_d, texel_block_b, #4

  // Clear bits 7:4 of every lane: _a/_c = 0x000Y, _b/_d = 0x0X00
  vbic.u16 texel_block_expanded_a, #0x00F0
  vbic.u16 texel_block_expanded_b, #0x00F0
  vbic.u16 texel_block_expanded_c, #0x00F0
  vbic.u16 texel_block_expanded_d, #0x00F0

  // 0x0X0Y per lane: low nibble in the first byte, high nibble in the second
  vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
   texel_block_expanded_b
  vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
   texel_block_expanded_d

  vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
   [ texture_page_ptr, :256 ]!

  subs sub_y, sub_y, #1
  bne 0b

  // Tile finished: move 8 bytes (16 texels) to the right and back up 16 rows
  mov sub_y, #8
  add vram_ptr_a, vram_ptr_a, #8
  add vram_ptr_b, vram_ptr_b, #8

  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs tile_x, tile_x, #1
  bne 0b

  // Tile row finished: move down 16 rows and back to the left edge of the page
  // (16 tiles * 8 bytes)
  mov tile_x, #16
  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs tile_y, tile_y, #1
  bne 0b

  vpop { q0 - q4 }
  ldmia sp!, { r4 - r11, pc }
5963
5964
// Register aliases for update_texture_8bpp_cache_slice. texture_page (r1) is
// the incoming argument; current_texture_page moves to r5 here.
5965#undef current_texture_page
5966
5967#define psx_gpu r0
5968#define texture_page r1
5969#define texture_page_ptr r2
5970#define vram_ptr_a r3
5971#define texture_page_x r12
5972#define texture_page_y r4
5973#define current_texture_page r5
5974#define tile_y r6
5975#define tile_x r7
5976#define sub_y r8
5977#define c_4096 r10
5978#define vram_ptr_b r11
5979
5980
5981#undef texels_a
5982#undef texels_b
5983
// One 16-byte texel row per q register.
5984#define texels_a q0
5985#define texels_b q1
5986#define texels_c q2
5987#define texels_d q3
5988
5989
/*
 * update_texture_8bpp_cache_slice
 * In: r0 = psx_gpu, r1 = texture_page.
 *
 * Copies one 128-byte wide, 256-row slice of VRAM into the texture cache at
 * psx_gpu_texture_page_base_offset. The slice starts at
 *   vram_ptr + (texture_page >> 4) * (256 * 2048) + (texture_page & 0xF) * 128
 * and is walked as 8 x 16 tiles of 16 x 16 texels (16 bytes per row). Each
 * tile is stored as 256 consecutive bytes, and after every row of 8 tiles the
 * destination skips another 8 * 16 * 16 bytes. When texture_page and psx_gpu's
 * current texture page differ in bit 0, the destination starts 8 * 16 * 16
 * bytes further in, so the two slices interleave per tile row.
 *
 * All core and NEON registers used are restored on exit.
 */
function(update_texture_8bpp_cache_slice)
  stmdb sp!, { r4 - r11, r14 }
  vpush { q0 - q3 }

  /* state fetched from psx_gpu */
  ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
  ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]

  /* source address of the slice; vram_ptr_b walks the odd rows */
  and texture_page_x, texture_page, #0xF
  mov texture_page_y, texture_page, lsr #4
  add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
  add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
  add vram_ptr_b, vram_ptr_a, #2048

  /* bit 0 of (current page ^ requested page) selects the second half-row */
  eor current_texture_page, current_texture_page, texture_page
  ands current_texture_page, current_texture_page, #0x1
  addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)

  /* loop counters and the two-row stride */
  movw c_4096, #4096
  mov tile_y, #16
  mov tile_x, #8
  mov sub_y, #4

 0:
  /* rows 0/1 then rows 2/3 of the current group; each pointer skips two rows
     per load */
  vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
  vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
  vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
  vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096

  vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
  vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!

  subs sub_y, sub_y, #1
  bne 0b

  /* tile done: back up 16 rows, then step 16 bytes to the right */
  mov sub_y, #4

  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)

  add vram_ptr_a, vram_ptr_a, #16
  add vram_ptr_b, vram_ptr_b, #16

  subs tile_x, tile_x, #1
  bne 0b

  /* tile row done: return to the left edge (8 tiles * 16 bytes), move down 16
     rows, and skip the destination area that belongs to the other slice */
  mov tile_x, #8

  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)

  add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
  subs tile_y, tile_y, #1

  bne 0b

  vpop { q0 - q3 }
  ldmia sp!, { r4 - r11, pc }
6055
50f9355a 6056
/*
 * void scale2x_tiles8(void *dst, const void *src, int w8, int h)
 *
 * Doubles a 16-bit-per-pixel image in both directions.
 *   r0 = dst, r1 = src, r2 = w8 (source width in 8-pixel tiles), r3 = h (rows)
 * Both buffers use a 2048-byte line pitch. Every source row produces two
 * identical destination rows of w8 * 16 pixels.
 * The inner loop always loads two tiles; for an odd w8 the second one is
 * loaded but not stored.
 * Clobbers r0-r3, ip, q0-q3 and the flags; r4 is preserved.
 */
function(scale2x_tiles8)
  stmdb sp!, { r4, lr }

  mov r4, r1                     /* r4 = start of the current source row */
  mov lr, r2                     /* lr = tiles left in this row */
  add ip, r0, #2048              /* ip = second destination row */

0:
  /* first tile: duplicate every pixel horizontally */
  vld1.u16 { q0 }, [ r1, :128 ]!
  vmov q1, q0
  vzip.16 q0, q1

  /* second tile, same treatment */
  vld1.u16 { q2 }, [ r1, :128 ]!
  vmov q3, q2
  vzip.16 q2, q3

  subs lr, lr, #2
  vst1.u16 { q0, q1 }, [ r0, :128 ]!
  vst1.u16 { q0, q1 }, [ ip, :128 ]!
  blt 1f                         /* odd w8: the second tile is past the row end */
  vst1.u16 { q2, q3 }, [ r0, :128 ]!
  vst1.u16 { q2, q3 }, [ ip, :128 ]!
  bgt 0b
1:
  /* next source row; dst rewinds by w8 * 32 bytes and drops two lines */
  add r4, r4, #2048
  mov lr, r2
  mov r1, r4
  add r0, r0, #4096
  sub r0, r0, r2, lsl #5
  add ip, r0, #2048
  subs r3, r3, #1
  bgt 0b
  nop

  ldmia sp!, { r4, pc }
59d15d23 6091
6092// vim:filetype=armasm