cdrom: attempt to deal with broken subq :(
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
/* Capacity limits. MAX_SPANS is the value setup_spans_up_down compares the
 * accumulated span count against before storing psx_gpu->num_spans. */
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
/* Flag bit values. NOTE(review): none of these is referenced by the gradient
 * and span setup routines; presumably they mirror the C-side definitions and
 * are used by other routines - confirm. */
f0931e56 20#define RENDER_STATE_MASK_EVALUATE 0x20
21#define RENDER_FLAGS_MODULATE_TEXELS 0x1
22#define RENDER_FLAGS_BLEND 0x2
d5c08ed3 23#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 24
cb88320b 25#include "psx_gpu_offsets.h"
75e28f62 26
/* b_dx is addressed relative to b_block_span: 4 bytes after it. */
cb88320b 27#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 28
75e28f62
E
/* Byte offsets of the four 16-bit fields of one 8-byte span edge record.
 * The order matches what the vst4.u16 store in
 * setup_spans_set_x4_alternate_* writes: clamped left x, width in 8-wide
 * blocks ((right - left + 7) >> 3), right-edge mask, and y. */
29#define edge_data_left_x_offset 0
30#define edge_data_num_blocks_offset 2
31#define edge_data_right_mask_offset 4
32#define edge_data_y_offset 6
33
34
/* Scalar register roles for compute_all_gradients.
 *
 * Several names share a register on purpose. Each packed pair (x0_x1, x1_x2,
 * y0_y1, y1_y2, b0_b1, b1_b2) overwrites one of the values it is built from,
 * and b0 reuses the register that held y2. The instruction order in the
 * routine depends on these overlaps. */
35#define psx_gpu r0
36#define v_a r1
37#define v_b r2
38#define v_c r3
39
40#define x0 r4
41#define x1 r5
42#define x2 r6
43#define x0_x1 r5
44#define x1_x2 r6
45#define y0 r7
46#define y1 r8
47#define y2 r9
48#define y0_y1 r7
49#define y1_y2 r8
50#define b0 r9
51#define b1 r10
52#define b2 r11
53#define b0_b1 r10
54#define b1_b2 r11
55
56
/* Names for the second half of the routine, once the packed coordinates are
 * no longer needed: area_r_s receives the low word of the area reciprocal,
 * g_* / ga_* are the blue gradients and their magnitudes, gs_* their sign
 * masks, gw_*_h / gw_*_l the high/low words of the 64-bit products.
 * store_a / store_b reuse r0 / r1, so psx_gpu and v_a are gone once the
 * store pointers have been set up. */
57#define area_r_s r5
58
59#define g_bx0 r2
60#define g_bx r3
61#define g_bx2 r4
62#define g_bx3 r5
63#define b_base r6
64#define g_by r8
65
66#define gs_bx r7
67#define gs_by r10
68
69#define ga_bx g_bx
70#define ga_by g_by
71
72#define gw_bx_h g_bx
73#define gw_by_h g_by
74
75#define gw_bx_l r11
76#define gw_by_l gw_bx_l
77
78#define store_a r0
79#define store_b r1
80#define store_inc r5
81
82
/* NEON register roles for the first half of compute_all_gradients.
 *
 * v0-v2 hold the three raw 16-byte vertices. For each vertex n, xn_ab
 * (q3-q5) is laid out as { uvrg, xxxx } and yn_ab (q6-q8) as { yyyy, uvrg },
 * so a single 8-lane 16-bit subtract produces an attribute delta and a
 * coordinate delta together. d0_ab..d3_ab (q9-q12) receive those deltas and
 * correspond to d0..d3 in the gradient formula quoted in the routine. */
83#define v0 q0
84#define uvrgb0 d0
85#define x0_y0 d1
86
87#define v1 q1
88#define uvrgb1 d2
89#define x1_y1 d3
90
91#define v2 q2
92#define uvrgb2 d4
93#define x2_y2 d5
94
95#define x0_ab q3
96#define uvrg_xxxx0 q3
97#define uvrg0 d6
98#define xxxx0 d7
99
100#define x1_ab q4
101#define uvrg_xxxx1 q4
102#define uvrg1 d8
103#define xxxx1 d9
104
105#define x2_ab q5
106#define uvrg_xxxx2 q5
107#define uvrg2 d10
108#define xxxx2 d11
109
110#define y0_ab q6
111#define yyyy_uvrg0 q6
112#define yyyy0 d12
113#define uvrg0b d13
114
115#define y1_ab q7
116#define yyyy_uvrg1 q7
117#define yyyy1 d14
118#define uvrg1b d15
119
120#define y2_ab q8
121#define yyyy_uvrg2 q8
122#define yyyy2 d16
123#define uvrg2b d17
124
125#define d0_ab q9
126#define d0_a d18
127#define d0_b d19
128
129#define d1_ab q10
130#define d1_a d20
131#define d1_b d21
132
133#define d2_ab q11
134#define d2_a d22
135#define d2_b d23
136
137#define d3_ab q12
138#define d3_a d24
139#define d3_b d25
140
/* NEON register roles for the gradient results of compute_all_gradients.
 *
 * These names overlap the vertex and delta registers (q1, q4, q9, q10, q11,
 * ...) and are only meaningful once the earlier contents are dead.
 *   ga_*  gradient magnitudes (numerators, later made absolute)
 *   gs_*  per-lane sign masks
 *   gw_*  64-bit products of magnitude and area reciprocal
 *   g_*   final signed gradients
 * uvrg_dx0..uvrg_dx3 (q0-q3) hold 0x, 1x, 2x and 3x the x gradient; note
 * that uvrg_dx1 is the same register as g_uvrg_x. */
141#define ga_uvrg_x q1
142#define ga_uvrg_y q4
143
144#define dx x0_x1
145#define dy y0_y1
146#define db b0_b1
147
148#define uvrg_base q11
149
150#define gs_uvrg_x q5
151#define gs_uvrg_y q6
152
153#define g_uvrg_x q1
154#define ga_uv_x d2
155#define g_uv_x d2
156#define ga_rg_x d3
157#define g_rg_x d3
158
159#define g_uvrg_y q4
160#define ga_uv_y d8
161#define g_uv_y d8
162#define ga_rg_y d9
163#define g_rg_y d9
164
165#define gw_uv_x q1
166#define gw_rg_x q2
167#define gw_uv_y q4
168#define gw_rg_y q3
169
170#define w_mask q9
171#define w_mask_l d18
172
173#define r_shift q10
174
175#define uvrg_dx0 q0
176#define uvrg_dx0l d0
177#define uvrg_dx0h d1
178
179#define uvrg_dx1 q1
180#define uvrg_dx1l d2
181#define uvrg_dx1h d3
182
183#define uvrg_dx2 q2
184#define uvrg_dx2l d4
185#define uvrg_dx2h d5
186
187#define uvrg_dx3 q3
188#define uvrg_dx3l d6
189#define uvrg_dx3h d7
190
c6063f89 191#define uvrgb_phase q13
75e28f62
E
192
193.align 4
194
/* load_pointer(register, pointer): put the address of the symbol `pointer`
 * into `register`.
 *
 * Non-PIC builds materialise the address with a movw/movt immediate pair.
 * PIC builds use the `ldr register, =pointer` pseudo-instruction, which
 * loads the address from a literal pool; the .pool directives in this file
 * give the assembler places to emit that pool. */
5d834c08 195/* FIXME: users of this should be in psx_gpu instead */
196#ifndef __PIC__
197#define load_pointer(register, pointer) \
198 movw register, :lower16:pointer; \
199 movt register, :upper16:pointer; \
200
201#else
202#define load_pointer(register, pointer) \
203 ldr register, =pointer \
204
205#endif
206
75e28f62
E
/* function(name): export `name` as a global symbol and place its label at
 * the current location. */
207#define function(name) \
208 .global name; \
209 name: \
210
211@ r0: psx_gpu
212@ r1: v_a
213@ r2: v_b
214@ r3: v_c
215
// compute_all_gradients(psx_gpu, v_a, v_b, v_c)
//
// Derives the x and y gradients of the u, v, r, g attributes (NEON) and of
// b (scalar) from the three vertices and psx_gpu->triangle_area.
//
// Each gradient is computed as |numerator| * (1 / triangle_area), shifted
// into fixed point, and then conditionally negated. The negate mask is the
// numerator's sign XORed with -(psx_gpu->triangle_winding).
//
// Reads:  psx_gpu->triangle_area, psx_gpu->uvrgb_phase,
//         psx_gpu->triangle_winding, and 16 bytes at each of v_a, v_b, v_c
//         (loaded with a 128-bit alignment hint).
// Writes, in ascending address order from psx_gpu + psx_gpu_uvrg_offset:
//         - uvrg base: (uvrg0 << 16) + phase - x0 * x gradient
//         - uvrg x gradient
//         - uvrg y gradient
//         - x gradient times 0, 1, 2, 3 for u and v, then for r and g
//           (interleaved by the two vst4 stores)
//         - six words { 0, b_dx, 2 * b_dx, 3 * b_dx, b base, b_dy }
// Saves and restores r4-r11; returns by popping the saved lr into pc.
216function(compute_all_gradients)
217 // First compute the triangle area reciprocal and shift. The division will
218 // happen concurrently with much of the work which follows.
219 @ r12 = psx_gpu->triangle_area
220 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
221 stmdb sp!, { r4 - r11, lr }
222
223 @ load exponent of 62 into upper half of double
224 movw r4, #0
225 clz r14, r12 @ r14 = shift
226
227 movt r4, #((62 + 1023) << 4)
228 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
229
230 @ load area normalized into lower half of double
231 mov r5, r12, lsr #10
232 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
233
234 movt r4, #((1022 + 31) << 4)
235 mov r5, r12, lsl #20
236
237 add r4, r4, r12, lsr #11
238 vmov.f64 d31, r5, r4
239
240 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
241
242 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
243 // ( d0 * d1 ) - ( d2 * d3 ) =
244 // ( m0 ) - ( m1 ) = gradient
245
246 // This is split to do 12 elements at a time over three sets: a, b, and c.
247 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
248 // two of the slots are unused.
249
250 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
251 // is g.
252
253 // First type is: uvrg bxxx xxxx
254 // Second type is: yyyy ybyy uvrg
255 // Since x_a and y_c are the same the same variable is used for both.
256
257 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
258 ldrsh x0, [ v_a, #8 ] @ load x0
259
260 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
261 ldrh x1, [ v_b, #8 ] @ load x1
262
263 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
264 ldrh x2, [ v_c, #8 ] @ load x2
265
266 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
267 ldrh y0, [ v_a, #10 ] @ load y0
268
269 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
270 ldrh y1, [ v_b, #10 ] @ load y1
271
272 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
273 ldrh y2, [ v_c, #10 ] @ load y2
274
275 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
276 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
277
278 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
279 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
280
281 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
282 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
283
284 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
285 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
286
287 ldrb b2, [ v_c, #4 ] @ load b2
288 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
289
290 ldrb b1, [ v_b, #4 ] @ load b1
291 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
292
293 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
294 vsub.s16 d0_ab, x1_ab, x0_ab
295
296 ldrb b0, [ v_a, #4 ] @ load b0
297 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
298
299 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
300 vsub.s16 d2_ab, x2_ab, x1_ab
301
302 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
303 vsub.s16 d1_ab, y2_ab, y1_ab
304
305 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
306 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
307
308 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
309 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
310
311 vsub.s16 d3_ab, y1_ab, y0_ab
312 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
313 @ ((x2 - x1) * (b1 - b0))
314 vmull.s16 ga_uvrg_x, d0_a, d1_a
315 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
316 @ ((b2 - b1) * (y1 - y0))
317 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
318 movs gs_bx, ga_bx, asr #31
319
320 vmull.s16 ga_uvrg_y, d0_b, d1_b
321 rsbmi ga_bx, ga_bx, #0
322
c6063f89 323 @ r12 = psx_gpu->uvrgb_phase
324 ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
325
75e28f62
E
326 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
327 movs gs_by, ga_by, asr #31
328
329 vshr.u64 d0, d30, #22
c6063f89 330 add b_base, r12, b0, lsl #16
331
332 vdup.u32 uvrgb_phase, r12
75e28f62
E
333
334 rsbmi ga_by, ga_by, #0
335 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
336
337 @ r12 = psx_gpu->triangle_winding_offset
338 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
339 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
340
75e28f62
E
341 rsb r12, r12, #0 @ r12 = -(triangle->winding)
342
343 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
344 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
345
346 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
347 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
348
c6063f89 349 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
350 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
351
352 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
353 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
354
355 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
356 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
357 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
358 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
359
360 vshl.u64 gw_rg_x, gw_rg_x, r_shift
361 vshl.u64 gw_uv_x, gw_uv_x, r_shift
362 vshl.u64 gw_rg_y, gw_rg_y, r_shift
363 vshl.u64 gw_uv_y, gw_uv_y, r_shift
364
365 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
366 vmovn.u64 g_uv_x, gw_uv_x
367
368 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
369 vmovn.u64 g_rg_x, gw_rg_x
370
371 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
372 vmovn.u64 g_uv_y, gw_uv_y
373
374 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
375 vmovn.u64 g_rg_y, gw_rg_y
376
377 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
378 mov ga_bx, ga_bx, lsl #13
379
380 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
381 mov ga_by, ga_by, lsl #13
382
383 vdup.u32 x0_y0, x0
384 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
385
386 vshl.u32 g_uvrg_x, g_uvrg_x, #4
387 vshl.u32 g_uvrg_y, g_uvrg_y, #4
388
389 umull gw_by_l, gw_by_h, ga_by, area_r_s
390 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
391
392 eor gs_bx, gs_bx, r12
393 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
394
395 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
396 eor gs_by, gs_by, r12
397
398 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
399 add store_a, psx_gpu, #psx_gpu_uvrg_offset
400
401 sub r11, r11, #(32 - 13)
402
403 add store_b, store_a, #16
404 mov store_inc, #32
405
406 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
407 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
408
409 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
410 mov g_bx, gw_bx_h, lsr r11
411
412 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
413 mov g_by, gw_by_h, lsr r11
414
415 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
416 [ store_b, : 128 ], store_inc
417 eor g_bx, g_bx, gs_bx
418
419 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
420 [ store_b, : 128 ], store_inc
421 sub g_bx, g_bx, gs_bx
422
423 lsl g_bx, g_bx, #4
424 eor g_by, g_by, gs_by
425
426 mls b_base, g_bx, x0, b_base
427 sub g_by, g_by, gs_by
428
429 lsl g_by, g_by, #4
430 mov g_bx0, #0
431
432 add g_bx2, g_bx, g_bx
433 add g_bx3, g_bx, g_bx2
434
435 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
436
437 ldmia sp!, { r4 - r11, pc }
438
439
/* Scalar register roles for the setup_spans_* routines.
 *
 * Overlaps to keep in mind when reading the macros:
 *   - y_a / y_b / y_c replace the vertex pointers v_a / v_b / v_c (r1-r3).
 *   - span_edge_data / span_uvrg_offset / span_b_offset reuse r4-r6, which
 *     hold x_a / x_b / x_c and later the alternate-edge words.
 *   - edge_dx_dy_alt_low / _high reuse edge_alt_low / _high (r4 / r5).
 *   - height and height_major are both r9; temp and clip are both r14.
 *   - edge_shift_alt reuses reciprocal_table_ptr (r10). */
440#define psx_gpu r0
441#define v_a r1
442#define v_b r2
443#define v_c r3
444
445#define temp r14
446
447#define x_a r4
448#define x_b r5
449#define x_c r6
450#define y_a r1
451#define y_b r2
452#define y_c r3
453
454#define height_minor_a r7
455#define height_minor_b r8
456#define height_major r9
457#define height r9
458
459#define reciprocal_table_ptr r10
460
461#define edge_alt_low r4
462#define edge_alt_high r5
463#define edge_dx_dy_alt r6
464#define edge_shift_alt r10
465
466#define edge_dx_dy_alt_low r4
467#define edge_dx_dy_alt_high r5
468
469#define span_edge_data r4
470#define span_uvrg_offset r5
471#define span_b_offset r6
472
473#define clip r14
474
475#define b r11
476#define b_dy r12
477
478
/* NEON register roles for the setup_spans_* routines.
 *
 * Many names refer to the same register at different stages, for example:
 *   - q0-q2 are edges_xy / edges_dx_dy + edge_shifts / edge_shifts_64 during
 *     edge setup and alternate_x / alternate_dx_dy / alternate_x_32 in the
 *     span loop.
 *   - q3 is v_clip, { height_reciprocals, heights } and left_x.
 *   - q11 is both left_x_32 and left_right_x_16.
 *   - d24 is both alternate_select and span_shifts.
 *   - c_0x0001 (q13) occupies the same storage as c_0xFFFE (d26) and
 *     c_0x0007 (d27). */
479#define alternate_x q0
480#define alternate_dx_dy q1
481#define alternate_x_32 q2
482
483#define alternate_x_low d0
484#define alternate_x_high d1
485#define alternate_dx_dy_low d2
486#define alternate_dx_dy_high d3
487#define alternate_x_32_low d4
488#define alternate_x_32_high d5
489
490#define left_x q3
491#define right_x q4
492#define left_dx_dy q5
493#define right_dx_dy q6
494#define left_edge q7
495#define right_edge q8
496
497#define left_x_low d6
498#define left_x_high d7
499#define right_x_low d8
500#define right_x_high d9
501#define left_dx_dy_low d10
502#define left_dx_dy_high d11
503#define right_dx_dy_low d12
504#define right_dx_dy_high d13
505#define left_edge_low d14
506#define left_edge_high d15
507#define right_edge_low d16
508#define right_edge_high d17
509
510#define y_mid_point d18
511#define c_0x0004 d19
512
513#define left_right_x_16 q11
514#define span_shifts_y q12
515#define c_0x0001 q13
516
517#define span_shifts d24
518#define y_x4 d25
519#define c_0xFFFE d26
520#define c_0x0007 d27
521
522#define left_right_x_16_low d22
523#define left_right_x_16_high d23
524
525#define uvrg q14
526#define uvrg_dy q15
527
528#define alternate_x_16 d4
529
530#define v_clip q3
531#define v_clip_low d6
532
533#define right_x_32 q10
534#define left_x_32 q11
535#define alternate_select d24
536
537#define right_x_32_low d20
538#define right_x_32_high d21
539#define left_x_32_low d22
540#define left_x_32_high d23
541
542#define edges_xy q0
543#define edges_dx_dy d2
544#define edge_shifts d3
545#define edge_shifts_64 q2
546
547#define edges_xy_left d0
548#define edges_xy_right d1
549
550#define height_reciprocals d6
551#define heights d7
552
553#define widths d8
554#define c_0x01 d9
555#define x_starts d10
556#define x_ends d11
557
558#define heights_b d12
559#define edges_dx_dy_64 q10
560
561#define edges_dx_dy_64_left d20
562#define edges_dx_dy_64_right d21
563
564
/* setup_spans_prologue(): common entry sequence of every setup_spans_*
 * routine.
 *
 * - Pushes r4-r11 and lr (undone by setup_spans_epilogue).
 * - Loads the signed 16-bit x (offset 8) and y (offset 10) of the three
 *   vertices. The y loads overwrite r1-r3, so the vertex pointers are no
 *   longer available afterwards.
 * - Loads uvrg and uvrg_dy from psx_gpu, using temp as the address register.
 * - Loads the address of reciprocal_table into reciprocal_table_ptr.
 * - Sets every 32-bit lane of c_0x01 to 1. */
565#define setup_spans_prologue() \
566 stmdb sp!, { r4 - r11, lr }; \
567 \
568 ldrsh x_a, [ v_a, #8 ]; \
569 ldrsh x_b, [ v_b, #8 ]; \
570 ldrsh x_c, [ v_c, #8 ]; \
571 ldrsh y_a, [ v_a, #10 ]; \
572 ldrsh y_b, [ v_b, #10 ]; \
573 ldrsh y_c, [ v_c, #10 ]; \
574 \
575 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
576 vld1.32 { uvrg }, [ temp ]; \
577 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
578 vld1.32 { uvrg_dy }, [ temp ]; \
5d834c08 579 load_pointer(reciprocal_table_ptr, reciprocal_table); \
75e28f62
E
580 \
581 vmov.u32 c_0x01, #0x01 \
582
/* setup_spans_load_b(): load the scalar blue interpolant and its per-line
 * step from psx_gpu into b (r11) and b_dy (r12). */
583#define setup_spans_load_b() \
584 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
585 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
586
/* setup_spans_prologue_b(): prepare the output pointers and constants used
 * by the span-writing loop (setup_spans_set_x4_alternate_*).
 *
 * - Points span_uvrg_offset, span_edge_data and span_b_offset at their
 *   arrays inside psx_gpu. These reuse r4-r6, so x_a / x_b / x_c and the
 *   scalar alternate-edge words must already have been consumed.
 * - left_edge  = viewport_start_x replicated into every 16-bit lane.
 * - right_edge = viewport_end_x + 1 replicated into every 16-bit lane.
 * - c_0x0004 = 4 per 16-bit lane; c_0x0007 = 7 and c_0xFFFE = 0xFFFE per
 *   16-bit lane.
 * c_0x0001 (q13) is only needed for the right_edge increment; the final two
 * instructions overwrite it, because c_0x0007 / c_0xFFFE are its two halves.
 * Clobbers temp. */
587#define setup_spans_prologue_b() \
588 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
589 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
590 \
591 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
592 vmov.u16 c_0x0004, #0x0004; \
593 \
594 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
595 vmov.u16 c_0x0001, #0x0001; \
596 \
597 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
598 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
599 \
600 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
601 vadd.u16 right_edge, right_edge, c_0x0001; \
602 \
603 vmov.u16 c_0x0007, #0x0007; \
604 vmvn.u16 c_0xFFFE, #0x0001 \
605
606
/* compute_edge_delta_x2(): set up two edges that share one height.
 *
 * Inputs:  height (scalar), x_starts / x_ends (one 32-bit lane per edge),
 *          reciprocal_table_ptr, c_0x01.
 * Outputs: edges_dx_dy = (x_end - x_start) * reciprocal        (32-bit)
 *          edges_xy    = (x_start * height + height - 1) * reciprocal
 *                                                              (64-bit)
 *          edge_shifts = the table word with bits 5-7 of each halfword
 *                        cleared.
 * The table word for `height` carries the reciprocal in its upper bits
 * (word >> 10). The low bits are the shift later applied with a register
 * VSHL, which only looks at the low byte of each lane; clearing bits 5-7
 * therefore leaves a 0-31 shift count.
 * Clobbers temp, heights, widths, heights_b, height_reciprocals. */
607#define compute_edge_delta_x2() \
608 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
609 \
610 vdup.u32 heights, height; \
611 vsub.u32 widths, x_ends, x_starts; \
612 \
613 vdup.u32 edge_shifts, temp; \
614 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 615 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
616 \
617 vmla.s32 heights_b, x_starts, heights; \
618 vbic.u16 edge_shifts, #0xE0; \
619 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
620 vmull.s32 edges_xy, heights_b, height_reciprocals \
621
/* compute_edge_delta_x3(start_c, height_a, height_b): set up three edges.
 *
 * Two edges are handled in NEON exactly as in compute_edge_delta_x2, except
 * that lane 0 uses height_a and lane 1 uses height_b (each with its own
 * reciprocal-table word). The third ("alternate") edge is handled with
 * scalar instructions interleaved between the NEON ones:
 *   - it always uses height_minor_b, regardless of the macro arguments;
 *   - it starts at x = start_c and ends at x_c;
 *   - results: edge_dx_dy_alt (32-bit), edge_alt_high:edge_alt_low (64-bit
 *     signed product) and edge_shift_alt (table word & 0x1F).
 * The scratch aliases below reuse r6 / r11 / r12. edge_shift_alt overwrites
 * reciprocal_table_ptr (r10), edge_dx_dy_alt overwrites x_c (r6), and the
 * final smull overwrites x_a / x_b (r4 / r5), so none of those are available
 * after this macro. */
622#define width_alt r6
623#define height_reciprocal_alt r11
624#define height_b_alt r12
625
626#define compute_edge_delta_x3(start_c, height_a, height_b) \
627 vmov.u32 heights, height_a, height_b; \
628 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
629 vmov.u32 edge_shifts[0], temp; \
630 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
631 vmov.u32 edge_shifts[1], temp; \
632 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
633 \
634 vsub.u32 widths, x_ends, x_starts; \
635 sub width_alt, x_c, start_c; \
636 \
637 vsub.u32 heights_b, heights, c_0x01; \
638 sub height_b_alt, height_minor_b, #1; \
639 \
7d5140f5
E
640 vshr.u32 height_reciprocals, edge_shifts, #10; \
641 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
642 \
643 vmla.s32 heights_b, x_starts, heights; \
644 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
645 \
646 vbic.u16 edge_shifts, #0xE0; \
647 and edge_shift_alt, edge_shift_alt, #0x1F; \
648 \
649 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
650 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
651 \
652 vmull.s32 edges_xy, heights_b, height_reciprocals; \
653 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
654
655
/* Per-step adjusters, selected elsewhere by token pasting on the direction
 * (up / down).
 *
 * setup_spans_adjust_y_*: move the line numbers packed in y_x4 by c_0x0004,
 * which setup_spans_prologue_b fills with 4 in every 16-bit lane. "up"
 * subtracts, "down" adds.
 *
 * setup_spans_adjust_interpolants_*: move uvrg by uvrg_dy and b by b_dy.
 * "up" subtracts, "down" adds. */
656#define setup_spans_adjust_y_up() \
657 vsub.u32 y_x4, y_x4, c_0x0004 \
658
659#define setup_spans_adjust_y_down() \
660 vadd.u32 y_x4, y_x4, c_0x0004 \
661
662#define setup_spans_adjust_interpolants_up() \
663 vsub.u32 uvrg, uvrg, uvrg_dy; \
664 sub b, b, b_dy \
665
666#define setup_spans_adjust_interpolants_down() \
667 vadd.u32 uvrg, uvrg, uvrg_dy; \
668 add b, b, b_dy \
669
670
/* setup_spans_clip(direction, alternate_active): advance all per-line state
 * by `clip` lines in one step.
 *
 *   direction        increment: b += b_dy * clip, uvrg += uvrg_dy * clip
 *                    decrement: b -= b_dy * clip, uvrg -= uvrg_dy * clip
 *   alternate_active yes: the scalar alternate edge is advanced too,
 *                         edge_alt (64-bit) += edge_dx_dy_alt * clip
 *                    no:  nothing extra
 * In every case edges_xy += edges_dx_dy * clip (64-bit accumulate per edge).
 * Clobbers v_clip (q3), which aliases left_x and the heights /
 * height_reciprocals pair. */
671#define setup_spans_clip_interpolants_increment() \
672 mla b, b_dy, clip, b; \
673 vmla.s32 uvrg, uvrg_dy, v_clip \
674
675#define setup_spans_clip_interpolants_decrement() \
676 mls b, b_dy, clip, b; \
677 vmls.s32 uvrg, uvrg_dy, v_clip \
678
679#define setup_spans_clip_alternate_yes() \
680 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
681
682#define setup_spans_clip_alternate_no() \
683
684#define setup_spans_clip(direction, alternate_active) \
685 vdup.u32 v_clip, clip; \
686 setup_spans_clip_alternate_##alternate_active(); \
687 setup_spans_clip_interpolants_##direction(); \
688 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
689
690
/* setup_spans_adjust_edges_alternate_no(left_index, right_index): turn the
 * two NEON edges into the left / right steppers used by the span loop.
 *
 * left_index / right_index are the tokens `left` or `right`; they are pasted
 * onto edges_xy_ / edges_dx_dy_64_ to choose which of the two computed edges
 * feeds the left side and which the right side.
 *
 * Steps:
 *   1. Sign-extend edge_shifts and edges_dx_dy to 64 bits and shift both
 *      edges_xy and the deltas left by each edge's own shift.
 *   2. left_x_low / right_x_low = x for the current line;
 *      left_x_high / right_x_high = x for the following line (low + delta).
 *   3. left_dx_dy / right_dx_dy hold the delta in both halves and are then
 *      doubled, so one 128-bit add advances both lines by two. */
691#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
692 vmovl.s32 edge_shifts_64, edge_shifts; \
693 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
694 \
695 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
696 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
697 \
698 vmov left_x_low, edges_xy_##left_index; \
699 vmov right_x_low, edges_xy_##right_index; \
700 \
701 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
702 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
703 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
704 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
705 \
706 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
707 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
708 \
709 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
710 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
711
712
/* setup_spans_adjust_edges_alternate_yes(left_index, right_index): as the
 * _no variant, plus the third (alternate) edge.
 *
 * After the two NEON edges are prepared:
 *   - y_mid_point = y_b replicated into every 16-bit lane; the span loop
 *     compares each line's y against it to decide when to use the alternate
 *     edge.
 *   - The scalar 64-bit value edge_alt_high:edge_alt_low is shifted left by
 *     edge_shift_alt (temp = 32 - shift supplies the bits carried from the
 *     low word into the high word) and moved into alternate_x_low.
 *   - edge_dx_dy_alt is sign-extended to 64 bits and shifted the same way.
 *     Its two result words reuse r4 / r5, which is safe because the edge
 *     value has already been moved to NEON.
 *   - alternate_x_high = x for the following line; alternate_dx_dy is
 *     doubled, mirroring the left / right steppers.
 * Clobbers temp. */
713#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
714 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
715 \
716 vdup.u16 y_mid_point, y_b; \
717 rsb temp, edge_shift_alt, #32; \
718 \
719 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
720 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
721 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
722 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
723 \
724 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
725 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
726 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
727 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
728 \
729 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
730 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
731
732
/* Alternate-edge selection helpers for setup_spans_set_x4_alternate_yes.
 *
 * setup_spans_y_select_*: build a per-line mask in alternate_select from a
 * signed 16-bit compare of y_x4 against y_mid_point. "up" selects lines with
 * y < y_mid_point, "down" selects lines with y > y_mid_point.
 *
 * setup_spans_alternate_select_*: where the mask is set, replace the left x
 * (low half of left_right_x_16) or the right x (high half) with the
 * alternate edge's x. */
733#define setup_spans_y_select_up() \
734 vclt.s16 alternate_select, y_x4, y_mid_point \
735
736#define setup_spans_y_select_down() \
737 vcgt.s16 alternate_select, y_x4, y_mid_point \
738
739
740#define setup_spans_alternate_select_left() \
741 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
742
743#define setup_spans_alternate_select_right() \
744 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
745
746
/* setup_spans_set_x4_alternate_yes(alternate, direction): emit four spans
 * (one loop iteration), with a third edge that takes over one side on the
 * lines selected by setup_spans_y_select_<direction>.
 *
 *   alternate  left or right: which side the alternate edge replaces
 *   direction  up or down:    stepping direction for y and the interpolants
 *
 * Per iteration:
 *   1. Take the upper 32 bits of the 64-bit left / right / alternate x for
 *      two lines, advance the steppers, and do the same for two more lines.
 *   2. Narrow to 16 bits and merge in the alternate x where selected.
 *   3. Store uvrg (16 bytes) and b (4 bytes) for each of the four lines,
 *      stepping the interpolants after every store.
 *   4. Clamp both x values to [left_edge, right_edge].
 *   5. width = right - left; t = width + 7;
 *      num_blocks = t >> 3; mask = 0xFFFE << (t & 7).
 *   6. Store four { left_x, num_blocks, mask, y } records with one vst4.u16
 *      and step y_x4.
 * The vector work is interleaved with the stores; d24 serves first as
 * alternate_select and afterwards as span_shifts. Four spans are always
 * written, whatever the remaining count in the caller's loop. */
747#define setup_spans_set_x4_alternate_yes(alternate, direction) \
748 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
749 vshrn.s64 left_x_32_low, left_x, #32; \
750 vshrn.s64 right_x_32_low, right_x, #32; \
751 \
752 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
753 vadd.u64 left_x, left_x, left_dx_dy; \
754 vadd.u64 right_x, right_x, right_dx_dy; \
755 \
756 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
757 vshrn.s64 left_x_32_high, left_x, #32; \
758 vshrn.s64 right_x_32_high, right_x, #32; \
759 \
760 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
761 vadd.u64 left_x, left_x, left_dx_dy; \
762 vadd.u64 right_x, right_x, right_dx_dy; \
763 \
764 vmovn.u32 alternate_x_16, alternate_x_32; \
765 setup_spans_y_select_##direction(); \
766 vmovn.u32 left_right_x_16_low, left_x_32; \
767 \
768 vmovn.u32 left_right_x_16_high, right_x_32; \
769 setup_spans_alternate_select_##alternate(); \
770 \
771 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
772 str b, [ span_b_offset ], #4; \
773 setup_spans_adjust_interpolants_##direction(); \
774 \
775 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
776 \
777 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
778 str b, [ span_b_offset ], #4; \
779 setup_spans_adjust_interpolants_##direction(); \
780 \
781 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
782 \
783 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
784 str b, [ span_b_offset ], #4; \
785 setup_spans_adjust_interpolants_##direction(); \
786 \
787 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
788 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
789 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
790 \
791 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
792 str b, [ span_b_offset ], #4; \
793 setup_spans_adjust_interpolants_##direction(); \
794 \
795 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
796 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
797 \
798 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
799 \
800 setup_spans_adjust_y_##direction() \
801
802
/* setup_spans_set_x4_alternate_no(alternate, direction): emit four spans
 * (one loop iteration) for a triangle part bounded by just two edges.
 *
 *   alternate  unused in this variant (callers pass `none`)
 *   direction  up or down: stepping direction for y and the interpolants
 *
 * Per iteration:
 *   1. Take the upper 32 bits of the 64-bit left / right x for two lines,
 *      advance the steppers, and do the same for two more lines.
 *   2. Narrow to 16 bits.
 *   3. Store uvrg (16 bytes) and b (4 bytes) for each of the four lines,
 *      stepping the interpolants after every store.
 *   4. Clamp both x values to [left_edge, right_edge].
 *   5. width = right - left; t = width + 7;
 *      num_blocks = t >> 3; mask = 0xFFFE << (t & 7).
 *   6. Store four { left_x, num_blocks, mask, y } records with one vst4.u16
 *      and step y_x4.
 * Four spans are always written, whatever the remaining count in the
 * caller's loop. */
803#define setup_spans_set_x4_alternate_no(alternate, direction) \
804 vshrn.s64 left_x_32_low, left_x, #32; \
805 vshrn.s64 right_x_32_low, right_x, #32; \
806 \
807 vadd.u64 left_x, left_x, left_dx_dy; \
808 vadd.u64 right_x, right_x, right_dx_dy; \
809 \
810 vshrn.s64 left_x_32_high, left_x, #32; \
811 vshrn.s64 right_x_32_high, right_x, #32; \
812 \
813 vadd.u64 left_x, left_x, left_dx_dy; \
814 vadd.u64 right_x, right_x, right_dx_dy; \
815 \
816 vmovn.u32 left_right_x_16_low, left_x_32; \
817 vmovn.u32 left_right_x_16_high, right_x_32; \
818 \
819 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
820 str b, [ span_b_offset ], #4; \
821 setup_spans_adjust_interpolants_##direction(); \
822 \
823 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
824 \
825 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
826 str b, [ span_b_offset ], #4; \
827 setup_spans_adjust_interpolants_##direction(); \
828 \
829 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
830 \
831 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
832 str b, [ span_b_offset ], #4; \
833 setup_spans_adjust_interpolants_##direction(); \
834 \
835 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
836 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
837 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
838 \
839 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
840 str b, [ span_b_offset ], #4; \
841 setup_spans_adjust_interpolants_##direction(); \
842 \
843 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
844 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
845 \
846 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
847 \
848 setup_spans_adjust_y_##direction() \
849
850
/* setup_spans_alternate_adjust_<yes|no>(): rebase the scalar alternate edge.
 *
 * yes: edge_alt (64-bit) -= edge_dx_dy_alt * height_minor_a, computed with a
 *      signed 32x32->64 multiply followed by a subtract with borrow.
 * no:  expands to nothing.
 * The scratch pair edge_adjust_low / edge_adjust_high is r11 / r12, the same
 * registers as b / b_dy, so this must run before setup_spans_load_b(). */
851#define edge_adjust_low r11
852#define edge_adjust_high r12
853
854#define setup_spans_alternate_adjust_yes() \
855 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
856 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
857 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
858
859#define setup_spans_alternate_adjust_no() \
860
861
/* setup_spans_down(left_index, right_index, alternate, alternate_active):
 * generate spans from y_a downwards (increasing y) for `height` lines.
 *
 *   left_index / right_index  which computed edge feeds each side
 *   alternate                 side replaced by the alternate edge
 *   alternate_active          yes / no: whether a third edge exists
 *
 * Sequence:
 *   1. Rebase the alternate edge (if active) and load b / b_dy.
 *   2. Bottom clip: if y_c > viewport_end_y,
 *      height = height - (y_c - viewport_end_y) + 1.
 *   3. Top clip: if viewport_start_y > y_a, skip the difference with
 *      setup_spans_clip(increment, ...) and move y_a down by it.
 *   4. If height <= 0, emit nothing (branch to label 1).
 *   5. Pack y_a, y_a + 1, y_a + 2, y_a + 3 as four 16-bit values in y_x4.
 *   6. Prepare the edge steppers and the loop constants, then store height
 *      to psx_gpu->num_spans (16-bit store).
 *   7. Loop, four spans per pass, while more than four lines were left
 *      before the decrement (unsigned `bhi` after `subs height, #4`). The
 *      last pass may therefore write up to three spans beyond num_spans.
 * Uses numeric local labels 0, 1 and 2. */
862#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
863 setup_spans_alternate_adjust_##alternate_active(); \
864 setup_spans_load_b(); \
865 \
866 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
867 subs y_c, y_c, temp; \
868 subgt height, height, y_c; \
869 addgt height, height, #1; \
870 \
871 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
872 subs clip, temp, y_a; \
873 ble 0f; \
874 \
875 sub height, height, clip; \
876 add y_a, y_a, clip; \
877 setup_spans_clip(increment, alternate_active); \
878 \
879 0: \
880 cmp height, #0; \
881 ble 1f; \
882 \
883 orr temp, y_a, y_a, lsl #16; \
884 add temp, temp, #(1 << 16); \
885 add y_a, temp, #2; \
886 add y_a, y_a, #(2 << 16); \
887 vmov.u32 y_x4, temp, y_a; \
888 \
889 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
890 right_index); \
891 setup_spans_prologue_b(); \
892 \
893 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
894 \
895 2: \
896 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
897 subs height, height, #4; \
898 bhi 2b; \
899 \
900 1: \
901
902
/* Helpers used only by setup_spans_up; the _no variants expand to nothing.
 *
 * setup_spans_alternate_pre_increment_yes(): advance the scalar alternate
 * edge by one line: edge_alt (64-bit) += sign-extended edge_dx_dy_alt.
 *
 * setup_spans_up_decrement_yes(): height -= 1, executed only when the flags
 * in effect at the expansion point satisfy LE. In setup_spans_up those flags
 * come from `subs temp, temp, y_c`, because the intervening `subgt` does not
 * set flags. */
903#define setup_spans_alternate_pre_increment_yes() \
904 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
905 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
906
907#define setup_spans_alternate_pre_increment_no() \
908
909
910#define setup_spans_up_decrement_yes() \
911 suble height, height, #1 \
912
913#define setup_spans_up_decrement_no() \
914
915
/* setup_spans_up(left_index, right_index, alternate, alternate_active):
 * generate spans from y_a - 1 upwards (decreasing y) for `height` lines.
 *
 *   left_index / right_index  which computed edge feeds each side
 *   alternate                 side replaced by the alternate edge
 *   alternate_active          yes / no: whether a third edge exists
 *
 * Sequence:
 *   1. Rebase the alternate edge (if active), load b / b_dy, y_a -= 1.
 *   2. Top clip: if viewport_start_y > y_c, height -= the difference;
 *      otherwise, when alternate_active is yes, height -= 1.
 *   3. Bottom clip: if y_a > viewport_end_y, skip the difference with
 *      setup_spans_clip(decrement, ...) and move y_a up by it.
 *   4. If height <= 0, emit nothing (branch to label 1).
 *   5. Pack y_a, y_a - 1, y_a - 2, y_a - 3 as four 16-bit values in y_x4.
 *   6. Step both NEON edges (and the alternate edge, if active) by one line,
 *      prepare the edge steppers, step the interpolants back by one line,
 *      set up the loop constants and store height to psx_gpu->num_spans.
 *   7. Loop, four spans per pass (same termination rule as
 *      setup_spans_down: unsigned `bhi` after `subs height, #4`).
 * NOTE(review): the viewport y limits are loaded with ldrh (zero-extended)
 * here, whereas setup_spans_down uses ldrsh for the same fields. This is
 * equivalent only if those fields are never negative - confirm.
 * Uses numeric local labels 0, 1 and 2. */
916#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
917 setup_spans_alternate_adjust_##alternate_active(); \
918 setup_spans_load_b(); \
919 sub y_a, y_a, #1; \
920 \
921 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
922 subs temp, temp, y_c; \
923 subgt height, height, temp; \
924 setup_spans_up_decrement_##alternate_active(); \
925 \
926 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
927 subs clip, y_a, temp; \
928 ble 0f; \
929 \
930 sub height, height, clip; \
931 sub y_a, y_a, clip; \
932 setup_spans_clip(decrement, alternate_active); \
933 \
934 0: \
935 cmp height, #0; \
936 ble 1f; \
937 \
938 orr temp, y_a, y_a, lsl #16; \
939 sub temp, temp, #(1 << 16); \
940 sub y_a, temp, #2; \
941 sub y_a, y_a, #(2 << 16); \
942 vmov.u32 y_x4, temp, y_a; \
943 \
944 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
945 \
946 setup_spans_alternate_pre_increment_##alternate_active(); \
947 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
948 right_index); \
949 setup_spans_adjust_interpolants_up(); \
950 setup_spans_prologue_b(); \
951 \
952 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
953 \
954 2: \
955 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
956 subs height, height, #4; \
957 bhi 2b; \
958 \
959 1: \
960
961
/* setup_spans_epilogue(): restore r4-r11 and return by popping the lr saved
 * in setup_spans_prologue() directly into pc. */
962#define setup_spans_epilogue() \
963 ldmia sp!, { r4 - r11, pc } \
964
965
/* setup_spans_up_up(minor, major): complete body for a triangle whose spans
 * are all generated upwards from vertex a, with a change of edge at y_b.
 *
 *   height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
 *   height (major) = y_a - y_c
 *   NEON edge 0: x_a -> x_c over height_major
 *   NEON edge 1: x_a -> x_b over height_minor_a
 *   alternate:   x_b -> x_c over height_minor_b
 * `minor` names the side (left / right) on which the two shorter edges lie.
 * The major edge is passed as the left_index argument of setup_spans_up and
 * the minor one as right_index. The macro ends with the epilogue, so each
 * function below returns on its own.
 *
 * Entry points (r0 = psx_gpu, r1-r3 = v_a, v_b, v_c):
 *   setup_spans_up_left   minor edges on the left
 *   setup_spans_up_right  minor edges on the right */
966#define setup_spans_up_up(minor, major) \
967 setup_spans_prologue(); \
968 sub height_minor_a, y_a, y_b; \
969 sub height_minor_b, y_b, y_c; \
970 sub height, y_a, y_c; \
971 \
972 vdup.u32 x_starts, x_a; \
973 vmov.u32 x_ends, x_c, x_b; \
974 \
975 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
976 setup_spans_up(major, minor, minor, yes); \
977 setup_spans_epilogue() \
978
979function(setup_spans_up_left)
980 setup_spans_up_up(left, right)
981
982function(setup_spans_up_right)
983 setup_spans_up_up(right, left)
984
5d834c08 985.pool
75e28f62
E
986
/* setup_spans_down_down(minor, major): complete body for a triangle whose
 * spans are all generated downwards from vertex a, with a change of edge at
 * y_b.
 *
 *   height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
 *   height (major) = y_c - y_a
 *   NEON edge 0: x_a -> x_c over height_major
 *   NEON edge 1: x_a -> x_b over height_minor_a
 *   alternate:   x_b -> x_c over height_minor_b
 * `minor` names the side (left / right) on which the two shorter edges lie.
 * The major edge is passed as the left_index argument of setup_spans_down
 * and the minor one as right_index. The macro ends with the epilogue, so
 * each function below returns on its own.
 *
 * Entry points (r0 = psx_gpu, r1-r3 = v_a, v_b, v_c):
 *   setup_spans_down_left   minor edges on the left
 *   setup_spans_down_right  minor edges on the right */
987#define setup_spans_down_down(minor, major) \
988 setup_spans_prologue(); \
989 sub height_minor_a, y_b, y_a; \
990 sub height_minor_b, y_c, y_b; \
991 sub height, y_c, y_a; \
992 \
993 vdup.u32 x_starts, x_a; \
994 vmov.u32 x_ends, x_c, x_b; \
995 \
996 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
997 setup_spans_down(major, minor, minor, yes); \
998 setup_spans_epilogue() \
999
1000function(setup_spans_down_left)
1001 setup_spans_down_down(left, right)
1002
1003function(setup_spans_down_right)
1004 setup_spans_down_down(right, left)
1005
1006
/* setup_spans_up_flat(): shared tail for upward triangles bounded by only
 * two edges. Expects x_starts / x_ends to be set by the caller, uses
 * height = y_a - y_c for both edges, generates the spans without an
 * alternate edge and returns via the epilogue.
 *
 * Entry points (r0 = psx_gpu, r1-r3 = v_a, v_b, v_c):
 *   setup_spans_up_a  edges x_a -> x_c (left) and x_b -> x_c (right)
 *   setup_spans_up_b  edges x_a -> x_b (left) and x_a -> x_c (right) */
1007#define setup_spans_up_flat() \
1008 sub height, y_a, y_c; \
1009 \
1010 compute_edge_delta_x2(); \
1011 setup_spans_up(left, right, none, no); \
1012 setup_spans_epilogue() \
1013
1014function(setup_spans_up_a)
1015 setup_spans_prologue()
1016
1017 vmov.u32 x_starts, x_a, x_b
1018 vdup.u32 x_ends, x_c
1019
1020 setup_spans_up_flat()
1021
1022function(setup_spans_up_b)
1023 setup_spans_prologue()
1024
1025 vdup.u32 x_starts, x_a
1026 vmov.u32 x_ends, x_b, x_c
1027
1028 setup_spans_up_flat()
1029
/* setup_spans_down_flat(): shared tail for downward triangles bounded by
 * only two edges. Expects x_starts / x_ends to be set by the caller, uses
 * height = y_c - y_a for both edges, generates the spans without an
 * alternate edge and returns via the epilogue.
 *
 * Entry points (r0 = psx_gpu, r1-r3 = v_a, v_b, v_c):
 *   setup_spans_down_a  edges x_a -> x_c (left) and x_b -> x_c (right)
 *   setup_spans_down_b  edges x_a -> x_b (left) and x_a -> x_c (right) */
1030#define setup_spans_down_flat() \
1031 sub height, y_c, y_a; \
1032 \
1033 compute_edge_delta_x2(); \
1034 setup_spans_down(left, right, none, no); \
1035 setup_spans_epilogue() \
1036
1037function(setup_spans_down_a)
1038 setup_spans_prologue()
1039
1040 vmov.u32 x_starts, x_a, x_b
1041 vdup.u32 x_ends, x_c
1042
1043 setup_spans_down_flat()
1044
1045function(setup_spans_down_b)
1046 setup_spans_prologue()
1047
1048 vdup.u32 x_starts, x_a
1049 vmov.u32 x_ends, x_b, x_c
1050
1051 setup_spans_down_flat()
1052
1053
/* Extra names for setup_spans_up_down, which keeps a second edge set ("b")
 * parked in q11 / q13 while the first set is being consumed.
 *
 * middle_y shares r9 with height_major. q11 and q13 are also working
 * registers of the span loop and of setup_spans_prologue_b (left_x_32 /
 * left_right_x_16, c_0x0001 / c_0xFFFE / c_0x0007). Set b therefore has to
 * be copied into edges_xy (q0) and edges_dx_dy + edge_shifts (q1) with
 * setup_spans_up_down_load_edge_set_b() before either of them runs. */
1054#define middle_y r9
1055
1056#define edges_xy_b q11
1057#define edges_dx_dy_b d26
1058#define edge_shifts_b d27
1059#define edges_dx_dy_and_shifts_b q13
1060#define height_increment d20
1061
1062#define edges_dx_dy_and_shifts q1
1063
1064#define edges_xy_b_left d22
1065#define edges_xy_b_right d23
1066
1067#define setup_spans_up_down_load_edge_set_b() \
1068 vmov edges_xy, edges_xy_b; \
1069 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1070
1071
// setup_spans_up_down(psx_gpu, v_a, v_b, v_c)
//
// Generates spans for a triangle that extends both above and below vertex a:
// first upwards from y_a - 1 towards y_b, then downwards from y_a towards
// y_c. Both halves append to the same span arrays.
//
// Edge set a (upper half): lane 0 = x_a -> x_b over height_minor_a,
//                          lane 1 = x_c -> x_b over height_major, advanced
//                          by height_minor_b lines so it starts at y_a.
// Edge set b (lower half): lane 0 = the scalar alternate edge (x_a -> x_c
//                          over height_minor_b), lane 1 = lane 1 of set a
//                          with its delta negated.
//
// psx_gpu->num_spans ends up as the sum of the lines emitted by both halves.
// NOTE(review): when the upper half is empty (branch to label 3) this
// routine does not write num_spans before reading it back for the lower
// half; it presumably relies on the caller having initialised it - confirm.
1072function(setup_spans_up_down)
1073 setup_spans_prologue()
1074
1075 // s32 middle_y = y_a;
1076 sub height_minor_a, y_a, y_b
1077 sub height_minor_b, y_c, y_a
1078 sub height_major, y_c, y_b
1079
1080 vmov.u32 x_starts, x_a, x_c
1081 vdup.u32 x_ends, x_b
1082
1083 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1084
 // Advance only lane 1 (the major edge) by height_minor_b lines.
1085 mov temp, #0
1086 vmov.u32 height_increment, temp, height_minor_b
1087 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1088
 // Build edge set b in q11 / q13 for the lower half.
1089 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1090 vmov edges_xy_b_right, edges_xy_right
1091
1092 vmov edge_shifts_b, edge_shifts
1093 vmov.u32 edge_shifts_b[0], edge_shift_alt
1094
1095 vneg.s32 edges_dx_dy_b, edges_dx_dy
1096 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1097
 // middle_y reuses r9; height_major is not needed past this point.
1098 mov middle_y, y_a
1099
 // ---- Upper half: same clipping scheme as setup_spans_up ----
1100 setup_spans_load_b()
1101 sub y_a, y_a, #1
1102
1103 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1104 subs temp, temp, y_b
1105 subgt height_minor_a, height_minor_a, temp
1106
1107 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1108 subs clip, y_a, temp
1109 ble 0f
1110
1111 sub height_minor_a, height_minor_a, clip
1112 sub y_a, y_a, clip
1113 setup_spans_clip(decrement, no)
1114
1115 0:
1116 cmp height_minor_a, #0
1117 ble 3f
1118
1119 orr temp, y_a, y_a, lsl #16
1120 sub temp, temp, #(1 << 16)
1121 sub y_a, temp, #2
1122 sub y_a, y_a, #(2 << 16)
1123 vmov.u32 y_x4, temp, y_a
1124
1125 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1126
1127 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1128
 // Set a has been turned into steppers, so q0 / q1 are free to receive set
 // b now, before prologue_b and the loop overwrite q11 / q13.
1129 setup_spans_adjust_edges_alternate_no(left, right);
1130 setup_spans_adjust_interpolants_up()
1131 setup_spans_up_down_load_edge_set_b()
1132
1133 setup_spans_prologue_b()
1134
1135
1136 2:
1137 setup_spans_set_x4_alternate_no(none, up)
1138 subs height_minor_a, height_minor_a, #4
1139 bhi 2b
1140
 // The loop writes whole groups of four; height_minor_a is now 0 or
 // negative (down to -3). Rewind the three output pointers by the surplus
 // entries (8, 16 and 4 bytes per span respectively).
1141 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1142 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1143 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1144
 // ---- Lower half: restart the interpolants from vertex a ----
1145 4:
1146 add temp, psx_gpu, #psx_gpu_uvrg_offset
1147 vld1.32 { uvrg }, [ temp ]
1148 mov y_a, middle_y
1149
1150 setup_spans_load_b()
1151
1152 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1153 subs y_c, y_c, temp
1154 subgt height_minor_b, height_minor_b, y_c
1155 addgt height_minor_b, height_minor_b, #1
1156
1157 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1158 subs clip, temp, y_a
1159 ble 0f
1160
1161 sub height_minor_b, height_minor_b, clip
1162 add y_a, y_a, clip
1163 setup_spans_clip(increment, no)
1164
1165 0:
1166 cmp height_minor_b, #0
1167 ble 1f
1168
1169 orr temp, y_a, y_a, lsl #16
1170 add temp, temp, #(1 << 16)
1171 add y_a, temp, #2
1172 add y_a, y_a, #(2 << 16)
1173 vmov.u32 y_x4, temp, y_a
1174
1175 setup_spans_adjust_edges_alternate_no(left, right)
1176
 // Total span count = spans already recorded + lower-half lines.
1177 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1178 add temp, temp, height_minor_b
b7569147 1179
 // Only the exact-equality case is diverted to the workaround at label 5.
1180 cmp temp, #MAX_SPANS
1181 beq 5f
1182
75e28f62
E
1183 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1184
1185 2:
1186 setup_spans_set_x4_alternate_no(none, down)
1187 subs height_minor_b, height_minor_b, #4
1188 bhi 2b
1189
1190 1:
1191 setup_spans_epilogue()
1192
 // Upper half produced nothing: still switch to edge set b and set up the
 // output pointers / constants, then continue with the lower half.
1193 3:
1194 setup_spans_up_down_load_edge_set_b()
1195 setup_spans_prologue_b()
1196 bal 4b
1197
 // Total is exactly MAX_SPANS: round the lower-half count down to a
 // multiple of four, so the four-at-a-time loop cannot write past the last
 // full group, and store the reduced total. add and strh leave the flags
 // alone, so the bne below tests the result of the bics (rounded count
 // non-zero -> run the loop, otherwise return).
b7569147 1198 5:
1199 // FIXME: overflow corner case
1200 sub temp, temp, height_minor_b
1201 bics height_minor_b, #3
1202 add temp, temp, height_minor_b
1203 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1204 bne 2b
1205 bal 1b
1206
5d834c08 1207.pool
75e28f62
E
1208
// Register aliases for the setup_blocks_* routines. Several names share a
// physical register; the sharing is spelled out in the comments below.
1209#undef span_uvrg_offset
1210#undef span_edge_data
1211#undef span_b_offset
1212#undef left_x
1213#undef b
1214 // Core (integer) register names.
1215#define psx_gpu r0
1216#define num_spans r1
1217#define span_uvrg_offset r2
1218#define span_edge_data r3
1219#define span_b_offset r4
1220#define b_dx r5
1221#define span_num_blocks r6
1222#define y r7
1223#define left_x r8
1224#define b r9
1225#define dither_offset_ptr r10
1226#define block_ptr_a r11
1227#define fb_ptr r12
1228#define num_blocks r14
1229 // Additional names for r2, r3, r8 and r10.
1230#define uvrg_dx_ptr r2
1231#define texture_mask_ptr r3
1232#define dither_shift r8
1233#define dither_row r10
1234 // Additional names for r7 - r10.
1235#define c_32 r7
1236#define b_dx4 r8
1237#define b_dx8 r9
1238#define block_ptr_b r10
1239 // Additional names for r10 and r8.
1240#define block_span_ptr r10
1241#define right_mask r8
1242 // Colour scratch for the untextured routines; these reuse r2 - r5.
1243#define color r2
1244#define color_r r3
1245#define color_g r4
1246#define color_b r5
1247
1248#undef uvrg
1249 // 32-bit per-pixel accumulators (four lanes each).
1250#define u_block q0
1251#define v_block q1
1252#define r_block q2
1253#define g_block q3
1254#define b_block q4
1255 // Step values scaled by 4 and by 8 pixels (d10 - d13), plus d14/d15.
1256#define uv_dx4 d10
1257#define rg_dx4 d11
1258#define uv_dx8 d12
1259#define rg_dx8 d13
1260#define b_whole_8 d14
1261#define fb_mask_ptrs d15
1262 // q5/q6 are the q-register views of d10 - d13; uv_dx8/rg_dx8 are repeated with identical values.
1263#define uvrg_dx4 q5
1264#define uvrg_dx8 q6
1265#define uv_dx8 d12
1266#define rg_dx8 d13
1267 // 16-bit narrowed values, eight lanes each.
1268#define u_whole q8
1269#define v_whole q9
1270#define r_whole q10
1271#define g_whole q11
1272#define b_whole q12
1273 // Low/high d-register halves of the *_whole registers.
1274#define u_whole_low d16
1275#define u_whole_high d17
1276#define v_whole_low d18
1277#define v_whole_high d19
1278#define r_whole_low d20
1279#define r_whole_high d21
1280#define g_whole_low d22
1281#define g_whole_high d23
1282#define b_whole_low d24
1283#define b_whole_high d25
1284 // dx4 and dx8 are the same register (q13).
1285#define dx4 q13
1286#define dx8 q13
1287 // 8-bit results: d26/d27 are the halves of q13 (dx4/dx8); d24/d25 are the halves of q12 (b_whole).
1288#define u_whole_8 d26
1289#define v_whole_8 d27
1290#define u_whole_8b d24
1291#define r_whole_8 d24
1292#define g_whole_8 d25
1293 // uv_whole_8 = { u_whole_8, v_whole_8 }; uv_whole_8b shares q14 with dither_offsets.
1294#define uv_whole_8 q13
1295#define uv_whole_8b q14
1296
1297#define dither_offsets q14
1298#define texture_mask q15
1299#define texture_mask_u d30
1300#define texture_mask_v d31
1301
1302#define dither_offsets_short d28
1303 // Per-span temporaries; these share q8 - q10 with u_whole, v_whole and r_whole.
1304#define v_left_x q8
1305#define uvrg q9
1306#define block_span q10
1307 // uv/rg are the low/high halves of uvrg (q9).
1308#define uv d18
1309#define rg d19
1310 // Mask registers; draw_mask shares q1 with v_block, test_mask shares q0 with u_block.
1311#define draw_mask q1
1312#define draw_mask_edge q13
1313#define test_mask q0
1314 // uvrg_dx shares q3 with g_block.
1315#define uvrg_dx q3
1316 // colors shares q2 with r_block.
1317#define colors q2
1318
/*
 * Texture coordinate swizzle applied per byte lane:
 *   u_whole_8b = u & texture_mask_u            (copy taken before u is changed)
 *   u          = (u & 0x0F) | (v << 4)         (vsli inserts v shifted left by 4)
 *   v          = (v & 0xF0) | (u_whole_8b >> 4) (vsri inserts the copy shifted right by 4)
 * The unswizzled variant expands to nothing.
 */
1319#define setup_blocks_texture_swizzled() \
1320 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1321 vsli.u8 u_whole_8, v_whole_8, #4; \
1322 vsri.u8 v_whole_8, u_whole_8b, #4 \
1323
1324#define setup_blocks_texture_unswizzled() \
1325
1326
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 * In: r0 = psx_gpu. For every span recorded in psx_gpu the routine fills
 * 64-byte entries of the block buffer, one per 8 pixels:
 *   +0  u/v bytes interleaved, +16 r bytes then g bytes,
 *   +32 b bytes then fb_mask_ptrs (right_mask word, fb_ptr word),
 *   +48 dither offsets.
 * When a span would take the total past MAX_BLOCKS the buffer is flushed
 * through flush_render_block_buffer first.
 * r4 - r11 are saved and restored; NEON registers are not.
 *
 * Fix: the flush path now also pushes r4, purely as padding, so that sp is
 * 8-byte aligned at the bl (36 + 32 + 28 = 96 bytes below the entry sp,
 * assuming the entry sp was 8-byte aligned).
 */
1327#define setup_blocks_shaded_textured_builder(swizzling) \
1328.align 3; \
1329 /* Load the span count and the per-pixel u/v/r/g step. */ \
1330function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1331 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1332 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1333 \
1334 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1335 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1336 /* No spans: return before anything is pushed. */ \
1337 cmp num_spans, #0; \
1338 bxeq lr; \
1339 /* Steps scaled by 4 and by 8 pixels. */ \
1340 stmdb sp!, { r4 - r11, r14 }; \
1341 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1342 \
1343 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1344 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1345 /* Two consecutive mask bytes, each broadcast to all lanes. */ \
1346 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1347 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1348 \
1349 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1350 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1351 \
1352 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1353 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1354 /* Append after the num_blocks entries (64 bytes each) already queued. */ \
1355 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1356 /* Per-span loop. */ \
1357 0: \
1358 vmov.u8 fb_mask_ptrs, #0; \
1359 \
1360 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1361 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1362 \
1363 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1364 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1365 /* Spans with no blocks are skipped. */ \
1366 cmp span_num_blocks, #0; \
1367 beq 1f; \
1368 \
1369 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1370 add num_blocks, span_num_blocks, num_blocks; \
1371 /* Flush first if this span takes the total past MAX_BLOCKS. */ \
1372 cmp num_blocks, #MAX_BLOCKS; \
1373 bgt 2f; \
1374 /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; dither row chosen by y & 3. */ \
1375 3: \
1376 ldr b, [ span_b_offset ]; \
1377 add fb_ptr, fb_ptr, y, lsl #11; \
1378 \
1379 vdup.u32 v_left_x, left_x; \
1380 and y, y, #0x3; \
1381 \
1382 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1383 add fb_ptr, fb_ptr, left_x, lsl #1; \
1384 /* Start values at left_x: b += b_dx * left_x, uvrg += uvrg_dx * left_x. */ \
1385 mla b, b_dx, left_x, b; \
1386 and dither_shift, left_x, #0x03; \
1387 /* uvrg_dx (q3, shared with g_block) is rebuilt from uvrg_dx4 for every span. */ \
1388 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1389 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1390 \
1391 mov dither_shift, dither_shift, lsl #3; \
1392 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1393 /* The flags set by this subs are consumed by the beq 5f further down. */ \
1394 mov c_32, #32; \
1395 subs span_num_blocks, span_num_blocks, #1; \
1396 /* Rotate the dither row by 8 bits per pixel of (left_x & 3). */ \
1397 mov dither_row, dither_row, ror dither_shift; \
1398 mov b_dx4, b_dx, lsl #2; \
1399 \
1400 vdup.u32 dither_offsets_short, dither_row; \
1401 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1402 /* Dither bytes are sign-extended to 16 bits and shifted left by 4. */ \
1403 vdup.u32 b_block, b; \
1404 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1405 /* Broadcast each start value, then add the five consecutive block_span vectors. */ \
1406 vdup.u32 u_block, uv[0]; \
1407 mov b_dx8, b_dx, lsl #3; \
1408 \
1409 vdup.u32 v_block, uv[1]; \
1410 vdup.u32 r_block, rg[0]; \
1411 vdup.u32 g_block, rg[1]; \
1412 \
1413 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1414 \
1415 vadd.u32 u_block, u_block, block_span; \
1416 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1417 \
1418 vadd.u32 v_block, v_block, block_span; \
1419 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1420 \
1421 vadd.u32 r_block, r_block, block_span; \
1422 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1423 \
1424 vadd.u32 g_block, g_block, block_span; \
1425 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1426 \
1427 vadd.u32 b_block, b_block, block_span; \
1428 add block_ptr_b, block_ptr_a, #16; \
1429 /* Pixels 0-3: upper 16 bits of each accumulator lane. */ \
1430 vshrn.u32 u_whole_low, u_block, #16; \
1431 vshrn.u32 v_whole_low, v_block, #16; \
1432 vshrn.u32 r_whole_low, r_block, #16; \
1433 vshrn.u32 g_whole_low, g_block, #16; \
1434 /* Pixels 4-7: upper 16 bits of (accumulator + 4 * step). */ \
1435 vdup.u32 dx4, uv_dx4[0]; \
1436 vshrn.u32 b_whole_low, b_block, #16; \
1437 \
1438 vaddhn.u32 u_whole_high, u_block, dx4; \
1439 vdup.u32 dx4, uv_dx4[1]; \
1440 \
1441 vaddhn.u32 v_whole_high, v_block, dx4; \
1442 vdup.u32 dx4, rg_dx4[0]; \
1443 \
1444 vaddhn.u32 r_whole_high, r_block, dx4; \
1445 vdup.u32 dx4, rg_dx4[1]; \
1446 \
1447 vaddhn.u32 g_whole_high, g_block, dx4; \
1448 vdup.u32 dx4, b_dx4; \
1449 \
1450 vaddhn.u32 b_whole_high, b_block, dx4; \
1451 vdup.u32 dx8, uv_dx8[0]; \
1452 /* Advance every accumulator by 8 pixels. */ \
1453 vadd.u32 u_block, u_block, dx8; \
1454 vdup.u32 dx8, uv_dx8[1]; \
1455 \
1456 vadd.u32 v_block, v_block, dx8; \
1457 vdup.u32 dx8, rg_dx8[0]; \
1458 \
1459 vadd.u32 r_block, r_block, dx8; \
1460 vdup.u32 dx8, rg_dx8[1]; \
1461 \
1462 vadd.u32 g_block, g_block, dx8; \
1463 vdup.u32 dx8, b_dx8; \
1464 \
1465 vadd.u32 b_block, b_block, dx8; \
1466 vmovn.u16 u_whole_8, u_whole; \
1467 /* Narrow to bytes. */ \
1468 vmovn.u16 v_whole_8, v_whole; \
1469 \
1470 vmovn.u16 b_whole_8, b_whole; \
1471 pld [ fb_ptr ]; \
1472 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1473 /* Apply the texture mask to u and v, then the optional swizzle. */ \
1474 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1475 setup_blocks_texture_##swizzling(); \
1476 \
1477 vmovn.u16 r_whole_8, r_whole; \
1478 beq 5f; \
1479 /* Main loop: store the block computed on the previous pass while computing the next one. */ \
1480 4: \
1481 vmovn.u16 g_whole_8, g_whole; \
1482 vshrn.u32 u_whole_low, u_block, #16; \
1483 /* block_ptr_a writes offsets +0 and +32, block_ptr_b writes +16 and +48. */ \
1484 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1485 vshrn.u32 v_whole_low, v_block, #16; \
1486 \
1487 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1488 vshrn.u32 r_whole_low, r_block, #16; \
1489 \
1490 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1491 vshrn.u32 g_whole_low, g_block, #16; \
1492 \
1493 vdup.u32 dx4, uv_dx4[0]; \
1494 vshrn.u32 b_whole_low, b_block, #16; \
1495 \
1496 vaddhn.u32 u_whole_high, u_block, dx4; \
1497 vdup.u32 dx4, uv_dx4[1]; \
1498 \
1499 vaddhn.u32 v_whole_high, v_block, dx4; \
1500 vdup.u32 dx4, rg_dx4[0]; \
1501 \
1502 vaddhn.u32 r_whole_high, r_block, dx4; \
1503 vdup.u32 dx4, rg_dx4[1]; \
1504 \
1505 vaddhn.u32 g_whole_high, g_block, dx4; \
1506 vdup.u32 dx4, b_dx4; \
1507 \
1508 vaddhn.u32 b_whole_high, b_block, dx4; \
1509 vdup.u32 dx8, uv_dx8[0]; \
1510 \
1511 vadd.u32 u_block, u_block, dx8; \
1512 vdup.u32 dx8, uv_dx8[1]; \
1513 \
1514 vadd.u32 v_block, v_block, dx8; \
1515 vdup.u32 dx8, rg_dx8[0]; \
1516 \
1517 vadd.u32 r_block, r_block, dx8; \
1518 vdup.u32 dx8, rg_dx8[1]; \
1519 \
1520 vadd.u32 g_block, g_block, dx8; \
1521 vdup.u32 dx8, b_dx8; \
1522 \
1523 vadd.u32 b_block, b_block, dx8; \
1524 vmovn.u16 u_whole_8, u_whole; \
1525 /* The next block starts 8 pixels (16 bytes) further along the row. */ \
1526 add fb_ptr, fb_ptr, #16; \
1527 vmovn.u16 v_whole_8, v_whole; \
1528 \
1529 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1530 vmovn.u16 b_whole_8, b_whole; \
1531 \
1532 pld [ fb_ptr ]; \
1533 \
1534 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1535 subs span_num_blocks, span_num_blocks, #1; \
1536 \
1537 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1538 setup_blocks_texture_##swizzling(); \
1539 \
1540 vmovn.u16 r_whole_8, r_whole; \
1541 bne 4b; \
1542 /* Last block of the span: right_mask goes into fb_mask_ptrs[0] and masks u/v. */ \
1543 5: \
1544 vmovn.u16 g_whole_8, g_whole; \
1545 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1546 /* test_mask is read from offset 0 of psx_gpu. */ \
1547 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1548 vdup.u8 draw_mask, right_mask; \
1549 \
1550 vmov.u32 fb_mask_ptrs[0], right_mask; \
1551 vtst.u16 draw_mask, draw_mask, test_mask; \
1552 vzip.u8 u_whole_8, v_whole_8; \
1553 /* u/v were zipped by hand so the 16-bit pairs selected by draw_mask can be cleared. */ \
1554 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1555 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1556 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1557 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1558 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1559 /* Advance the three span pointers (16, 4 and 8 bytes per span). */ \
1560 1: \
1561 add span_uvrg_offset, span_uvrg_offset, #16; \
1562 add span_b_offset, span_b_offset, #4; \
1563 \
1564 add span_edge_data, span_edge_data, #8; \
1565 subs num_spans, num_spans, #1; \
1566 \
1567 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1568 bne 0b; \
1569 \
1570 ldmia sp!, { r4 - r11, pc }; \
1571 /* Block buffer full: flush it, then restart this span at the first block slot. */ \
1572 2: \
1573 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1574 vpush { texture_mask }; \
1575 vpush { uvrg_dx4 }; \
1576 /* r4 is pushed only as padding so that sp is 8-byte aligned at the call. */ \
1577 stmdb sp!, { r0 - r4, r12, r14 }; \
1578 bl flush_render_block_buffer; \
1579 ldmia sp!, { r0 - r4, r12, r14 }; \
1580 \
1581 vpop { uvrg_dx4 }; \
1582 vpop { texture_mask }; \
1583 /* uvrg_dx8 was not saved; rebuild it as 2 * uvrg_dx4. */ \
1584 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1585 vmov.u8 fb_mask_ptrs, #0; \
1586 \
1587 mov num_blocks, span_num_blocks; \
1588 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1589 bal 3b \
1590
1591
1592setup_blocks_shaded_textured_builder(swizzled)
1593setup_blocks_shaded_textured_builder(unswizzled)
1594
1595
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 * In: r0 = psx_gpu. Same structure as the shaded textured builder, but only
 * u and v are interpolated. Per 64-byte block it writes u/v at +0,
 * { b_whole_8, fb_mask_ptrs } at +32 and the dither offsets at +48; the
 * 16 bytes at +16 are skipped. b_whole_8 (d14) is never written by this
 * routine, so only the fb_mask_ptrs half of the +32 store carries data
 * produced here.
 * r4 - r11 are saved and restored; NEON registers are not.
 *
 * Fix: the flush path now also pushes r4, purely as padding, so that sp is
 * 8-byte aligned at the bl (36 + 32 + 28 = 96 bytes below the entry sp,
 * assuming the entry sp was 8-byte aligned).
 */
1596#define setup_blocks_unshaded_textured_builder(swizzling) \
1597.align 3; \
1598 /* Load the span count and the per-pixel step vector. */ \
1599function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1600 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1601 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1602 \
1603 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1604 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1605 /* No spans: return before anything is pushed. */ \
1606 cmp num_spans, #0; \
1607 bxeq lr; \
1608 /* Steps scaled by 4 and by 8 pixels. */ \
1609 stmdb sp!, { r4 - r11, r14 }; \
1610 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1611 \
1612 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1613 /* Two consecutive mask bytes, each broadcast to all lanes. */ \
1614 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1615 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1616 \
1617 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1618 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1619 \
1620 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1621 /* Append after the num_blocks entries (64 bytes each) already queued. */ \
1622 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1623 /* Per-span loop. */ \
1624 0: \
1625 vmov.u8 fb_mask_ptrs, #0; \
1626 \
1627 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1628 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1629 \
1630 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1631 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1632 /* Spans with no blocks are skipped. */ \
1633 cmp span_num_blocks, #0; \
1634 beq 1f; \
1635 \
1636 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1637 add num_blocks, span_num_blocks, num_blocks; \
1638 /* Flush first if this span takes the total past MAX_BLOCKS. */ \
1639 cmp num_blocks, #MAX_BLOCKS; \
1640 bgt 2f; \
1641 /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; dither row chosen by y & 3. */ \
1642 3: \
1643 add fb_ptr, fb_ptr, y, lsl #11; \
1644 \
1645 vdup.u32 v_left_x, left_x; \
1646 and y, y, #0x3; \
1647 \
1648 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1649 add fb_ptr, fb_ptr, left_x, lsl #1; \
1650 \
1651 and dither_shift, left_x, #0x03; \
1652 /* uvrg_dx (q3) is rebuilt from uvrg_dx4 for every span; uvrg += uvrg_dx * left_x. */ \
1653 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1654 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1655 \
1656 mov dither_shift, dither_shift, lsl #3; \
1657 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1658 /* The flags set by this subs are consumed by the beq 5f further down. */ \
1659 mov c_32, #32; \
1660 subs span_num_blocks, span_num_blocks, #1; \
1661 /* Rotate the dither row by 8 bits per pixel of (left_x & 3). */ \
1662 mov dither_row, dither_row, ror dither_shift; \
1663 \
1664 vdup.u32 dither_offsets_short, dither_row; \
1665 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1666 /* Dither bytes are sign-extended to 16 bits and shifted left by 4. */ \
1667 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1668 /* Broadcast the u and v start values, then add the first two block_span vectors. */ \
1669 vdup.u32 u_block, uv[0]; \
1670 \
1671 vdup.u32 v_block, uv[1]; \
1672 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1673 \
1674 vadd.u32 u_block, u_block, block_span; \
1675 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1676 \
1677 vadd.u32 v_block, v_block, block_span; \
1678 add block_ptr_b, block_ptr_a, #16; \
1679 /* Pixels 0-3: upper 16 bits of each accumulator lane. */ \
1680 vshrn.u32 u_whole_low, u_block, #16; \
1681 vshrn.u32 v_whole_low, v_block, #16; \
1682 /* Pixels 4-7: upper 16 bits of (accumulator + 4 * step). */ \
1683 vdup.u32 dx4, uv_dx4[0]; \
1684 \
1685 vaddhn.u32 u_whole_high, u_block, dx4; \
1686 vdup.u32 dx4, uv_dx4[1]; \
1687 \
1688 vaddhn.u32 v_whole_high, v_block, dx4; \
1689 vdup.u32 dx8, uv_dx8[0]; \
1690 /* Advance both accumulators by 8 pixels. */ \
1691 vadd.u32 u_block, u_block, dx8; \
1692 vdup.u32 dx8, uv_dx8[1]; \
1693 \
1694 vadd.u32 v_block, v_block, dx8; \
1695 vmovn.u16 u_whole_8, u_whole; \
1696 \
1697 vmovn.u16 v_whole_8, v_whole; \
1698 \
1699 pld [ fb_ptr ]; \
1700 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1701 /* Apply the texture mask to u and v, then the optional swizzle. */ \
1702 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1703 setup_blocks_texture_##swizzling(); \
1704 \
1705 beq 5f; \
1706 /* Main loop: store the block computed on the previous pass while computing the next one. */ \
1707 4: \
1708 vshrn.u32 u_whole_low, u_block, #16; \
1709 \
1710 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1711 vshrn.u32 v_whole_low, v_block, #16; \
1712 /* Step block_ptr_b over the +16 slot so that it points at +48. */ \
1713 add block_ptr_b, block_ptr_b, #32; \
1714 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1715 \
1716 vdup.u32 dx4, uv_dx4[0]; \
1717 vaddhn.u32 u_whole_high, u_block, dx4; \
1718 vdup.u32 dx4, uv_dx4[1]; \
1719 \
1720 vaddhn.u32 v_whole_high, v_block, dx4; \
1721 vdup.u32 dx8, uv_dx8[0]; \
1722 \
1723 vadd.u32 u_block, u_block, dx8; \
1724 vdup.u32 dx8, uv_dx8[1]; \
1725 \
1726 vadd.u32 v_block, v_block, dx8; \
1727 vmovn.u16 u_whole_8, u_whole; \
1728 /* The next block starts 8 pixels (16 bytes) further along the row. */ \
1729 add fb_ptr, fb_ptr, #16; \
1730 vmovn.u16 v_whole_8, v_whole; \
1731 \
1732 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1733 pld [ fb_ptr ]; \
1734 \
1735 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1736 subs span_num_blocks, span_num_blocks, #1; \
1737 \
1738 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1739 setup_blocks_texture_##swizzling(); \
1740 \
1741 bne 4b; \
1742 /* Last block of the span: right_mask goes into fb_mask_ptrs[0] and masks u/v. */ \
1743 5: \
1744 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1745 /* test_mask is read from offset 0 of psx_gpu. */ \
1746 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1747 vdup.u8 draw_mask, right_mask; \
1748 \
1749 vmov.u32 fb_mask_ptrs[0], right_mask; \
1750 vtst.u16 draw_mask, draw_mask, test_mask; \
1751 vzip.u8 u_whole_8, v_whole_8; \
1752 /* u/v were zipped by hand so the 16-bit pairs selected by draw_mask can be cleared. */ \
1753 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1754 add block_ptr_b, block_ptr_b, #32; \
1755 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1756 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1757 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1758 /* Advance the two span pointers (16 and 8 bytes per span). */ \
1759 1: \
1760 add span_uvrg_offset, span_uvrg_offset, #16; \
1761 add span_edge_data, span_edge_data, #8; \
1762 subs num_spans, num_spans, #1; \
1763 \
1764 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1765 bne 0b; \
1766 \
1767 ldmia sp!, { r4 - r11, pc }; \
1768 /* Block buffer full: flush it, then restart this span at the first block slot. */ \
1769 2: \
1770 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1771 vpush { texture_mask }; \
1772 vpush { uvrg_dx4 }; \
1773 /* r4 is pushed only as padding so that sp is 8-byte aligned at the call. */ \
1774 stmdb sp!, { r0 - r4, r12, r14 }; \
1775 bl flush_render_block_buffer; \
1776 ldmia sp!, { r0 - r4, r12, r14 }; \
1777 \
1778 vpop { uvrg_dx4 }; \
1779 vpop { texture_mask }; \
1780 /* uvrg_dx8 was not saved; rebuild it as 2 * uvrg_dx4. */ \
1781 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1782 vmov.u8 fb_mask_ptrs, #0; \
1783 \
1784 mov num_blocks, span_num_blocks; \
1785 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1786 bal 3b \
1787
1788
1789setup_blocks_unshaded_textured_builder(swizzled)
1790setup_blocks_unshaded_textured_builder(unswizzled)
1791
1792
// setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
//
// In: r0 = psx_gpu. Queues one 64-byte block per 8 pixels of every span, all
// with the same flat colour. Per block it writes the draw mask at +0, the
// colour replicated to eight halfwords at +16 and { b_whole_8, fb_mask_ptrs }
// at +32 (fb_mask_ptrs[1] = fb_ptr; b_whole_8 and fb_mask_ptrs[0] are not
// written by this routine). The buffer is flushed through
// flush_render_block_buffer when a span would take the total past MAX_BLOCKS.
// r4 - r11 are saved and restored; NEON registers are not.
//
// Fix: the flush path now also pushes r4, purely as padding, so that sp is
// 8-byte aligned at the bl (36 + 16 + 28 = 80 bytes below the entry sp,
// assuming the entry sp was 8-byte aligned).
1793.align 3
1794
1795function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1796 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1797 veor.u32 draw_mask, draw_mask, draw_mask
1798 // No spans: return before anything is pushed.
1799 cmp num_spans, #0
1800 bxeq lr
1801
1802 stmdb sp!, { r4 - r11, r14 }
1803 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1804
1805 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1806 // 15-bit colour from bits 3-7, 11-15 and 19-23 of the triangle colour: r | g << 5 | b << 10.
1807 ubfx color_r, color, #3, #5
1808 ubfx color_g, color, #11, #5
1809 ubfx color_b, color, #19, #5
1810
1811 orr color, color_r, color_b, lsl #10
1812 orr color, color, color_g, lsl #5
1813
1814 vdup.u16 colors, color
1815
1816 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1817 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1818 // Append after the num_blocks entries (64 bytes each) already queued.
1819 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1820 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1821 // Per-span loop.
1822 0:
1823 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1824 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1825
c1817bd9 1826 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1827 // Spans with no blocks are skipped.
1828 cmp span_num_blocks, #0
1829 beq 1f
1830
1831 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1832 add num_blocks, span_num_blocks, num_blocks
1833 // Flush first if this span takes the total past MAX_BLOCKS.
1834 cmp num_blocks, #MAX_BLOCKS
1835 bgt 2f
1836 // fb_ptr = vram_out_ptr + y * 2048 + left_x * 2.
1837 3:
1838 add fb_ptr, fb_ptr, y, lsl #11
1839 and y, y, #0x3
1840
1841 add fb_ptr, fb_ptr, left_x, lsl #1
1842 mov c_32, #32
1843 // The flags set by this subs are consumed by the beq 5f below.
1844 subs span_num_blocks, span_num_blocks, #1
1845
1846 add block_ptr_b, block_ptr_a, #16
1847 pld [ fb_ptr ]
1848
1849 vmov.u32 fb_mask_ptrs[1], fb_ptr
1850 beq 5f
1851 // Every block except the last: all-zero draw mask.
1852 4:
1853 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1854 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1855 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1856 // The next block starts 8 pixels (16 bytes) further along the row.
1857 add fb_ptr, fb_ptr, #16
1858 add block_ptr_b, block_ptr_b, #32
1859
1860 pld [ fb_ptr ]
1861
1862 vmov.u32 fb_mask_ptrs[1], fb_ptr
1863 subs span_num_blocks, span_num_blocks, #1
1864
1865 bne 4b
1866 // Last block: the draw mask is right_mask tested against test_mask.
1867 5:
1868 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1869
1870 vdup.u8 draw_mask_edge, right_mask
1871 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1872
1873 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1874 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1875 add block_ptr_b, block_ptr_b, #32
1876 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1877 // Next span (8 bytes of edge data each).
1878 1:
1879 add span_edge_data, span_edge_data, #8
1880 subs num_spans, num_spans, #1
1881
1882 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1883 bne 0b
1884
1885 ldmia sp!, { r4 - r11, pc }
1886 // Block buffer full: flush it, then restart this span at the first block slot.
1887 2:
1888 vpush { colors }
1889 // r4 is pushed only as padding so that sp is 8-byte aligned at the call.
1890 stmdb sp!, { r0 - r4, r12, r14 }
1891 bl flush_render_block_buffer
1892 ldmia sp!, { r0 - r4, r12, r14 }
1893
1894 vpop { colors }
1895 // test_mask (q0) and draw_mask (q1) were not saved across the call; rebuild them.
1896 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1897 veor.u32 draw_mask, draw_mask, draw_mask
1898
1899 mov num_blocks, span_num_blocks
1900 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1901 bal 3b
1902
1903
// Extra names for the direct routine: r14 holds the mask MSB value; q15 and
// d16/d30/d31 are given names here but are not referenced by the routine below.
1904#define mask_msb_scalar r14
1905
1906#define msb_mask q15
1907
1908#define pixels_low d16
1909
1910#define msb_mask_low d30
1911#define msb_mask_high d31
1912
1913 // setup_blocks_unshaded_untextured_undithered_unswizzled_direct
1914.align 3
1915 // In: r0 = psx_gpu. Writes the flat triangle colour straight to the framebuffer for every span; no block buffer is used. r4 - r11 are saved and restored.
1916function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1917 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1918 // No spans: return before anything is pushed.
1919 cmp num_spans, #0
1920 bxeq lr
1921
1922 stmdb sp!, { r4 - r11, r14 }
1923
1924 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1925 // 15-bit colour from bits 3-7, 11-15 and 19-23: r | g << 5 | b << 10, ORed with the mask MSB value.
1926 ubfx color_r, color, #3, #5
1927 ubfx color_g, color, #11, #5
1928
1929 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1930 ubfx color_b, color, #19, #5
1931
1932 orr color, color_r, color_b, lsl #10
1933 orr color, color, color_g, lsl #5
1934 orr color, color, mask_msb_scalar
1935 // colors = the pixel in all eight halfword lanes; color then becomes the pixel in both halves of one word.
1936 vdup.u16 colors, color
1937
1938 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
3867c6ef
E
1939 orr color, color, lsl #16
1940
75e28f62
E
1941 // Per-span loop.
1942 0:
1943 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1944 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1945
c1817bd9 1946 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1947 // Spans with no blocks are skipped.
1948 cmp span_num_blocks, #0
1949 beq 1f
1950
1951 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1952 // fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; the subs decides whether there is more than one block.
1953 add fb_ptr, fb_ptr, y, lsl #11
1954 subs span_num_blocks, span_num_blocks, #1
1955
1956 add fb_ptr, fb_ptr, left_x, lsl #1
1957 beq 3f
1958 // Every block except the last is a full 16-byte (8-pixel) store.
1959 2:
1960 vst1.u32 { colors }, [ fb_ptr ]!
1961 subs span_num_blocks, span_num_blocks, #1
1962
1963 bne 2b
1964 // Last block: right_mask (low byte only) selects how much is written.
1965 3:
1966 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
75e28f62 1967
3867c6ef
E
1968 cmp right_mask, #0x0
1969 beq 5f
1970 // Low nibble clear: write two words (4 pixels) and shift the mask right by 4. The flags from tst stay valid across all three conditional instructions.
1971 tst right_mask, #0xF
1972 streq color, [ fb_ptr ], #4
1973 moveq right_mask, right_mask, lsr #4
1974 streq color, [ fb_ptr ], #4
1975 // Low two bits clear: write one word (2 pixels) and shift the mask right by 2.
1976 tst right_mask, #0x3
1977 streq color, [ fb_ptr ], #4
1978 moveq right_mask, right_mask, lsr #2
1979 // Bit 0 clear: write one final halfword (1 pixel).
1980 tst right_mask, #0x1
1981 streqh color, [ fb_ptr ]
75e28f62
E
1982 // Next span (8 bytes of edge data each).
1983 1:
1984 add span_edge_data, span_edge_data, #8
1985 subs num_spans, num_spans, #1
75e28f62
E
1986 bne 0b
1987
1988 ldmia sp!, { r4 - r11, pc }
1989 // right_mask == 0: the last block is a full 16-byte store as well.
3867c6ef
E
1990 5:
1991 vst1.u32 { colors }, [ fb_ptr ]
1992 bal 1b
75e28f62
E
1993
1994
// Register aliases for the shaded untextured routines. The textured-path NEON
// names are dropped and redefined on different registers.
1995#undef c_64
1996 // c_64 is r7, the same register as c_32 and y.
1997#define c_64 r7
1998#define rg_dx_ptr r2
1999
2000
2001#undef r_block
2002#undef g_block
2003#undef b_block
2004#undef r_whole
2005#undef g_whole
2006#undef b_whole
2007#undef r_whole_low
2008#undef r_whole_high
2009#undef g_whole_low
2010#undef g_whole_high
2011#undef b_whole_low
2012#undef b_whole_high
2013#undef r_whole_8
2014#undef g_whole_8
2015#undef b_whole_8
2016#undef dither_offsets
2017#undef rg_dx4
2018#undef rg_dx8
2019#undef dx4
2020#undef dx8
2021#undef v_left_x
2022#undef uvrg
2023#undef block_span
2024#undef rg
2025#undef draw_mask
2026#undef test_mask
2027 // 32-bit per-pixel colour accumulators (four lanes each).
2028#define r_block q0
2029#define g_block q1
2030#define b_block q2
2031 // 16-bit narrowed values, eight lanes each.
2032#define r_whole q3
2033#define g_whole q4
2034#define b_whole q5
2035 // Low/high d-register halves of the *_whole registers.
2036#define r_whole_low d6
2037#define r_whole_high d7
2038#define g_whole_low d8
2039#define g_whole_high d9
2040#define b_whole_low d10
2041#define b_whole_high d11
2042 // q6 = { g_whole_8, b_whole_8 }, so green and blue can be handled by one q-register instruction.
2043#define gb_whole_8 q6
2044
2045#define g_whole_8 d12
2046#define b_whole_8 d13
2047
2048#define r_whole_8 d14
2049 // Packed 16-bit output pixels.
2050#define pixels q8
2051 // Steps scaled by 4 and 8 pixels; dx4 and dx8 are the same register (q10).
2052#define rg_dx4 d18
2053#define rg_dx8 d19
2054
2055#define dx4 q10
2056#define dx8 q10
2057 // Per-span temporaries; these share d6, q4 and q5 with r_whole_low, g_whole and b_whole.
2058#define v_left_x d6
2059#define uvrg q4
2060#define block_span q5
2061 // rg is the high half of uvrg (q4).
2062#define rg d9
2063 // Byte constants; d64_4 is the low half of d128_4 (q12).
2064#define d64_1 d22
2065#define d64_128 d23
2066
2067#define d128_4 q12
2068#define d128_0x7 q13
2069
2070#define d64_4 d24
2071
2072#define dither_offsets q14
2073#define draw_mask q15
2074
2075#define dither_offsets_low d28
2076 // rg_dx is the low half of r_block (q0); test_mask shares q10 with dx4/dx8.
2077#define rg_dx d0
2078#define test_mask q10
2080
/*
 * Dither hooks for the shaded untextured builders.
 * dither_a (dithered): saturating unsigned add of the dither offsets to the
 *   r bytes and to the packed g/b bytes.
 * dither_b (dithered): saturating unsigned subtract of 4 (d64_4 / d128_4)
 *   from the same bytes.
 * The undithered variants expand to nothing.
 */
2081#define setup_blocks_shaded_untextured_dither_a_dithered() \
2082 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2083 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2084
2085#define setup_blocks_shaded_untextured_dither_b_dithered() \
2086 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2087 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2088
2089#define setup_blocks_shaded_untextured_dither_a_undithered() \
2090
2091#define setup_blocks_shaded_untextured_dither_b_undithered() \
2092
2093
/*
 * setup_blocks_shaded_untextured_indirect_builder(dithering)
 *
 * Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect.
 * In: r0 = psx_gpu. For every span it interpolates r, g and b, packs them
 * into 16-bit pixels and queues one 64-byte block per 8 pixels: draw mask
 * at +0, pixels at +16, fb_ptr at +44. The buffer is flushed through
 * flush_render_block_buffer when a span would take the total past
 * MAX_BLOCKS.
 * r4 - r11 are saved and restored; NEON registers are not.
 *
 * Fix: the flush path now also pushes r4, purely as padding, so that sp is
 * 8-byte aligned at the bl (36 + 8 + 28 = 72 bytes below the entry sp,
 * assuming the entry sp was 8-byte aligned).
 */
2094#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2095.align 3; \
2096 /* Load the span count and the r/g half of the step vector. */ \
2097function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2098 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2099 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2100 \
2101 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2102 /* No spans: return before anything is pushed. */ \
2103 cmp num_spans, #0; \
2104 bxeq lr; \
2105 /* Steps scaled by 4 and by 8 pixels. */ \
2106 stmdb sp!, { r4 - r11, r14 }; \
2107 vshl.u32 rg_dx4, rg_dx, #2; \
2108 \
2109 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2110 vshl.u32 rg_dx8, rg_dx, #3; \
2111 \
2112 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2113 \
2114 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2115 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2116 \
2117 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2118 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2119 /* Append after the num_blocks entries (64 bytes each) already queued; set up the byte constants. */ \
2120 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2121 vmov.u8 d64_1, #1; \
2122 \
2123 vmov.u8 d128_4, #4; \
2124 vmov.u8 d64_128, #128; \
2125 \
2126 vmov.u8 d128_0x7, #0x7; \
2127 /* Per-span loop. */ \
2128 0: \
2129 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2130 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2131 \
2132 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2133 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2134 /* Spans with no blocks are skipped. */ \
2135 cmp span_num_blocks, #0; \
2136 beq 1f; \
2137 \
2138 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2139 add num_blocks, span_num_blocks, num_blocks; \
2140 /* Flush first if this span takes the total past MAX_BLOCKS. */ \
2141 cmp num_blocks, #MAX_BLOCKS; \
2142 bgt 2f; \
2143 /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; dither row chosen by y & 3. */ \
2144 3: \
2145 ldr b, [ span_b_offset ]; \
2146 add fb_ptr, fb_ptr, y, lsl #11; \
2147 \
2148 vdup.u32 v_left_x, left_x; \
2149 and y, y, #0x3; \
2150 \
2151 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2152 add fb_ptr, fb_ptr, left_x, lsl #1; \
2153 /* Start values at left_x: b += b_dx * left_x, rg += rg_dx * left_x. */ \
2154 mla b, b_dx, left_x, b; \
2155 and dither_shift, left_x, #0x03; \
2156 /* rg_dx (d0, the low half of r_block) is rebuilt from rg_dx4 for every span. */ \
2157 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2158 vshr.u32 rg_dx, rg_dx4, #2; \
2159 \
2160 mov dither_shift, dither_shift, lsl #3; \
2161 vmla.u32 rg, rg_dx, v_left_x; \
2162 /* The flags set by this subs are consumed by the beq 5f further down. */ \
2163 mov c_64, #64; \
2164 subs span_num_blocks, span_num_blocks, #1; \
2165 /* Rotate the dither row by 8 bits per pixel of (left_x & 3). */ \
2166 mov dither_row, dither_row, ror dither_shift; \
2167 mov b_dx4, b_dx, lsl #2; \
2168 \
2169 vdup.u32 dither_offsets, dither_row; \
2170 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2171 /* Every dither byte gets 4 added; the dither_b hook subtracts 4 again with saturation. */ \
2172 vdup.u32 b_block, b; \
2173 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2174 /* Broadcast each start value, then add the three consecutive block_span vectors. */ \
2175 mov b_dx8, b_dx, lsl #3; \
2176 vdup.u32 r_block, rg[0]; \
2177 vdup.u32 g_block, rg[1]; \
2178 \
2179 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2180 \
2181 vadd.u32 r_block, r_block, block_span; \
2182 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2183 \
2184 vadd.u32 g_block, g_block, block_span; \
2185 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2186 \
2187 vadd.u32 b_block, b_block, block_span; \
2188 add block_ptr_b, block_ptr_a, #16; \
2189 /* Pixels 0-3: upper 16 bits of each lane. Pixels 4-7: upper 16 bits of (lane + 4 * step). */ \
2190 vshrn.u32 r_whole_low, r_block, #16; \
2191 vshrn.u32 g_whole_low, g_block, #16; \
2192 vshrn.u32 b_whole_low, b_block, #16; \
2193 vdup.u32 dx4, rg_dx4[0]; \
2194 \
2195 vaddhn.u32 r_whole_high, r_block, dx4; \
2196 vdup.u32 dx4, rg_dx4[1]; \
2197 \
2198 vaddhn.u32 g_whole_high, g_block, dx4; \
2199 vdup.u32 dx4, b_dx4; \
2200 \
2201 vaddhn.u32 b_whole_high, b_block, dx4; \
2202 vdup.u32 dx8, rg_dx8[0]; \
2203 /* Advance every accumulator by 8 pixels. */ \
2204 vadd.u32 r_block, r_block, dx8; \
2205 vdup.u32 dx8, rg_dx8[1]; \
2206 \
2207 vadd.u32 g_block, g_block, dx8; \
2208 vdup.u32 dx8, b_dx8; \
2209 \
2210 vadd.u32 b_block, b_block, dx8; \
2211 /* Narrow to bytes. */ \
2212 vmovn.u16 r_whole_8, r_whole; \
2213 vmovn.u16 g_whole_8, g_whole; \
2214 vmovn.u16 b_whole_8, b_whole; \
2215 /* Single-block span: go straight to the last-block code. Otherwise blocks use an all-zero draw mask. */ \
2216 beq 5f; \
2217 veor.u32 draw_mask, draw_mask, draw_mask; \
2218 /* Main loop: pack and store the current block while computing the next one. */ \
2219 4: \
2220 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2221 vshrn.u32 r_whole_low, r_block, #16; \
2222 \
2223 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2224 vshrn.u32 g_whole_low, g_block, #16; \
2225 \
2226 vshrn.u32 b_whole_low, b_block, #16; \
2227 str fb_ptr, [ block_ptr_a, #44 ]; \
2228 /* r >>= 3; g and b have their low 3 bits cleared. */ \
2229 vdup.u32 dx4, rg_dx4[0]; \
2230 vshr.u8 r_whole_8, r_whole_8, #3; \
2231 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2232 \
2233 vaddhn.u32 r_whole_high, r_block, dx4; \
2234 vdup.u32 dx4, rg_dx4[1]; \
2235 \
2236 vaddhn.u32 g_whole_high, g_block, dx4; \
2237 vdup.u32 dx4, b_dx4; \
2238 \
2239 vaddhn.u32 b_whole_high, b_block, dx4; \
2240 vdup.u32 dx8, rg_dx8[0]; \
2241 /* pixels = r * 1 + g * 4 + b * 128, widened to 16 bits (5-bit r, g at bit 5, b at bit 10). */ \
2242 vmull.u8 pixels, r_whole_8, d64_1; \
2243 vmlal.u8 pixels, g_whole_8, d64_4; \
2244 vmlal.u8 pixels, b_whole_8, d64_128; \
2245 \
2246 vadd.u32 r_block, r_block, dx8; \
2247 vdup.u32 dx8, rg_dx8[1]; \
2248 \
2249 vadd.u32 g_block, g_block, dx8; \
2250 vdup.u32 dx8, b_dx8; \
2251 \
2252 vadd.u32 b_block, b_block, dx8; \
2253 add fb_ptr, fb_ptr, #16; \
2254 \
2255 vmovn.u16 r_whole_8, r_whole; \
2256 vmovn.u16 g_whole_8, g_whole; \
2257 vmovn.u16 b_whole_8, b_whole; \
2258 /* Draw mask at +0, pixels at +16; both pointers step by 64 bytes. */ \
2259 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2260 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2261 \
2262 pld [ fb_ptr ]; \
2263 \
2264 subs span_num_blocks, span_num_blocks, #1; \
2265 bne 4b; \
2266 /* Last block of the span: the draw mask is right_mask tested against test_mask (read from offset 0 of psx_gpu). */ \
2267 5: \
2268 str fb_ptr, [ block_ptr_a, #44 ]; \
2269 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2270 \
2271 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2272 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2273 \
2274 vshr.u8 r_whole_8, r_whole_8, #3; \
2275 vdup.u8 draw_mask, right_mask; \
2276 \
2277 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2278 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2279 \
2280 vtst.u16 draw_mask, draw_mask, test_mask; \
2281 \
2282 vmull.u8 pixels, r_whole_8, d64_1; \
2283 vmlal.u8 pixels, g_whole_8, d64_4; \
2284 vmlal.u8 pixels, b_whole_8, d64_128; \
2285 \
2286 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2287 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2288 /* Advance the three span pointers (16, 4 and 8 bytes per span). */ \
2289 1: \
2290 add span_uvrg_offset, span_uvrg_offset, #16; \
2291 add span_b_offset, span_b_offset, #4; \
2292 \
2293 add span_edge_data, span_edge_data, #8; \
2294 subs num_spans, num_spans, #1; \
2295 \
2296 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2297 bne 0b; \
2298 \
2299 ldmia sp!, { r4 - r11, pc }; \
2300 /* Block buffer full: flush it, then restart this span at the first block slot. */ \
2301 2: \
2302 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2303 vpush { rg_dx4 }; \
2304 /* r4 is pushed only as padding so that sp is 8-byte aligned at the call. */ \
2305 stmdb sp!, { r0 - r4, r12, r14 }; \
2306 bl flush_render_block_buffer; \
2307 ldmia sp!, { r0 - r4, r12, r14 }; \
2308 \
2309 vpop { rg_dx4 }; \
2310 /* The byte constants were not saved across the call; set them up again. */ \
2311 vmov.u8 d64_1, #1; \
2312 vmov.u8 d128_4, #4; \
2313 vmov.u8 d64_128, #128; \
2314 vmov.u8 d128_0x7, #0x7; \
2315 /* rg_dx8 was not saved; rebuild it as 2 * rg_dx4. */ \
2316 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2317 \
2318 mov num_blocks, span_num_blocks; \
2319 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2320 bal 3b \
2321
2322
2323setup_blocks_shaded_untextured_indirect_builder(undithered)
2324setup_blocks_shaded_untextured_indirect_builder(dithered)
2325
2326
2327#undef draw_mask
2328
2329#define mask_msb_ptr r14
2330
2331#define draw_mask q0
2332#define pixels_low d16
3867c6ef 2333#define pixels_high d17
75e28f62
E
2334
2335
2336
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to the function
 *   setup_blocks_shaded_untextured_<dithering>_unswizzled_direct
 * which walks the span list in psx_gpu and writes colour-interpolated,
 * untextured pixels through fb_ptr (all pixel stores in this macro go to
 * fb_ptr; no block buffer store is issued here).
 *
 * In:  psx_gpu (r0, see the register aliases at the top of the file).
 * Returns immediately (bxeq lr) when the span count is zero; otherwise saves
 * r4-r11 and lr and returns by popping pc.
 *
 * Per span (8 bytes of edge data each: left_x, num_blocks, right_mask, y):
 *   - fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1)
 *   - r/g start values come from the span's uvrg entry plus rg_dx * left_x,
 *     b from span_b_offset plus b_dx * left_x; block_span offsets are then
 *     added per lane.
 *   - the dither row is picked by (y & 3) and rotated by (left_x & 3) bytes.
 *   - all blocks but the last are stored whole ("2:" loop); the last block is
 *     stored partially, with the pixel count derived from right_mask ("3:").
 *
 * Pixel packing: r is shifted right by 3, g and b have their low 3 bits
 * cleared, and the 16-bit pixel is accumulated as r*1 + g*4 + b*128 on top of
 * msb_mask.
 *
 * The two setup_blocks_shaded_untextured_dither_{a,b}_<dithering>() helper
 * macros are defined outside this section of the file.
 *
 * NOTE(review): block_ptr_a is read at "add block_ptr_b, block_ptr_a, #16"
 * but is not initialised anywhere inside this macro, and block_ptr_b is not
 * used again in the visible body - looks like a leftover from the indirect
 * variant; confirm before removing.
 */
2337#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2338.align 3; \
2339 \
2340function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
2341 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2342 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2343 \
2344 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2345 \
2346 cmp num_spans, #0; \
2347 bxeq lr; /* nothing to draw: return before touching the stack */ \
2348 \
2349 stmdb sp!, { r4 - r11, r14 }; \
2350 vshl.u32 rg_dx4, rg_dx, #2; /* r/g step for 4 pixels */ \
2351 \
2352 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2353 vshl.u32 rg_dx8, rg_dx, #3; /* r/g step for 8 pixels (one block) */ \
2354 \
2355 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2356 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2357 \
2358 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2359 vmov.u8 d64_1, #1; \
2360 \
2361 vmov.u8 d128_4, #4; \
2362 vmov.u8 d64_128, #128; \
2363 \
2364 vmov.u8 d128_0x7, #0x7; \
2365 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
2366 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; /* replicate the 16-bit mask value to every lane */ \
2367 \
2368 0: /* ---- per-span loop ---- */ \
2369 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2370 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2371 \
2372 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2373 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2374 \
2375 cmp span_num_blocks, #0; \
2376 beq 1f; /* empty span: skip straight to the span advance */ \
2377 \
2378 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2379 add fb_ptr, fb_ptr, y, lsl #11; /* row offset: y * 2048 bytes */ \
2380 \
2381 ldr b, [ span_b_offset ]; \
2382 vdup.u32 v_left_x, left_x; \
2383 and y, y, #0x3; /* dither table row index */ \
2384 \
2385 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2386 add fb_ptr, fb_ptr, left_x, lsl #1; /* column offset: 2 bytes per pixel */ \
2387 \
2388 mla b, b_dx, left_x, b; /* b += b_dx * left_x */ \
2389 and dither_shift, left_x, #0x03; \
2390 \
2391 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2392 vshr.u32 rg_dx, rg_dx4, #2; /* rebuild the 1-pixel step from rg_dx4 */ \
2393 \
2394 mov dither_shift, dither_shift, lsl #3; /* byte index -> bit rotate amount */ \
2395 vmla.u32 rg, rg_dx, v_left_x; /* rg += rg_dx * left_x */ \
2396 \
2397 subs span_num_blocks, span_num_blocks, #1; /* flags are consumed by "beq 3f" below */ \
2398 \
2399 mov dither_row, dither_row, ror dither_shift; \
2400 mov b_dx4, b_dx, lsl #2; \
2401 \
2402 vdup.u32 dither_offsets, dither_row; \
2403 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2404 \
2405 vdup.u32 b_block, b; \
2406 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2407 \
2408 mov b_dx8, b_dx, lsl #3; \
2409 vdup.u32 r_block, rg[0]; \
2410 vdup.u32 g_block, rg[1]; \
2411 \
2412 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2413 \
2414 vadd.u32 r_block, r_block, block_span; \
2415 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2416 \
2417 vadd.u32 g_block, g_block, block_span; \
2418 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2419 \
2420 vadd.u32 b_block, b_block, block_span; \
2421 add block_ptr_b, block_ptr_a, #16; \
2422 \
2423 vshrn.u32 r_whole_low, r_block, #16; /* pixels 0-3: take the high halfword of each lane */ \
2424 vshrn.u32 g_whole_low, g_block, #16; \
2425 vshrn.u32 b_whole_low, b_block, #16; \
2426 vdup.u32 dx4, rg_dx4[0]; \
2427 \
2428 vaddhn.u32 r_whole_high, r_block, dx4; /* pixels 4-7: high halfword of (value + 4 * dx) */ \
2429 vdup.u32 dx4, rg_dx4[1]; \
2430 \
2431 vaddhn.u32 g_whole_high, g_block, dx4; \
2432 vdup.u32 dx4, b_dx4; \
2433 \
2434 vaddhn.u32 b_whole_high, b_block, dx4; \
2435 vdup.u32 dx8, rg_dx8[0]; \
2436 \
2437 vadd.u32 r_block, r_block, dx8; /* advance the accumulators to the next block */ \
2438 vdup.u32 dx8, rg_dx8[1]; \
2439 \
2440 vadd.u32 g_block, g_block, dx8; \
2441 vdup.u32 dx8, b_dx8; \
2442 \
2443 vadd.u32 b_block, b_block, dx8; \
2444 \
2445 vmovn.u16 r_whole_8, r_whole; \
2446 vmovn.u16 g_whole_8, g_whole; \
2447 vmovn.u16 b_whole_8, b_whole; \
2448 \
2449 beq 3f; /* span has exactly one block: only the partial store path runs */ \
2450 \
2451 2: /* ---- full-block loop: pack the current block, precompute the next ---- */ \
2452 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2453 vshrn.u32 r_whole_low, r_block, #16; \
2454 \
2455 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2456 vshrn.u32 g_whole_low, g_block, #16; \
2457 \
2458 vshrn.u32 b_whole_low, b_block, #16; \
2459 \
2460 vdup.u32 dx4, rg_dx4[0]; \
2461 vshr.u8 r_whole_8, r_whole_8, #3; \
2462 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2463 \
2464 vaddhn.u32 r_whole_high, r_block, dx4; \
2465 vdup.u32 dx4, rg_dx4[1]; \
2466 \
2467 vmov pixels, msb_mask; /* start the accumulator from the mask bits */ \
2468 vaddhn.u32 g_whole_high, g_block, dx4; \
2469 vdup.u32 dx4, b_dx4; \
2470 \
2471 vaddhn.u32 b_whole_high, b_block, dx4; \
2472 vdup.u32 dx8, rg_dx8[0]; \
2473 \
2474 vmlal.u8 pixels, r_whole_8, d64_1; \
2475 vmlal.u8 pixels, g_whole_8, d64_4; \
2476 vmlal.u8 pixels, b_whole_8, d64_128; \
2477 \
2478 vadd.u32 r_block, r_block, dx8; \
2479 vdup.u32 dx8, rg_dx8[1]; \
2480 \
2481 vadd.u32 g_block, g_block, dx8; \
2482 vdup.u32 dx8, b_dx8; \
2483 \
2484 vadd.u32 b_block, b_block, dx8; \
2485 \
2486 vmovn.u16 r_whole_8, r_whole; \
2487 vmovn.u16 g_whole_8, g_whole; \
2488 vmovn.u16 b_whole_8, b_whole; \
2489 \
2490 vst1.u32 { pixels }, [ fb_ptr ]!; /* store the whole block and advance fb_ptr */ \
2491 subs span_num_blocks, span_num_blocks, #1; \
2492 bne 2b; \
2493 \
2494 3: /* ---- last block of the span: partial store selected by right_mask ---- */ \
2495 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2496 \
3867c6ef 2497 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
75e28f62
E
2498 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2499 \
2500 vshr.u8 r_whole_8, r_whole_8, #3; \
3867c6ef 2501 rbit right_mask, right_mask; \
75e28f62
E
2502 vmov pixels, msb_mask; \
2503 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2504 clz right_mask, right_mask; /* rbit + clz = number of trailing zero bits of the mask */ \
75e28f62
E
2505 \
2506 vmlal.u8 pixels, r_whole_8, d64_1; \
2507 vmlal.u8 pixels, g_whole_8, d64_4; \
2508 vmlal.u8 pixels, b_whole_8, d64_128; \
2509 \
3867c6ef
E
2510 ldr pc, [ pc, right_mask, lsl #2 ]; /* ARM state: pc reads as this insn + 8, so index 1 hits the first .word; index 0 is not a valid entry */ \
2511 nop; \
2512 nop; \
2513 .word 4f; /* 1 pixel */ \
2514 .word 5f; /* 2 pixels */ \
2515 .word 6f; /* 3 pixels */ \
2516 .word 7f; /* 4 pixels */ \
2517 .word 8f; /* 5 pixels */ \
2518 .word 9f; /* 6 pixels */ \
2519 .word 10f; /* 7 pixels */ \
2520 .word 11f; /* 8 pixels */ \
2521 \
75e28f62 2522 4: \
3867c6ef
E
2523 vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \
2524 bal 1f; \
2525 \
2526 5: \
2527 vst1.u32 { pixels_low[0] }, [ fb_ptr ]; \
2528 bal 1f; \
2529 \
2530 6: \
2531 vst1.u32 { pixels_low[0] }, [ fb_ptr ]!; \
2532 vst1.u16 { pixels_low[2] }, [ fb_ptr ]; \
2533 bal 1f; \
2534 \
2535 7: \
2536 vst1.u32 { pixels_low }, [ fb_ptr ]; \
2537 bal 1f; \
2538 \
2539 8: \
2540 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2541 vst1.u16 { pixels_high[0] }, [ fb_ptr ]; \
2542 bal 1f; \
2543 \
2544 9: \
2545 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2546 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2547 bal 1f; \
2548 \
2549 10: \
2550 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2551 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2552 vst1.u16 { pixels_high[2] }, [ fb_ptr ]; \
2553 bal 1f; \
2554 \
2555 11: \
2556 vst1.u32 { pixels }, [ fb_ptr ]; \
2557 bal 1f; \
75e28f62
E
2558 \
2559 1: /* ---- advance to the next span ---- */ \
2560 add span_uvrg_offset, span_uvrg_offset, #16; \
2561 add span_b_offset, span_b_offset, #4; \
2562 \
2563 add span_edge_data, span_edge_data, #8; \
2564 subs num_spans, num_spans, #1; \
2565 \
2566 bne 0b; \
2567 \
2568 ldmia sp!, { r4 - r11, pc } \
2569
2570setup_blocks_shaded_untextured_direct_builder(undithered)
2571setup_blocks_shaded_untextured_direct_builder(dithered)
2572
2573
2574#undef psx_gpu
2575#undef num_blocks
2576#undef triangle
2577#undef c_64
2578
2579#define psx_gpu r0
2580#define block_ptr r1
2581#define num_blocks r2
2582#define uv_01 r3
2583#define uv_23 r4
2584#define uv_45 r5
2585#define uv_67 r6
2586#define uv_0 r7
2587#define uv_1 r3
2588#define uv_2 r8
2589#define uv_3 r4
2590#define uv_4 r9
2591#define uv_5 r5
2592#define uv_6 r10
2593#define uv_7 r6
2594#define texture_ptr r11
2595
2596#define pixel_0 r7
2597#define pixel_1 r3
2598#define pixel_2 r8
2599#define pixel_3 r4
2600#define pixel_4 r9
2601#define pixel_5 r5
2602#define pixel_6 r10
2603#define pixel_7 r6
2604
2605#define pixels_a r7
2606#define pixels_b r9
2607#define pixels_c r8
2608#define pixels_d r10
2609
2610#define c_64 r0
2611
2612#define clut_ptr r12
2613#define current_texture_mask r5
2614#define dirty_textures_mask r6
2615
2616#define texels d0
2617
2618#define clut_low_a d2
2619#define clut_low_b d3
2620#define clut_high_a d4
2621#define clut_high_b d5
2622
2623#define clut_a q1
2624#define clut_b q2
2625
2626#define texels_low d6
2627#define texels_high d7
2628
2629.align 3
2630
/*
 * texture_blocks_untextured
 *
 * Texturing stage for untextured primitives: there is nothing to fetch, so
 * the function returns immediately without touching any register or memory.
 */
2631function(texture_blocks_untextured)
2632 bx lr
2633
2634
2635.align 3
2636
/*
 * texture_blocks_4bpp
 *
 * In: psx_gpu (r0).
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this replaces the eight 16-bit UV offsets
 * at the start of the block with eight 16-bit pixels:
 *   1. each UV halfword is added to texture_ptr and one byte is loaded from
 *      that address,
 *   2. the eight bytes are used as table indices (vtbl) into the CLUT, whose
 *      32 bytes were split into a low-byte table and a high-byte table by
 *      vuzp.u8,
 *   3. vst2.u8 re-interleaves low/high bytes while storing to the block.
 *
 * If (dirty_textures_4bpp_mask & current_texture_mask) is non-zero, the slow
 * path at "1:" calls update_texture_4bpp_cache first (r0 still holds psx_gpu
 * at that point).
 *
 * Register aliasing to keep in mind when editing: c_64 is r0 (psx_gpu is
 * gone once c_64 is set), uv_1/3/5/7 reuse the uv_01/23/45/67 registers, and
 * pixel_N reuses uv_N.
 *
 * NOTE(review): num_blocks == 0 is not checked; the loop body runs before
 * the first "bne" test. Presumably callers guarantee at least one block -
 * confirm.
 */
2637function(texture_blocks_4bpp)
2638 stmdb sp!, { r3 - r11, r14 }
2639 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2640
2641 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2642 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2643
2644 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
/* 32 bytes of CLUT data into q1/q2 */
2645 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]
2646
2647 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
/* de-interleave: clut_a = even (low) bytes, clut_b = odd (high) bytes */
2648 vuzp.u8 clut_a, clut_b
2649
2650 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
2651 tst dirty_textures_mask, current_texture_mask
2652
2653 bne 1f
/* from here on r0 is the block stride, not psx_gpu */
2654 mov c_64, #64
2655
26560:
2657 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2658
/* uxtah: dst = texture_ptr + zero-extended halfword (low half, or high half with ror #16) */
2659 uxtah uv_0, texture_ptr, uv_01
2660 uxtah uv_1, texture_ptr, uv_01, ror #16
2661
2662 uxtah uv_2, texture_ptr, uv_23
2663 uxtah uv_3, texture_ptr, uv_23, ror #16
2664
2665 uxtah uv_4, texture_ptr, uv_45
2666 ldrb pixel_0, [ uv_0 ]
2667
2668 uxtah uv_5, texture_ptr, uv_45, ror #16
2669 ldrb pixel_1, [ uv_1 ]
2670
2671 uxtah uv_6, texture_ptr, uv_67
2672 ldrb pixel_2, [ uv_2 ]
2673
2674 uxtah uv_7, texture_ptr, uv_67, ror #16
2675 ldrb pixel_3, [ uv_3 ]
2676
2677 ldrb pixel_4, [ uv_4 ]
/* flags set here survive to the "bne 0b" below: nothing in between writes them */
2678 subs num_blocks, num_blocks, #1
2679
2680 ldrb pixel_5, [ uv_5 ]
2681 orr pixels_a, pixel_0, pixel_1, lsl #8
2682
2683 ldrb pixel_6, [ uv_6 ]
2684 orr pixels_b, pixel_4, pixel_5, lsl #8
2685
2686 ldrb pixel_7, [ uv_7 ]
2687 orr pixels_a, pixels_a, pixel_2, lsl #16
2688
2689 orr pixels_b, pixels_b, pixel_6, lsl #16
2690 orr pixels_a, pixels_a, pixel_3, lsl #24
2691
2692 orr pixels_b, pixels_b, pixel_7, lsl #24
/* eight index bytes into one d register */
2693 vmov.u32 texels, pixels_a, pixels_b
2694
/* two-register vtbl = 16-entry byte table; an index above 15 yields 0 */
2695 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2696 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2697
/* interleave low/high bytes back into 16-bit pixels; block_ptr += 64 */
2698 vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
2699 bne 0b
2700
2701 ldmia sp!, { r3 - r11, pc }
2702
27031:
/*
 * Slow path: refresh the 4bpp texture cache. Only block_ptr (r1) and
 * num_blocks (r2) are preserved around the call; r0 is reloaded with 64
 * afterwards.
 * NOTE(review): the CLUT tables stay in q1/q2 across this call, and those
 * are caller-saved NEON registers under AAPCS - this relies on
 * update_texture_4bpp_cache not touching them; confirm against the callee.
 */
2704 stmdb sp!, { r1 - r2 }
2705 bl update_texture_4bpp_cache
2706
2707 mov c_64, #64
2708 ldmia sp!, { r1 - r2 }
2709 bal 0b
2710
2711
2712.align 3
2713
/*
 * texture_blocks_8bpp
 *
 * In: psx_gpu (r0).
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this replaces the eight 16-bit UV offsets
 * at the start of the block with eight 16-bit pixels:
 *   1. each UV halfword is added to texture_ptr and one index byte is loaded,
 *   2. the index is doubled and used to load a halfword from clut_ptr,
 *   3. pixel pairs are packed into four words and stored back to the block.
 *
 * If (dirty_textures_8bpp_mask & current_texture_mask) is non-zero, the slow
 * path at "1:" calls update_texture_8bpp_cache first (r0 still holds psx_gpu
 * there).
 *
 * Register aliasing to keep in mind when editing: uv_1/3/5/7 reuse the
 * uv_01/23/45/67 registers and pixel_N reuses uv_N.
 *
 * NOTE(review): num_blocks == 0 is not checked; the loop body runs before
 * the first "bne" test. Presumably callers guarantee at least one block -
 * confirm.
 */
function(texture_blocks_8bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  nop

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* uxtah: dst = texture_ptr + zero-extended halfword (low, or high via ror) */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  /* index * 2 = byte offset of the 16-bit CLUT entry */
  ldrb pixel_4, [ uv_4 ]
  add pixel_0, pixel_0, pixel_0

  ldrb pixel_5, [ uv_5 ]
  add pixel_1, pixel_1, pixel_1

  ldrb pixel_6, [ uv_6 ]
  add pixel_2, pixel_2, pixel_2

  ldrb pixel_7, [ uv_7 ]
  add pixel_3, pixel_3, pixel_3

  ldrh pixel_0, [ clut_ptr, pixel_0 ]
  add pixel_4, pixel_4, pixel_4

  ldrh pixel_1, [ clut_ptr, pixel_1 ]
  add pixel_5, pixel_5, pixel_5

  ldrh pixel_2, [ clut_ptr, pixel_2 ]
  add pixel_6, pixel_6, pixel_6

  ldrh pixel_3, [ clut_ptr, pixel_3 ]
  add pixel_7, pixel_7, pixel_7

  ldrh pixel_4, [ clut_ptr, pixel_4 ]
  orr pixels_a, pixel_0, pixel_1, lsl #16

  ldrh pixel_5, [ clut_ptr, pixel_5 ]
  orr pixels_c, pixel_2, pixel_3, lsl #16

  /* flags set here survive to "bne 0b": nothing in between writes them */
  ldrh pixel_6, [ clut_ptr, pixel_6 ]
  subs num_blocks, num_blocks, #1

  ldrh pixel_7, [ clut_ptr, pixel_7 ]
  orr pixels_b, pixel_4, pixel_5, lsl #16

  /* stm stores in ascending register order: a (r7), c (r8), b (r9), d (r10),
     which is pixel order 0-1, 2-3, 4-5, 6-7 */
  orr pixels_d, pixel_6, pixel_7, lsl #16
  stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }

  add block_ptr, block_ptr, #64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

1:
  /*
   * Slow path: refresh the 8bpp texture cache. block_ptr (r1), num_blocks
   * (r2) and clut_ptr (r12) are caller-saved and must survive the call.
   * r0 is pushed as well purely as padding: with the 10-word frame above,
   * a 3-word push would leave sp only 4-byte aligned at the call, whereas
   * AAPCS requires 8-byte alignment at a public interface.
   */
  stmdb sp!, { r0 - r2, r12 }

  bl update_texture_8bpp_cache

  ldmia sp!, { r0 - r2, r12 }
  bal 0b
2802
2803
2804#undef uv_0
2805#undef uv_1
2806#undef uv_2
2807#undef uv_3
2808#undef uv_4
2809#undef uv_5
2810#undef uv_6
2811#undef uv_7
2812
2813#undef pixel_0
2814#undef pixel_1
2815#undef pixel_2
2816#undef pixel_3
2817#undef pixel_4
2818#undef pixel_5
2819#undef pixel_6
2820#undef pixel_7
2821
2822#undef texture_ptr
2823
2824#undef pixels_a
2825#undef pixels_b
2826#undef pixels_c
2827#undef pixels_d
2828
2829#define psx_gpu r0
2830#define block_ptr r1
2831#define num_blocks r2
2832
2833#define uv_0 r3
2834#define uv_1 r4
2835#define u_0 r3
2836#define u_1 r4
2837#define v_0 r5
2838#define v_1 r6
2839
2840#define uv_2 r5
2841#define uv_3 r6
2842#define u_2 r5
2843#define u_3 r6
2844#define v_2 r7
2845#define v_3 r8
2846
2847#define uv_4 r7
2848#define uv_5 r8
2849#define u_4 r7
2850#define u_5 r8
2851#define v_4 r9
2852#define v_5 r10
2853
2854#define uv_6 r9
2855#define uv_7 r10
2856#define u_6 r9
2857#define u_7 r10
2858#define v_6 r11
2859#define v_7 r0
2860
2861#define pixel_0 r3
2862#define pixel_1 r4
2863#define pixel_2 r5
2864#define pixel_3 r6
2865#define pixel_4 r7
2866#define pixel_5 r8
2867#define pixel_6 r9
2868#define pixel_7 r10
2869
2870#define pixels_a r3
2871#define pixels_b r5
2872#define pixels_c r7
2873#define pixels_d r9
2874
2875#define texture_ptr r12
2876
2877
2878.align 3
2879
/*
 * texture_blocks_16bpp
 *
 * In: psx_gpu (r0).
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this replaces the eight 16-bit UV values
 * at the start of the block with eight 16-bit texels loaded directly from
 * texture_ptr (no CLUT lookup).
 *
 * Address computation per texel, with u = uv & 0xFF and v = uv & 0xFF00:
 *   offset = (u + (v << 2)) * 2
 * and the texel is the halfword at texture_ptr + offset.
 *
 * Register aliasing makes the instruction order load-bearing: uv_N, u_N and
 * pixel_N share a register, v_N of one pair is the uv register of the next
 * pair, and v_7 is r0, so psx_gpu is overwritten inside the loop (it is not
 * read again after the two loads in the prologue).
 *
 * NOTE(review): num_blocks == 0 is not checked; the loop body runs before
 * the first "bne" test. Presumably callers guarantee at least one block -
 * confirm.
 */
2880function(texture_blocks_16bpp)
2881 stmdb sp!, { r3 - r11, r14 }
2882 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2883
2884 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2885 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2886
28870:
2888 ldrh uv_0, [ block_ptr ]
/* flags set here survive to "bne 0b": no later instruction in the loop sets them */
2889 subs num_blocks, num_blocks, #1
2890
2891 ldrh uv_1, [ block_ptr, #2 ]
2892
/* v must be extracted before u: u_N overwrites the uv_N register */
2893 and v_0, uv_0, #0xFF00
2894 and v_1, uv_1, #0xFF00
2895
2896 and u_0, uv_0, #0xFF
2897 and u_1, uv_1, #0xFF
2898
2899 add uv_0, u_0, v_0, lsl #2
/* uv_2 reuses the v_0 register, which was consumed by the add above */
2900 ldrh uv_2, [ block_ptr, #4 ]
2901
2902 add uv_1, u_1, v_1, lsl #2
2903 ldrh uv_3, [ block_ptr, #6 ]
2904
/* double the texel index to get a byte offset */
2905 add uv_0, uv_0, uv_0
2906 add uv_1, uv_1, uv_1
2907
2908 and v_2, uv_2, #0xFF00
2909 and v_3, uv_3, #0xFF00
2910
2911 and u_2, uv_2, #0xFF
2912 and u_3, uv_3, #0xFF
2913
2914 add uv_2, u_2, v_2, lsl #2
2915 ldrh uv_4, [ block_ptr, #8 ]
2916
2917 add uv_3, u_3, v_3, lsl #2
2918 ldrh uv_5, [ block_ptr, #10 ]
2919
2920 add uv_2, uv_2, uv_2
2921 add uv_3, uv_3, uv_3
2922
2923 and v_4, uv_4, #0xFF00
2924 and v_5, uv_5, #0xFF00
2925
2926 and u_4, uv_4, #0xFF
2927 and u_5, uv_5, #0xFF
2928
2929 add uv_4, u_4, v_4, lsl #2
2930 ldrh uv_6, [ block_ptr, #12 ]
2931
2932 add uv_5, u_5, v_5, lsl #2
2933 ldrh uv_7, [ block_ptr, #14 ]
2934
2935 add uv_4, uv_4, uv_4
2936 ldrh pixel_0, [ texture_ptr, uv_0 ]
2937
2938 add uv_5, uv_5, uv_5
2939 ldrh pixel_1, [ texture_ptr, uv_1 ]
2940
2941 and v_6, uv_6, #0xFF00
2942 ldrh pixel_2, [ texture_ptr, uv_2 ]
2943
/* v_7 is r0: psx_gpu is overwritten from here on */
2944 and v_7, uv_7, #0xFF00
2945 ldrh pixel_3, [ texture_ptr, uv_3 ]
2946
2947 and u_6, uv_6, #0xFF
2948 ldrh pixel_4, [ texture_ptr, uv_4 ]
2949
2950 and u_7, uv_7, #0xFF
2951 ldrh pixel_5, [ texture_ptr, uv_5 ]
2952
2953 add uv_6, u_6, v_6, lsl #2
2954 add uv_7, u_7, v_7, lsl #2
2955
2956 add uv_6, uv_6, uv_6
2957 add uv_7, uv_7, uv_7
2958
/* pack texel pairs: even texel in the low halfword, odd texel in the high one */
2959 orr pixels_a, pixel_0, pixel_1, lsl #16
2960 orr pixels_b, pixel_2, pixel_3, lsl #16
2961
2962 ldrh pixel_6, [ texture_ptr, uv_6 ]
2963 orr pixels_c, pixel_4, pixel_5, lsl #16
2964
2965 ldrh pixel_7, [ texture_ptr, uv_7 ]
2966 orr pixels_d, pixel_6, pixel_7, lsl #16
2967
2968 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2969 add block_ptr, block_ptr, #64
2970
2971 bne 0b
2972
2973 ldmia sp!, { r3 - r11, pc }
2974
2975
2976#undef num_blocks
2977
2978#undef test_mask
2979#undef texels
2980#undef pixels_b
2981#undef pixels
2982#undef d64_1
2983#undef d64_4
2984#undef d64_128
2985#undef draw_mask
2986#undef msb_mask
2987#undef msb_mask_low
2988#undef msb_mask_high
2989#undef fb_pixels
2990
2991#undef c_32
2992#undef fb_ptr
2993#undef mask_msb_ptr
2994
2995#define psx_gpu r0
2996#define num_blocks r1
2997#define color_ptr r2
3867c6ef
E
2998#define colors_scalar r2
2999#define colors_scalar_compare r3
75e28f62
E
3000#define mask_msb_ptr r2
3001
3002#define block_ptr_load_a r0
3003#define block_ptr_store r3
3004#define block_ptr_load_b r12
3005#define c_32 r2
3006
3007#define c_48 r4
3008#define fb_ptr r14
3009#define draw_mask_bits_scalar r5
3010
3011#define d128_0x07 q0
3012#define d128_0x1F q1
3013#define d128_0x8000 q2
3014#define test_mask q3
3015#define texels q4
3016#define colors_rg q5
3017#define colors_b_dm_bits q6
3018#define texels_rg q7
3019#define pixels_r q8
3020#define pixels_g q9
3021#define pixels_b q10
3022#define pixels q11
3023#define zero_mask q4
3024#define draw_mask q12
3025#define msb_mask q13
3026
3027#define fb_pixels q8
3028
3029#define pixels_gb_low q9
3030
3031#define colors_r d10
3032#define colors_g d11
3033#define colors_b d12
3034#define draw_mask_bits d13
3035#define texels_r d14
3036#define texels_g d15
3037#define pixels_r_low d16
3038#define pixels_g_low d18
3039#define pixels_b_low d19
3040#define msb_mask_low d26
3041#define msb_mask_high d27
3042
3043#define d64_1 d28
3044#define d64_4 d29
3045#define d64_128 d30
3046#define texels_b d31
3047
/*
 * Policy macros consumed by shade_blocks_textured_modulated_builder().
 * Each family is selected by token pasting on one of the builder arguments:
 *   _shaded / _unshaded      - per-pixel colours read from the block vs. one
 *                              colour read from psx_gpu_triangle_color_offset
 *   _dithered / _undithered  - dither offsets loaded as the accumulator's
 *                              start value vs. no offsets
 *   _direct / _indirect      - result merged into memory at fb_ptr vs. written
 *                              back through block_ptr_store
 * Several variants are intentionally empty.
 */

/* indirect target: set up the 48-byte store stride and the block store pointer */
3048#define shade_blocks_textured_modulated_prologue_indirect() \
3049 mov c_48, #48; \
3050 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3051
/* direct target: load the 16-bit mask value and replicate it to all lanes of msb_mask */
3052#define shade_blocks_textured_modulated_prologue_direct() \
3053 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3054 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \
3055
75e28f62 3056
3867c6ef
E
/* shaded: nothing to prepare, colours are loaded per block */
3057#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3058
/*
 * unshaded + undithered: if the triangle colour word equals 0x00808080
 * (movw 0x8080 / movt 0x80), tail-branch to the unmodulated variant for the
 * same target instead of running the modulated loop. This runs before the
 * builder pushes anything, so the branch target returns to the original
 * caller.
 */
3059#define shade_blocks_textured_false_modulation_check_undithered(target) \
3060 ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ]; \
3061 movw colors_scalar_compare, #0x8080; \
3062 \
3063 movt colors_scalar_compare, #0x80; \
3064 cmp colors_scalar, colors_scalar_compare; \
3065 beq shade_blocks_textured_unmodulated_##target \
3066
/* unshaded + dithered: no shortcut is taken */
3067#define shade_blocks_textured_false_modulation_check_dithered(target) \
3068
/*
 * unshaded: run the shortcut check above, then load the triangle colour word
 * and replicate its bytes 0, 1, 2 across colors_r, colors_g, colors_b.
 * colors_r is written last because it is also the source of the other two.
 */
3069#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3070 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62
E
3071 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
3072 vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
3073 vdup.u8 colors_g, colors_r[1]; \
3074 vdup.u8 colors_b, colors_r[2]; \
3075 vdup.u8 colors_r, colors_r[0] \
3076
3077
/* dithered: load 16 bytes at block_ptr_load_b into the given accumulator (no pointer update) */
3078#define shade_blocks_textured_modulated_load_dithered(target) \
3079 vld1.u32 { target }, [ block_ptr_load_b, :128 ] \
3080
/* dithered, last of the three loads: same load, then block_ptr_load_b += c_32 */
3081#define shade_blocks_textured_modulated_load_last_dithered(target) \
3082 vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \
3083
/* undithered: no accumulator preload */
3084#define shade_blocks_textured_modulated_load_undithered(target) \
3085
/* undithered, last: only advance block_ptr_load_b by the 32 bytes the dithered load would have */
3086#define shade_blocks_textured_modulated_load_last_undithered(target) \
3087 add block_ptr_load_b, block_ptr_load_b, #32 \
3088
/* dithered: accumulate texel * colour on top of the preloaded value */
3089#define shade_blocks_textured_modulate_dithered(channel) \
3090 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3091
/* undithered: plain widening multiply texel * colour */
3092#define shade_blocks_textured_modulate_undithered(channel) \
3093 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3094
3095
/* indirect: store draw_mask through block_ptr_store (+16 by writeback); offset is unused */
3096#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
3097 vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \
3098
/*
 * direct: despite the name nothing is stored here. The pointer word at
 * block_ptr_load_b + (offset - 64) is loaded into fb_ptr, 16 bytes are read
 * from it, and vbit copies those existing pixels into "pixels" wherever
 * draw_mask bits are set.
 */
3099#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
3100 ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
3101 vld1.u32 { fb_pixels }, [ fb_ptr ]; \
3102 vbit.u16 pixels, fb_pixels, draw_mask \
3103
/* indirect: store the pixels after the draw mask, then block_ptr_store += c_48 */
3104#define shade_blocks_textured_modulated_store_pixels_indirect() \
3105 vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \
3106
/* direct: write the merged pixels back to fb_ptr */
3107#define shade_blocks_textured_modulated_store_pixels_direct() \
3108 vst1.u32 { pixels }, [ fb_ptr ] \
3109
3110
/* shaded: load colors_r/colors_g (16 bytes) from the block; block_ptr_load_b += c_32 */
3111#define shade_blocks_textured_modulated_load_rg_shaded() \
3112 vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \
3113
/* unshaded: colours are constant, only advance the pointer by the same 32 bytes */
3114#define shade_blocks_textured_modulated_load_rg_unshaded() \
3115 add block_ptr_load_b, block_ptr_load_b, #32 \
3116
/* shaded: load colors_b and the draw-mask bits (16 bytes); block_ptr_load_a += c_32 */
3117#define shade_blocks_textured_modulated_load_bdm_shaded() \
3118 vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \
3119
/* unshaded: fetch only the draw-mask word at +8 into a core register, then advance by 32 */
3120#define shade_blocks_textured_modulated_load_bdm_unshaded() \
3121 ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
3122 add block_ptr_load_a, block_ptr_load_a, #32 \
3123
/* shaded: replicate 16-bit lane 0 of draw_mask_bits across draw_mask */
3124#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3125 vdup.u16 draw_mask, draw_mask_bits[0] \
3126
/* unshaded: replicate the low 16 bits of the scalar across draw_mask */
3127#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3128 vdup.u16 draw_mask, draw_mask_bits_scalar \
3129
3130
/* indirect: msb_mask is not applied at this stage */
3131#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3132
/* direct: OR msb_mask into the pixel accumulator */
3133#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3134 vorr.u16 pixels, pixels, msb_mask \
3135
3136
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Expands to the function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 * which multiplies every texel of every block by a colour and repacks the
 * result as a 16-bit pixel.
 *
 * In: psx_gpu (r0). r4, r5 and lr are saved and restored. Note that
 * block_ptr_load_a is r0 as well, so psx_gpu is gone once the block pointers
 * have been set up.
 *
 * Per block:
 *   - texels are split into three 5-bit channels: r = low byte & 0x1F,
 *     g = (texel >> 5) & 0x1F, b = ((texel >> 7) as byte) >> 3
 *   - each channel is multiplied by its colour (vmull, or vmlal on top of the
 *     preloaded dither offsets), narrowed with unsigned saturation (>> 4),
 *     then r is shifted right by 3 and g/b have their low 3 bits cleared
 *   - the pixel is (texel & 0x8000) [| msb_mask for direct] + r*1 + g*4 + b*128
 *   - draw_mask = vtst(replicated draw-mask bits, test_mask) | (texel == 0)
 *   - <target> decides where draw_mask / pixels go (see the store_* macros)
 *
 * The loop is software pipelined: the body at "0:" loads and unpacks block
 * N+1 while it finishes packing and storing block N, and "1:" is the
 * epilogue that finishes the final block.
 *
 * Register overlap that constrains instruction order:
 *   - zero_mask is q4, the same register as texels, so the
 *     "vand pixels, texels, ..." must stay ahead of the vceq that overwrites it
 *   - texels_rg (q7) is texels_r:texels_g; pixels_gb_low (q9) is
 *     pixels_g_low:pixels_b_low
 *   - fb_pixels (q8) shares a register with pixels_r
 *
 * NOTE(review): q4-q7 (d8-d15) are written without being saved; AAPCS treats
 * d8-d15 as callee-saved. Presumably callers are built not to rely on them -
 * confirm.
 * NOTE(review): num_blocks == 0 is not checked before the first decrement;
 * presumably callers guarantee at least one block - confirm.
 */
3137#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3138.align 3; \
3139 \
3140function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
3867c6ef 3141 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62
E
3142 stmdb sp!, { r4 - r5, lr }; \
3143 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3144 \
3145 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; /* 16 bytes from the very start of psx_gpu */ \
3146 \
3147 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3148 \
3149 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; /* r0 stops being psx_gpu here */ \
3150 mov c_32, #32; \
3151 \
3152 add block_ptr_load_b, block_ptr_load_a, #16; \
3153 vmov.u8 d64_1, #1; \
3154 vmov.u8 d64_4, #4; \
3155 vmov.u8 d64_128, #128; \
3156 \
3157 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; /* prologue: first block's texels */ \
3158 vmov.u8 d128_0x07, #0x07; \
3159 \
3160 shade_blocks_textured_modulated_load_rg_##shading(); \
3161 vmov.u8 d128_0x1F, #0x1F; \
3162 \
3163 shade_blocks_textured_modulated_load_bdm_##shading(); \
3164 vmov.u16 d128_0x8000, #0x8000; \
3165 \
3166 vmovn.u16 texels_r, texels; \
3167 vshrn.u16 texels_g, texels, #5; \
3168 \
3169 vshrn.u16 texels_b, texels, #7; \
3170 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3171 \
3172 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3173 vtst.u16 draw_mask, draw_mask, test_mask; \
3174 \
3175 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3176 vand.u8 texels_rg, texels_rg, d128_0x1F; /* masks r and g together (q7 = texels_r:texels_g) */ \
3177 \
3178 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3179 vshr.u8 texels_b, texels_b, #3; \
3180 \
3181 shade_blocks_textured_modulate_##dithering(r); \
3182 shade_blocks_textured_modulate_##dithering(g); \
3183 shade_blocks_textured_modulate_##dithering(b); \
3184 \
3185 vand.u16 pixels, texels, d128_0x8000; /* keep the texel's bit 15; must precede the vceq below */ \
3186 vceq.u16 zero_mask, texels, #0; /* overwrites texels (same q register) */ \
3187 \
3188 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3189 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3190 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3191 \
3192 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3193 vorr.u16 draw_mask, draw_mask, zero_mask; \
3194 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3195 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3196 \
3197 subs num_blocks, num_blocks, #1; \
3198 beq 1f; /* single block: go straight to the epilogue */ \
3199 \
3200 .align 3; \
3201 \
3202 0: /* ---- steady state: finish block N, start block N+1 ---- */ \
3203 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3204 shade_blocks_textured_modulated_load_rg_##shading(); \
3205 vshrn.u16 texels_g, texels, #5; \
3206 \
3207 shade_blocks_textured_modulated_load_bdm_##shading(); \
3208 vshrn.u16 texels_b, texels, #7; \
3209 \
59d15d23 3210 pld [ block_ptr_load_a ]; \
75e28f62
E
3211 vmovn.u16 texels_r, texels; \
3212 vmlal.u8 pixels, pixels_r_low, d64_1; /* pack block N: + r*1 + g*4 + b*128 */ \
3213 \
3214 vmlal.u8 pixels, pixels_g_low, d64_4; \
3215 vmlal.u8 pixels, pixels_b_low, d64_128; \
3216 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3217 \
3218 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3219 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3220 \
3221 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3222 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3223 \
3224 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3225 vtst.u16 draw_mask, draw_mask, test_mask; \
3226 \
3227 shade_blocks_textured_modulated_store_pixels_##target(); /* block N is written out here */ \
3228 vshr.u8 texels_b, texels_b, #3; \
3229 \
3230 shade_blocks_textured_modulate_##dithering(r); \
3231 shade_blocks_textured_modulate_##dithering(g); \
3232 shade_blocks_textured_modulate_##dithering(b); \
3233 \
3234 vand.u16 pixels, texels, d128_0x8000; \
3235 vceq.u16 zero_mask, texels, #0; \
3236 \
3237 subs num_blocks, num_blocks, #1; /* flags are consumed by "bne 0b"; NEON ops in between leave them alone */ \
3238 \
3239 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3240 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3241 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3242 \
3243 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3244 vorr.u16 draw_mask, draw_mask, zero_mask; \
3245 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3246 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3247 \
3248 bne 0b; \
3249 \
3250 1: /* ---- epilogue: pack and store the final block ---- */ \
3251 vmlal.u8 pixels, pixels_r_low, d64_1; \
3252 vmlal.u8 pixels, pixels_g_low, d64_4; \
3253 vmlal.u8 pixels, pixels_b_low, d64_128; \
3254 \
3255 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3256 shade_blocks_textured_modulated_store_pixels_##target(); \
3257 \
3258 ldmia sp!, { r4 - r5, pc } \
3259
3260
3261shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3262shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3263shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3264shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3265
3266shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3267shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3268shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3269shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3270
3271
3272#undef c_64
3273#undef fb_ptr
3274#undef color_ptr
3275
3276#undef color_r
3277#undef color_g
3278#undef color_b
3279
3280#undef test_mask
3281#undef pixels
3282#undef draw_mask
3283#undef zero_mask
3284#undef fb_pixels
3285#undef msb_mask
3286#undef msb_mask_low
3287#undef msb_mask_high
3288
3289#define psx_gpu r0
3290#define num_blocks r1
3291#define mask_msb_ptr r2
3292#define color_ptr r3
3293
3294#define block_ptr_load r0
3295#define draw_mask_store_ptr r3
3296#define draw_mask_bits_ptr r12
3297#define draw_mask_ptr r12
3298#define pixel_store_ptr r14
3299
3300#define fb_ptr_cmp r4
3301
3302#define fb_ptr r3
3303#define fb_ptr_next r14
3304
3305#define c_64 r2
3306
3307#define test_mask q0
3308#define pixels q1
3309#define draw_mask q2
3310#define zero_mask q3
3311#define draw_mask_combined q4
3312#define fb_pixels q5
3313#define fb_pixels_next q6
3314#define msb_mask q7
3315
3316#define draw_mask_low d4
3317#define draw_mask_high d5
3318#define msb_mask_low d14
3319#define msb_mask_high d15
3320
3321.align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In: psx_gpu (r0).
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - the 8 texels at block offset 0 are copied unchanged to block offset 16
 *   - the 16-bit draw-mask bits at block offset 40 are replicated to every
 *     lane and tested against test_mask (first 16 bytes of psx_gpu)
 *   - draw_mask | (texel == 0) is written to block offset 0
 *
 * The loop is pipelined: the draw mask for block N is stored while block
 * N+1 is being loaded, and "1:" stores the mask of the final block.
 *
 * r14 doubles as pixel_store_ptr, so the return address has to be spilled.
 * It is pushed with a real stack adjustment (together with r4, which is
 * otherwise unused here and only keeps sp 8-byte aligned): memory below sp
 * may be overwritten asynchronously, e.g. by signal delivery, so a return
 * address parked at [sp, #-4] cannot be trusted.
 *
 * This entry point is also reached by a tail branch from the unshaded,
 * undithered modulated variant; that branch happens before anything is
 * pushed there, so lr is still the original caller's return address.
 *
 * NOTE(review): draw_mask_combined is q4 (d8/d9), which AAPCS treats as
 * callee-saved, and it is not preserved here. Presumably callers are built
 * not to rely on it - confirm.
 * NOTE(review): num_blocks == 0 is not checked before the first decrement;
 * presumably callers guarantee at least one block - confirm.
 */
function(shade_blocks_textured_unmodulated_indirect)
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  /* block_ptr_load is r0: this must be the last use of psx_gpu */
  mov c_64, #64
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* prologue: load and classify the first block */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  /* combine block N's masks before the registers are reloaded for N+1 */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

 1:
  /* epilogue: mask of the final block */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64

  ldmia sp!, { r4, pc }
3367
3368
3369.align 3
3370
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In: psx_gpu (r0). r4 and lr are saved and restored.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - 8 texels are read from block offset 0 and OR-ed with msb_mask
 *   - the 16-bit draw-mask bits at block offset 40 are replicated to every
 *     lane, tested against test_mask and OR-ed with (texel == 0)
 *   - the destination pointer is read from block offset 44; 16 bytes are
 *     loaded from it, texels are inserted where the combined mask is clear
 *     (vbif), and the result is written back
 *
 * The loop is pipelined: the destination pixels of block N+1 are loaded
 * before block N is stored, unless the two destinations are within 14 bytes
 * of each other (they would overlap), in which case "4:" stores block N
 * first and only then loads block N+1.
 *
 * NOTE(review): q4-q7 (d8-d15) are written without being saved; AAPCS treats
 * d8-d15 as callee-saved. Presumably callers are built not to rely on them -
 * confirm.
 * NOTE(review): num_blocks == 0 is not checked before the first decrement;
 * presumably callers guarantee at least one block - confirm.
 */
3371function(shade_blocks_textured_unmodulated_direct)
3372 stmdb sp!, { r4, r14 }
3373 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3374
3375 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3376 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3377
/* replicate the 16-bit mask value to all 8 lanes */
3378 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3379 mov c_64, #64
3380
3381 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
/* block_ptr_load is r0: last use of psx_gpu */
3382 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3383
/* prologue: fetch everything for the first block */
3384 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3385 [ draw_mask_bits_ptr, :16 ], c_64
3386 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3387
3388 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3389 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3390 vceq.u16 zero_mask, pixels, #0
3391 vtst.u16 draw_mask, draw_mask, test_mask
3392
3393 subs num_blocks, num_blocks, #1
3394 beq 1f
3395
3396 0:
/* block N becomes "current"; start fetching block N+1 */
3397 mov fb_ptr, fb_ptr_next
3398 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3399
3400 vorr.u16 pixels, pixels, msb_mask
3401
3402 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3403 vmov fb_pixels, fb_pixels_next
3404
3405 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3406 [ draw_mask_bits_ptr, :16 ], c_64
/* take the texel wherever the combined mask bit is clear */
3407 vbif.u16 fb_pixels, pixels, draw_mask_combined
3408
75e28f62 3409 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
8438c3c7 3410 pld [ fb_ptr_next, #64 ]
3411
/* (next - cur + 14) <= 28 unsigned  <=>  -14 <= next - cur <= 14 */
75e28f62 3412 add fb_ptr_cmp, fb_ptr_cmp, #14
8438c3c7 3413 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3414
75e28f62
E
3415 cmp fb_ptr_cmp, #28
/* destinations overlap: store before loading (see "4:") */
3416 bls 4f
3417
/* no overlap: load N+1's destination first, then store N */
3418 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3419 vceq.u16 zero_mask, pixels, #0
3420
3421 vst1.u16 { fb_pixels }, [ fb_ptr ]
3422 vtst.u16 draw_mask, draw_mask, test_mask
3423
3424 3:
3425 subs num_blocks, num_blocks, #1
3426 bne 0b
3427
3428 1:
/* epilogue: merge and store the final block (msb_mask is not OR-ed in on this path) */
3429 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3430 vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3431
3432 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3433
3434 ldmia sp!, { r4, pc }
3435
3436 4:
/* overlapping destinations: block N must be in memory before N+1's pixels are read */
3437 vst1.u16 { fb_pixels }, [ fb_ptr ]
3438 vceq.u16 zero_mask, pixels, #0
3439
3440 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3441 vtst.u16 draw_mask, draw_mask, test_mask
3442
3443 bal 3b
3444
3445
/* Stub: returns to the caller immediately without touching any state. */
3446function(shade_blocks_unshaded_untextured_indirect)
3447 bx lr
3448
3449.align 3
3450
/*
 * shade_blocks_unshaded_untextured_direct(psx_gpu)
 *
 * Fills every queued block with one flat color, writing directly to the
 * framebuffer. The 8-lane color vector is read once from offset 16 of the
 * first block and ORed with msb_mask. Each block (64 bytes apart) then
 * supplies its own 128-bit draw mask (block offset 0) and framebuffer
 * pointer (block offset 44). vbif keeps the framebuffer value in lanes
 * whose draw-mask bits are set.
 *
 * The loop is software pipelined: the framebuffer pixels of block n+1 are
 * fetched while block n is merged and written back.
 *
 * NOTE(review): num_blocks == 0 is not handled (the counter would wrap);
 * presumably callers always queue at least one block - confirm.
 */
3451function(shade_blocks_unshaded_untextured_direct)
3452 stmdb sp!, { r4, r14 }
3453 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3454
3455 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3456 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3457
3458 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3459 add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3460
3461 add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3462 vld1.u16 { pixels }, [ color_ptr, :128 ]
3463
3464 mov c_64, #64
3465 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3466
/* The color is loop invariant, so the mask MSB is merged in only once. */
3467 vorr.u16 pixels, pixels, msb_mask
/* Flags from this subs are consumed by the beq below; ldr/vld1 keep them. */
3468 subs num_blocks, num_blocks, #1
3469
3470 ldr fb_ptr_next, [ block_ptr_load ], #64
3471
3472 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3473 beq 1f
3474
3475 0:
3476 vmov fb_pixels, fb_pixels_next
3477 mov fb_ptr, fb_ptr_next
3478 ldr fb_ptr_next, [ block_ptr_load ], #64
3479
/* Merge with the current block's mask before the next mask replaces it. */
3480 vbif.u16 fb_pixels, pixels, draw_mask
3481 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3482
/*
 * (next - current + 14) <= 28 unsigned means the two 16-byte framebuffer
 * spans lie within 14 bytes of each other and overlap; take the ordered
 * store-then-load path at label 4 in that case.
 */
3483 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3484 add fb_ptr_cmp, fb_ptr_cmp, #14
3485 cmp fb_ptr_cmp, #28
3486 bls 4f
3487
3488 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3489 vst1.u16 { fb_pixels }, [ fb_ptr ]
3490
3491 3:
3492 subs num_blocks, num_blocks, #1
3493 bne 0b
3494
/* Drain the pipeline: merge and store the final block. */
3495 1:
3496 vbif.u16 fb_pixels_next, pixels, draw_mask
3497 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3498
3499 ldmia sp!, { r4, pc }
3500
/* Overlapping spans: the store must land before the next load. */
3501 4:
3502 vst1.u16 { fb_pixels }, [ fb_ptr ]
3503 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3504 bal 3b
3505
3506
/*
 * Register aliases for the blend_blocks_* routines that follow.
 * Names inherited from earlier routines are dropped first, then rebound.
 */
3507#undef draw_mask_ptr
3508#undef c_64
3509#undef fb_ptr
3510#undef fb_ptr_next
3511#undef fb_ptr_cmp
3512
/*
 * Core registers. draw_mask_ptr shares r0 with psx_gpu: the blend routines
 * below set draw_mask_ptr with their last instruction that reads psx_gpu.
 * c_64 shares r2 with msb_mask_ptr. r4 and r14 are saved and restored by
 * each routine's stmdb/ldmia pair.
 */
3513#define psx_gpu r0
3514#define num_blocks r1
3515#define msb_mask_ptr r2
3516#define pixel_ptr r3
3517#define draw_mask_ptr r0
3518#define c_64 r2
3519#define fb_ptr r12
3520#define fb_ptr_next r14
3521#define fb_ptr_cmp r4
3522
3523#undef msb_mask
3524#undef draw_mask
3525#undef pixels
3526#undef fb_pixels
3527#undef d128_0x8000
3528#undef msb_mask_low
3529#undef msb_mask_high
3530#undef draw_mask_next
3531#undef pixels_g
3532#undef blend_pixels
3533#undef fb_pixels_next
3534
/* NEON names used by the average blend routines. */
3535#define msb_mask q0
3536#define draw_mask q1
3537#define pixels q2
3538#define fb_pixels q3
3539#define blend_pixels q4
3540#define pixels_no_msb q5
3541#define blend_mask q6
3542#define fb_pixels_no_msb q7
3543#define d128_0x8000 q8
3544#define d128_0x0421 q9
3545#define fb_pixels_next q10
3546#define blend_pixels_next q11
3547#define pixels_next q12
3548#define draw_mask_next q13
3549#define write_mask q14
3550
/*
 * Names used by the add / subtract / add-fourth routines. These deliberately
 * reuse q5, q7 and q8-q13 from the list above, so a routine must never keep
 * two aliases of the same register live at the same time.
 */
3551#define pixels_rb q5
3552#define pixels_mg q7
3553#define pixels_g q7
3554#define d128_0x7C1F q8
3555#define d128_0x03E0 q9
3556#define fb_pixels_rb q10
3557#define fb_pixels_g q11
3558#define fb_pixels_masked q11
3559#define d128_0x83E0 q15
3560#define pixels_fourth q7
3561#define d128_0x1C07 q12
3562#define d128_0x00E0 q13
3563#define d128_0x80E0 q13
3564
/* d0/d1 are the two halves of q0 (msb_mask), filled by a broadcast load. */
3565#define msb_mask_low d0
3566#define msb_mask_high d1
3567
/*
 * Variant hooks pasted into blend_blocks_average_builder via ##.
 * The "textured" hooks honour the per-texel blend flag (bit 15); the
 * "untextured" hooks expand to nothing. The mask "on" hooks add the
 * framebuffer MSB to the draw mask; the "off" hooks only move the mask.
 */
/* blend_mask = all-ones in lanes whose source pixel has bit 15 set. */
3568#define blend_blocks_average_set_blend_mask_textured(source) \
3569 vclt.s16 blend_mask, source, #0 \
3570
/* Force bit 15 on the blended result. */
3571#define blend_blocks_average_set_stp_bit_textured() \
3572 vorr.u16 blend_pixels, #0x8000 \
3573
/* Lanes with a clear blend_mask take the unblended source pixel instead. */
3574#define blend_blocks_average_combine_textured(source) \
3575 vbif.u16 blend_pixels, source, blend_mask \
3576
3577#define blend_blocks_average_set_blend_mask_untextured(source) \
3578
3579#define blend_blocks_average_set_stp_bit_untextured() \
3580
3581#define blend_blocks_average_combine_untextured(source) \
3582
/* write_mask = all-ones in lanes whose framebuffer pixel has bit 15 set. */
3583#define blend_blocks_average_mask_set_on() \
3584 vclt.s16 write_mask, fb_pixels_next, #0 \
3585
/* Loop form: current mask = prefetched mask OR framebuffer-MSB mask. */
3586#define blend_blocks_average_mask_copy_on() \
3587 vorr.u16 draw_mask, draw_mask_next, write_mask \
3588
/* Epilogue form: fold write_mask into draw_mask_next in place. */
3589#define blend_blocks_average_mask_copy_b_on() \
3590 vorr.u16 draw_mask_next, draw_mask_next, write_mask \
3591
3592#define blend_blocks_average_mask_set_off() \
3593
3594#define blend_blocks_average_mask_copy_off() \
3595 vmov draw_mask, draw_mask_next \
3596
3597#define blend_blocks_average_mask_copy_b_off() \
3598
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>(psx_gpu), which
 * blends every queued block 50/50 with the framebuffer.
 *   texturing     : textured | untextured (selects the *_textured /
 *                   *_untextured hook macros)
 *   mask_evaluate : on | off (selects the *_on / *_off mask hooks)
 *
 * Blocks are 64 bytes apart. Per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit pixels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28).
 *
 * Per-channel average without cross-channel carries: with
 * d = (src ^ fb) & 0x0421 (the lowest bit of each 5-bit field),
 * vhadd(fb & 0x7FFF, (src & 0x7FFF) - d) gives (src + fb) / 2 per field.
 *
 * The loop is software pipelined (block n+1 is prepared while block n is
 * stored). When two consecutive framebuffer spans overlap, path 2 stores
 * the current block before loading the next one.
 *
 * Fix relative to the previous revision: path 2 now re-evaluates
 * write_mask from the freshly loaded framebuffer pixels, exactly like the
 * non-overlapping path; before, the "on" variants reused the mask derived
 * from the previous block's framebuffer pixels.
 *
 * NOTE(review): num_blocks == 0 is not handled - confirm callers never
 * pass an empty block list.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr aliases psx_gpu: this is the last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x8000, #0x8000; \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vmov.u16 d128_0x0421, #0x0400; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16 d128_0x0421, #0x0021; \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  /* Prime the pipeline with the averaging operands of block 0. */ \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vmov pixels, pixels_next; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  /* Lanes with draw_mask set keep the framebuffer value. */ \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
 \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Drain the pipeline: finish and store the final block. */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store block n before reading block n+1. */ \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
 \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  /* Refresh write_mask from the pixels just loaded (was missing). */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  bal 3b \

blend_blocks_average_builder(textured, off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured, on)
blend_blocks_average_builder(untextured, on)
3705
3706
/*
 * Mask hooks shared by the add and add-fourth builders, selected with
 * ##mask_evaluate. "on": lanes whose framebuffer pixel has bit 15 set are
 * added to draw_mask. "off": both hooks expand to nothing.
 */
3707#define blend_blocks_add_mask_set_on() \
3708 vclt.s16 write_mask, fb_pixels, #0 \
3709
3710#define blend_blocks_add_mask_copy_on() \
3711 vorr.u16 draw_mask, draw_mask, write_mask \
3712
3713#define blend_blocks_add_mask_set_off() \
3714
3715#define blend_blocks_add_mask_copy_off() \
3716
3717
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>(psx_gpu): additive
 * blending (framebuffer + texel, saturated per channel) for textured
 * blocks. mask_evaluate is "on" or "off" and selects the
 * blend_blocks_add_mask_* hooks.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit texels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28).
 *
 * Only texels with bit 15 set are blended: the framebuffer operand is
 * ANDed with blend_mask, so other lanes add zero and pass through as-is.
 * Red/blue (0x7C1F) and green (0x03E0) are summed separately and clamped
 * with vmin; bit 15 of the texel (after ORing msb_mask) travels with the
 * green part via the 0x83E0 mask. Lanes with draw_mask set get the
 * framebuffer value back through vbit.
 *
 * The loop is software pipelined; path 2 handles consecutive framebuffer
 * spans that overlap by storing before loading.
 *
 * Change relative to the previous revision: fb_ptr_cmp was computed twice
 * in the loop with identical operands; the redundant sub is removed.
 *
 * NOTE(review): num_blocks == 0 is not handled - confirm callers never
 * pass an empty block list.
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr aliases psx_gpu: this is the last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  /* Build the 16-bit lane constants 0x7C1F, 0x03E0 and 0x83E0. */ \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x83E0, #0x8000; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  /* Prime the pipeline with block 0. */ \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16 pixels, pixels, msb_mask; \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
 \
  vorr.u16 pixels, pixels, msb_mask; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
 \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld [ fb_ptr_next, #64 ]; \
 \
  /* Lanes with draw_mask set keep the framebuffer value. */ \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap. */ \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Drain the pipeline: combine and store the final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store block n before reading block n+1. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  bal 3b \

3820
3821
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>(psx_gpu): additive
 * blending (framebuffer + pixel, saturated per channel) for untextured
 * blocks; every lane is blended. mask_evaluate is "on" or "off" and
 * selects the blend_blocks_add_mask_* hooks.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit pixels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28). Red/blue (0x7C1F) and green (0x03E0) are summed
 * separately and clamped with vmin, msb_mask is ORed into the result, and
 * lanes with draw_mask set get the framebuffer value back through vbit.
 *
 * The loop is software pipelined; path 2 handles consecutive framebuffer
 * spans that overlap by storing before loading.
 * NOTE(review): num_blocks == 0 is not handled - confirm with callers.
 */
3822#define blend_blocks_add_untextured_builder(mask_evaluate) \
3823.align 3; \
3824 \
3825function(blend_blocks_untextured_add_##mask_evaluate) \
3826 stmdb sp!, { r4, r14 }; \
3827 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3828 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3829 \
3830 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3831 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3832 /* draw_mask_ptr aliases psx_gpu: last read of psx_gpu follows */ \
3833 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3834 mov c_64, #64; \
3835 /* build the lane constants 0x7C1F and 0x03E0 */ \
3836 vmov.u16 d128_0x7C1F, #0x7C00; \
3837 vmov.u16 d128_0x03E0, #0x0300; \
3838 vorr.u16 d128_0x7C1F, #0x001F; \
3839 vorr.u16 d128_0x03E0, #0x00E0; \
3840 /* prime the pipeline with block 0 */ \
3841 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3842 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3843 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3844 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3845 blend_blocks_add_mask_set_##mask_evaluate(); \
3846 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3847 \
3848 blend_blocks_add_mask_copy_##mask_evaluate(); \
3849 vand.u16 pixels_g, pixels, d128_0x03E0; \
3850 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3851 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3852 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3853 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3854 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3855 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3856 \
3857 subs num_blocks, num_blocks, #1; \
3858 beq 1f; \
3859 \
3860 0: \
3861 mov fb_ptr, fb_ptr_next; \
3862 \
3863 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3864 \
3865 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3866 \
3867 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3868 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3869 vand.u16 pixels_g, pixels, d128_0x03E0; \
3870 /* lanes with draw_mask set keep the framebuffer value */ \
3871 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3872 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3873 /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap */ \
3874 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3875 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3876 cmp fb_ptr_cmp, #28; \
3877 bls 2f; \
3878 \
3879 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3880 blend_blocks_add_mask_set_##mask_evaluate(); \
3881 blend_blocks_add_mask_copy_##mask_evaluate(); \
3882 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3883 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3884 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3885 \
3886 3: \
3887 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3888 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3889 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3890 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3891 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3892 \
3893 subs num_blocks, num_blocks, #1; \
3894 bne 0b; \
3895 /* drain the pipeline: combine and store the final block */ \
3896 1: \
3897 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3898 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3899 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3900 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3901 \
3902 ldmia sp!, { r4, pc }; \
3903 /* overlapping spans: store block n before reading block n+1 */ \
3904 2: \
3905 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3906 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3907 \
3908 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3909 blend_blocks_add_mask_set_##mask_evaluate(); \
3910 blend_blocks_add_mask_copy_##mask_evaluate(); \
3911 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3912 bal 3b \
3913
3914
/* Instantiate the additive blenders with mask evaluation off and on. */
3915blend_blocks_add_textured_builder(off)
3916blend_blocks_add_textured_builder(on)
3917blend_blocks_add_untextured_builder(off)
3918blend_blocks_add_untextured_builder(on)
3919
/*
 * Variant hooks pasted into blend_blocks_subtract_builder via ##.
 * Textured hooks honour the per-texel blend flag (bit 15); untextured
 * hooks are empty except set_stb, which merges msb_mask into the result.
 */
/* blend_mask = all-ones in lanes whose prefetched texel has bit 15 set. */
3920#define blend_blocks_subtract_set_blend_mask_textured() \
3921 vclt.s16 blend_mask, pixels_next, #0 \
3922
/* Lanes with a clear blend_mask take the unblended texel instead. */
3923#define blend_blocks_subtract_combine_textured() \
3924 vbif.u16 blend_pixels, pixels, blend_mask \
3925
/* Force bit 15 on the blended result. */
3926#define blend_blocks_subtract_set_stb_textured() \
3927 vorr.u16 blend_pixels, #0x8000 \
3928
/* pixels = current texels with msb_mask applied (used by combine). */
3929#define blend_blocks_subtract_msb_mask_textured() \
3930 vorr.u16 pixels, pixels_next, msb_mask \
3931
3932#define blend_blocks_subtract_set_blend_mask_untextured() \
3933
3934#define blend_blocks_subtract_combine_untextured() \
3935
3936#define blend_blocks_subtract_set_stb_untextured() \
3937 vorr.u16 blend_pixels, blend_pixels, msb_mask \
3938
3939#define blend_blocks_subtract_msb_mask_untextured() \
3940
3941
/* write_mask = all-ones in lanes whose framebuffer pixel has bit 15 set. */
3942#define blend_blocks_subtract_mask_set_on() \
3943 vclt.s16 write_mask, fb_pixels, #0 \
3944
3945#define blend_blocks_subtract_mask_copy_on() \
3946 vorr.u16 draw_mask, draw_mask_next, write_mask \
3947
3948#define blend_blocks_subtract_mask_set_off() \
3949
3950#define blend_blocks_subtract_mask_copy_off() \
3951 vmov draw_mask, draw_mask_next \
3952
3953
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>(psx_gpu):
 * subtractive blending (framebuffer - pixel, saturating at zero per
 * channel). texturing is textured | untextured, mask_evaluate is on | off;
 * both select the blend_blocks_subtract_* hook macros.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit pixels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28). Red/blue (0x7C1F) use a byte-wise saturating
 * subtract, green (0x03E0) a halfword one. Lanes with draw_mask set get
 * the framebuffer value back through vbit.
 *
 * The loop is software pipelined; path 2 handles consecutive framebuffer
 * spans that overlap by storing before loading.
 * NOTE(review): num_blocks == 0 is not handled - confirm with callers.
 */
3954#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
3955.align 3; \
3956 \
3957function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
3958 stmdb sp!, { r4, r14 }; \
3959 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3960 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3961 \
3962 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3963 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3964 /* draw_mask_ptr aliases psx_gpu: last read of psx_gpu follows */ \
3965 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3966 mov c_64, #64; \
3967 /* build the lane constants 0x7C1F and 0x03E0 */ \
3968 vmov.u16 d128_0x7C1F, #0x7C00; \
3969 vmov.u16 d128_0x03E0, #0x0300; \
3970 vorr.u16 d128_0x7C1F, #0x001F; \
3971 vorr.u16 d128_0x03E0, #0x00E0; \
3972 /* prime the pipeline with block 0 */ \
3973 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3974 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3975 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3976 blend_blocks_subtract_set_blend_mask_##texturing(); \
3977 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3978 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3979 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3980 \
3981 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3982 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3983 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3984 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3985 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3986 \
3987 subs num_blocks, num_blocks, #1; \
3988 beq 1f; \
3989 \
3990 0: \
3991 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3992 mov fb_ptr, fb_ptr_next; \
3993 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3994 \
3995 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3996 blend_blocks_subtract_msb_mask_##texturing(); \
3997 /* pixels_next is replaced only after the msb_mask hook consumed it */ \
3998 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3999 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4000 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
4001 blend_blocks_subtract_set_stb_##texturing(); \
4002 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
4003 blend_blocks_subtract_combine_##texturing(); \
4004 blend_blocks_subtract_set_blend_mask_##texturing(); \
4005 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4006 /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap */ \
4007 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4008 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4009 cmp fb_ptr_cmp, #28; \
4010 bls 2f; \
4011 \
4012 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4013 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4014 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4015 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4016 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4017 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4018 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4019 \
4020 3: \
4021 subs num_blocks, num_blocks, #1; \
4022 bne 0b; \
4023 /* drain the pipeline: combine and store the final block */ \
4024 1: \
4025 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
4026 \
4027 blend_blocks_subtract_msb_mask_##texturing(); \
4028 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4029 blend_blocks_subtract_set_stb_##texturing(); \
4030 blend_blocks_subtract_combine_##texturing(); \
4031 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4032 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4033 \
4034 ldmia sp!, { r4, pc }; \
4035 /* overlapping spans: store block n before reading block n+1 */ \
4036 2: \
4037 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4038 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4039 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4040 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4041 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4042 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4043 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4044 bal 3b \
4045
4046
/* Instantiate all four texturing / mask-evaluation combinations. */
4047blend_blocks_subtract_builder(textured, off)
4048blend_blocks_subtract_builder(textured, on)
4049blend_blocks_subtract_builder(untextured, off)
4050blend_blocks_subtract_builder(untextured, on)
4051
4052
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>(psx_gpu):
 * framebuffer + texel / 4, saturated per channel, for textured blocks.
 * mask_evaluate is "on" or "off" and selects the blend_blocks_add_mask_*
 * hooks.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit texels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28).
 *
 * The texel is shifted right by two and masked with 0x1C07 (red/blue) and
 * 0x00E0 (green) so each 5-bit field holds its value divided by four. The
 * sums are clamped to 0x7C1F / 0x03E0 with vmin. Texels without bit 15 are
 * not blended: vbif puts the raw texel back in those lanes. msb_mask is
 * then ORed into every lane, and lanes with draw_mask set get the
 * framebuffer value back through vbit.
 *
 * The loop is software pipelined; path 2 handles consecutive framebuffer
 * spans that overlap by storing before loading.
 *
 * Fix relative to the previous revision: the epilogue (label 1) applied
 * msb_mask before re-inserting the unblended texels, so those lanes of
 * the final block missed the mask bit. It now uses the same order as the
 * loop body (vbif first, then the msb_mask OR).
 *
 * NOTE(review): num_blocks == 0 is not handled - confirm callers never
 * pass an empty block list.
 */
#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr aliases psx_gpu: this is the last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  /* Build the lane constants 0x7C1F, 0x03E0, 0x1C07 and 0x00E0. */ \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x1C07, #0x1C00; \
  vmov.u16 d128_0x00E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x1C07, #0x0007; \
 \
  /* Prime the pipeline with block 0. */ \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16 pixels_fourth, pixels, #2; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  /* pixels_g shares its register with pixels_fourth (consumed here). */ \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  /* Unblended lanes take the raw texel (before pixels is reloaded). */ \
  vbif.u16 blend_pixels, pixels, blend_mask; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vshr.s16 pixels_fourth, pixels, #2; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Drain the pipeline: same combine order as the loop body. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbif.u16 blend_pixels, pixels, blend_mask; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store block n before reading block n+1. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal 3b \

4152
4153
d1c75d1e 4154
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>(psx_gpu):
 * framebuffer + pixel / 4, saturated per channel, for untextured blocks;
 * every lane is blended. mask_evaluate is "on" or "off" and selects the
 * blend_blocks_add_mask_* hooks.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit pixels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28). The pixel is shifted right by two and masked with
 * 0x1C07 / 0x00E0 so each 5-bit field holds a quarter of its value; sums
 * are clamped to 0x7C1F / 0x03E0, msb_mask is ORed in, and lanes with
 * draw_mask set get the framebuffer value back through vbit.
 *
 * The loop is software pipelined; path 2 handles consecutive framebuffer
 * spans that overlap by storing before loading.
 * NOTE(review): num_blocks == 0 is not handled - confirm with callers.
 */
4155#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4156.align 3; \
4157 \
4158function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4159 stmdb sp!, { r4, r14 }; \
4160 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4161 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4162 \
4163 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4164 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
4165 /* draw_mask_ptr aliases psx_gpu: last read of psx_gpu follows */ \
4166 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4167 mov c_64, #64; \
4168 /* build the lane constants 0x7C1F, 0x03E0, 0x1C07 and 0x00E0 */ \
4169 vmov.u16 d128_0x7C1F, #0x7C00; \
4170 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62
E
4171 vmov.u16 d128_0x1C07, #0x1C00; \
4172 vmov.u16 d128_0x00E0, #0x00E0; \
4173 vorr.u16 d128_0x7C1F, #0x001F; \
4174 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62
E
4175 vorr.u16 d128_0x1C07, #0x0007; \
4176 /* prime the pipeline with block 0 */ \
4177 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4178 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4179 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4180 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4181 blend_blocks_add_mask_set_##mask_evaluate(); \
4182 vshr.s16 pixels_fourth, pixels, #2; \
4183 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4184 \
4185 blend_blocks_add_mask_copy_##mask_evaluate(); \
4186 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4187 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4188 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4189 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4190 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4191 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4192 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4193 \
4194 subs num_blocks, num_blocks, #1; \
4195 beq 1f; \
4196 \
4197 0: \
4198 mov fb_ptr, fb_ptr_next; \
75e28f62
E
4199 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4200 \
4201 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4202 \
4203 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4204 vshr.s16 pixels_fourth, pixels, #2; \
4205 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4206 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4207 /* lanes with draw_mask set keep the framebuffer value */ \
4208 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4209 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4210 /* (next - current + 14) <= 28 unsigned: the 16-byte spans overlap */ \
4211 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4212 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4213 cmp fb_ptr_cmp, #28; \
4214 bls 2f; \
4215 \
4216 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4217 blend_blocks_add_mask_set_##mask_evaluate(); \
4218 blend_blocks_add_mask_copy_##mask_evaluate(); \
4219 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4220 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4221 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4222 \
4223 3: \
4224 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4225 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4226 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4227 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4228 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4229 \
4230 subs num_blocks, num_blocks, #1; \
4231 bne 0b; \
4232 /* drain the pipeline: combine and store the final block */ \
4233 1: \
4234 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4235 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4236 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4237 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4238 \
4239 ldmia sp!, { r4, pc }; \
4240 /* overlapping spans: store block n before reading block n+1 */ \
4241 2: \
4242 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4243 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4244 \
4245 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4246 blend_blocks_add_mask_set_##mask_evaluate(); \
4247 blend_blocks_add_mask_copy_##mask_evaluate(); \
4248 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4249 bal 3b \
4250
4251
/* Instantiate the add-fourth blenders with mask evaluation off and on. */
4252blend_blocks_add_fourth_textured_builder(off)
4253blend_blocks_add_fourth_textured_builder(on)
4254blend_blocks_add_fourth_untextured_builder(off)
4255blend_blocks_add_fourth_untextured_builder(on)
4256
4257// TODO: Optimize this more. Need a scene that actually uses it for
4258// confirmation..
4259
4260.align 3
4261
/*
 * blend_blocks_textured_unblended_on(psx_gpu)
 *
 * Copies the pixels of every queued block to the framebuffer without
 * blending, with mask evaluation enabled: a lane keeps its framebuffer
 * value when its draw-mask bits are set or when the framebuffer pixel
 * already has bit 15 set.
 *
 * Blocks are 64 bytes apart; per block the routine reads the 128-bit draw
 * mask at +0, eight 16-bit pixels at +16 and the framebuffer pointer at
 * +44 (pixel_ptr + 28). Each block is loaded, merged and stored before
 * the next one is touched, so no overlap handling is needed.
 *
 * msb_mask is loaded but not applied to the pixels here.
 * NOTE(review): num_blocks == 0 is not handled (the counter would wrap);
 * presumably callers always queue at least one block - confirm.
 */
function(blend_blocks_textured_unblended_on)
  stmdb sp!, { r4, r14 }
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

  /* draw_mask_ptr aliases psx_gpu: this is the last read of psx_gpu. */
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov c_64, #64

 0:
  /* Fetch one block: framebuffer pixels, draw mask and source pixels. */
  ldr fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16 write_mask, fb_pixels, #0
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64

  /* Merge: lanes with a set mask bit keep the framebuffer value. */
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [ fb_ptr ]

  subs num_blocks, num_blocks, #1
  bne 0b

  ldmia sp!, { r4, pc }
4302
4303
/* Stub: returns to the caller immediately without touching any state. */
4304function(blend_blocks_textured_unblended_off)
4305 bx lr
4306
4307
/*
 * warmup(r0 = count, r1 = address)
 *
 * Performs count NEON loads of { u_whole_8, v_whole_8 } from r1, stepping
 * r1 forward by 64 bytes after each load. The loaded values are simply
 * overwritten by the next iteration. Returns at once when count is zero.
 * r1 must satisfy the :128 alignment hint. Clobbers r0, r1, r3 and the two
 * NEON registers.
 * NOTE(review): presumably used to pull data into the cache - confirm.
 */
function(warmup)
  cmp r0, #0
  mov r3, #64                     /* stride; mov leaves the flags alone */
  bxeq lr

 0:
  vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
  subs r0, r0, #1
  bne 0b

  bx lr
4320
/* Register aliases for render_block_fill_body. */
#undef vram_ptr
#undef color
#undef width
#undef height
#undef pitch

#define vram_ptr r0
#define color r1
#define width r2
#define height r3

/* pitch reuses r1: color is dead once it has been broadcast to colors_a. */
#define pitch r1

#define num_width r12

#undef colors_a
#undef colors_b

#define colors_a q0
#define colors_b q1

.align 3

/*
 * render_block_fill_body(vram_ptr, color, width, height)
 *
 * Fills a width x height rectangle of 16-bit pixels with color. Each store
 * writes 32 bytes (16 pixels), so width must be a non-zero multiple of 16
 * and vram_ptr must satisfy the :256 alignment hint. After each row the
 * pointer advances by 2048 - 2 * width bytes, i.e. rows are 2048 bytes
 * apart. height must be at least 1.
 * Clobbers r0-r3, r12, q0 and q1.
 */
function(render_block_fill_body)
  vdup.u16 colors_a, color        /* must precede the write to pitch (r1) */
  vmov colors_b, colors_a
  mov num_width, width

  mov pitch, #2048
  sub pitch, pitch, width, lsl #1

 0:                               /* start of a row */
 1:                               /* next 16-pixel chunk */
  vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]!
  subs num_width, num_width, #16
  bne 1b

  /* add/mov leave the flags from subs untouched for the bne below. */
  subs height, height, #1
  add vram_ptr, vram_ptr, pitch
  mov num_width, width
  bne 0b

  bx lr
4366
75e28f62
E
4367
/*
 * Core-register aliases for the setup_sprite_* code below. Several names
 * map to the same register; the code relies on each value being dead
 * before the register is reused under its next name.
 */
4368#undef x
4369#undef y
4370#undef width
4371#undef height
4372#undef fb_ptr
4373#undef texture_mask
4374#undef num_blocks
4375#undef temp
4376#undef dirty_textures_mask
4377#undef clut_ptr
4378#undef current_texture_mask
4379
/* arguments and values derived from them in the function prologue */
4380#define psx_gpu r0
4381#define x r1
4382#define y r2
4383#define u r3
4384#define v r4
4385#define width r5
4386#define height r6
4387#define offset_u r8
4388#define offset_v r9
4389#define offset_u_right r10
4390#define width_rounded r11
4391#define height_rounded r12
4392
/* state used by the tile/column emit loops; texture_block_ptr is r14 (lr),
 * so lr cannot hold a return address while those loops run */
4393#define texture_offset_base r1
4394#define tile_width r2
4395#define tile_height r3
4396#define num_blocks r4
4397#define block r5
4398#define sub_tile_height r6
4399#define fb_ptr r7
4400#define texture_mask r8
4401#define column_data r9
4402#define texture_offset r10
4403#define tiles_remaining r11
4404#define fb_ptr_advance_column r12
4405#define texture_block_ptr r14
4406
/* temporaries used while building the masks and the dispatch index */
4407#define texture_page_ptr r3
4408#define left_block_mask r4
4409#define right_block_mask r5
4410#define texture_mask_rev r10
4411#define control_mask r11
4412
/* temporaries used only by the setup_sprite_tiled_initialize_* macros */
4413#define dirty_textures_mask r4
4414#define clut_ptr r5
4415#define current_texture_mask r6
4416
4417
/*
 * NEON register aliases for the setup_sprite_* code below.
 * NOTE(review): d8-d15 are callee-saved under the AAPCS; nothing visible in
 * the setup_sprite_* functions saves them - confirm whether callers rely on
 * that.
 */
4418#undef texels
4419#undef clut_low_a
4420#undef clut_low_b
4421#undef clut_high_a
4422#undef clut_high_b
4423#undef clut_a
4424#undef clut_b
4425#undef texels_low
4426#undef texels_high
4427
/* eight texel bytes as loaded from the texture page */
4428#define texels d0
/* q1 = d2:d3, the left/right mask + framebuffer-pointer pairs */
4429#define draw_masks_fb_ptrs q1
4430
4431#define draw_mask_fb_ptr_left d2
4432#define draw_mask_fb_ptr_right d3
4433
/* 4x variants use four pairs: q1 (d2:d3) plus q5 (d10:d11) */
59d15d23 4434#define draw_mask_fb_ptr_left_a d2
4435#define draw_mask_fb_ptr_left_b d3
4436#define draw_mask_fb_ptr_right_a d10
4437#define draw_mask_fb_ptr_right_b d11
4438#define draw_masks_fb_ptrs2 q5
4439
75e28f62
E
4440#define clut_low_a d4
4441#define clut_low_b d5
4442#define clut_high_a d6
4443#define clut_high_b d7
4444
4445#define block_masks d8
4446#define block_masks_shifted d9
4447
/* clut_a = d4:d5 (clut_low_*), clut_b = d6:d7 (clut_high_*) */
4448#define clut_a q2
4449#define clut_b q3
4450
59d15d23 4451#define texels_low d12
4452#define texels_high d13
75e28f62 4453
/* texels_wide = d14:d15, used by the widen_texels_* macros */
59d15d23 4454#define texels_wide_low d14
4455#define texels_wide_high d15
4456#define texels_wide q7
75e28f62
E
4457
4458
/*
 * setup_sprite_flush_blocks: reached with blgt from
 * setup_sprite_tile_add_blocks when the pending block count would exceed
 * MAX_BLOCKS. Calls flush_render_block_buffer with r0 = psx_gpu, then
 * rewinds `block` to the first slot of the block buffer.
 *
 * Preserved across the call: r0-r3, r12, lr and q1-q5 (the mask/fb-pointer
 * pairs, both CLUT halves and the block masks).
 *
 * r4 is saved as well, purely as padding. The setup_sprite_* entry points
 * push nine words, so (assuming sp was 8-byte aligned on entry, as the
 * AAPCS requires) sp is 4 mod 8 when this routine is reached; 80 bytes of
 * NEON state plus seven words puts it back on an 8-byte boundary for the
 * external call. Restoring r4 is harmless: it is callee-saved by the
 * function being called and therefore comes back unchanged.
 */
setup_sprite_flush_blocks:
  vpush { q1 - q5 }

  stmdb sp!, { r0 - r4, r12, r14 }
  bl flush_render_block_buffer
  ldmia sp!, { r0 - r4, r12, r14 }

  vpop { q1 - q5 }

  /* the buffer is empty again: new blocks start at its beginning */
  add block, psx_gpu, #psx_gpu_blocks_offset
  bx lr
4470
4471
/*
 * Trampoline for setup_sprite_tiled_initialize_4bpp: runs
 * update_texture_4bpp_cache (r0 still holds psx_gpu) and hands r0-r3 back
 * unchanged. The saved return address is popped straight into pc.
 */
setup_sprite_update_texture_4bpp_cache:
  push { r0 - r3, lr }
  bl update_texture_4bpp_cache
  pop { r0 - r3, pc }
4476
4477
/*
 * Trampoline for setup_sprite_tiled_initialize_8bpp: runs
 * update_texture_8bpp_cache (r0 still holds psx_gpu) and hands r0-r3 back
 * unchanged. The saved return address is popped straight into pc.
 */
setup_sprite_update_texture_8bpp_cache:
  push { r0 - r3, lr }
  bl update_texture_8bpp_cache
  pop { r0 - r3, pc }
4482
4483
/*
 * setup_sprite_tiled_initialize_4bpp: load 32 bytes of CLUT into
 * clut_a/clut_b and de-interleave them bytewise (vuzp.u8), leaving the even
 * bytes in clut_a and the odd bytes in clut_b - i.e. the low and high byte
 * of each little-endian 16-bit entry - ready for the vtbl lookups in the
 * 4bpp tile macros. If the current texture mask overlaps the 4bpp dirty
 * mask, the 4bpp texture cache is refreshed first via the trampoline.
 * NOTE(review): clut_a/clut_b are live across that call and the trampoline
 * saves core registers only, so update_texture_4bpp_cache has to leave
 * q2/q3 intact - confirm.
 */
4484#define setup_sprite_tiled_initialize_4bpp() \
4485 ldr dirty_textures_mask, \
4486 [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
4487 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4488 \
4489 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4490 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4491 \
4492 tst current_texture_mask, dirty_textures_mask; \
4493 vuzp.u8 clut_a, clut_b; \
4494 \
4495 blne setup_sprite_update_texture_4bpp_cache \
4496
/*
 * setup_sprite_tiled_initialize_8bpp: refresh the 8bpp texture cache when
 * the current texture mask overlaps the 8bpp dirty mask. No CLUT is loaded
 * here; the 8bpp tile macros store raw texel bytes.
 */
4497#define setup_sprite_tiled_initialize_8bpp() \
4498 ldr dirty_textures_mask, \
4499 [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
4500 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4501 \
4502 tst current_texture_mask, dirty_textures_mask; \
4503 blne setup_sprite_update_texture_8bpp_cache \
4504
4505
75e28f62
E
/*
 * Operand fragments giving the number of blocks needed for sub_tile_height
 * rows: one block per row for a half tile (single), two per row for a full
 * tile (double).
 */
4506#define setup_sprite_block_count_single() \
4507 sub_tile_height \
4508
4509#define setup_sprite_block_count_double() \
4510 sub_tile_height, lsl #1 \
4511
/*
 * setup_sprite_tile_add_blocks(type): account for the blocks of the tile
 * about to be emitted. If the running total would exceed MAX_BLOCKS,
 * num_blocks restarts at this tile's count and setup_sprite_flush_blocks is
 * called, which flushes the buffer and rewinds `block`. The bl overwrites
 * lr (also aliased as texture_block_ptr).
 */
4512#define setup_sprite_tile_add_blocks(type) \
4513 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4514 cmp num_blocks, #MAX_BLOCKS; \
4515 \
59d15d23 4516 movgt num_blocks, setup_sprite_block_count_##type(); \
4517 blgt setup_sprite_flush_blocks \
75e28f62
E
4518
4519
/*
 * setup_sprite_tile_full_4bpp(edge): emit sub_tile_height rows of a full
 * 16-pixel-wide tile, two 64-byte blocks per row (left 8 pixels, then right
 * 8 pixels). `edge` is unused.
 *
 * Per 8-pixel half: eight texel bytes are fetched from texture_page_ptr +
 * (texture_offset & texture_mask), with +8 applied before masking for the
 * right half. Each byte indexes the low-byte and high-byte CLUT tables, and
 * vst2.u8 re-interleaves the results into eight 16-bit pixels at block + 0.
 * The mask/fb-pointer pair (fb_ptr in its upper word) goes to block + 40.
 *
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 bytes in
 * total. After the loop 0xF00 is added to texture_offset and num_blocks is
 * written back to psx_gpu.
 */
4520#define setup_sprite_tile_full_4bpp(edge) \
4521 setup_sprite_tile_add_blocks(double); \
4522 \
4523 4: \
4524 and texture_block_ptr, texture_offset, texture_mask; \
4525 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4526 \
4527 pld [ fb_ptr ]; \
4528 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4529 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4530 \
4531 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4532 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4533 \
4534 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4535 add texture_block_ptr, texture_offset, #8; \
4536 \
4537 and texture_block_ptr, texture_block_ptr, texture_mask; \
4538 add block, block, #40; \
4539 \
4540 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4541 add fb_ptr, fb_ptr, #16; \
4542 \
4543 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4544 add block, block, #24; \
4545 \
4546 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4547 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4548 \
4549 pld [ fb_ptr ]; \
4550 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4551 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4552 \
4553 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4554 add block, block, #40; \
4555 \
4556 add texture_offset, texture_offset, #0x10; \
4557 add fb_ptr, fb_ptr, #(2048 - 16); \
4558 \
4559 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4560 add block, block, #24; \
4561 \
4562 subs sub_tile_height, sub_tile_height, #1; \
4563 bne 4b; \
4564 \
4565 add texture_offset, texture_offset, #0xF00; \
4566 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4567
4568
/*
 * setup_sprite_tile_half_4bpp(edge): emit sub_tile_height rows of one
 * 8-pixel half of a tile, one 64-byte block per row. `edge` (left/right)
 * selects which draw_mask_fb_ptr_<edge> register carries the mask and
 * framebuffer pointer.
 *
 * Per row: eight texel bytes are fetched from texture_page_ptr +
 * (texture_offset & texture_mask). Each byte indexes the low-byte and
 * high-byte CLUT tables, and vst2.u8 re-interleaves the results into eight
 * 16-bit pixels at block + 0. The mask/fb-pointer pair (fb_ptr in its upper
 * word) goes to block + 40. texture_offset then advances by 0x10 and fb_ptr
 * by 2048 bytes.
 *
 * After the loop 0xF00 is added to texture_offset and num_blocks is written
 * back to psx_gpu.
 *
 * texture_block_ptr is computed once per row; an earlier second add of
 * texture_page_ptr after the pixel store produced a value nothing read and
 * has been dropped.
 */
#define setup_sprite_tile_half_4bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  \
  pld [ fb_ptr ]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
  add block, block, #40; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
  add block, block, #24; \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #2048; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4599
4600
/*
 * setup_sprite_tile_full_8bpp(edge): emit sub_tile_height rows of a full
 * 16-pixel-wide tile, two 64-byte blocks per row. `edge` is unused.
 *
 * `block` is biased by +16 for the duration of the loop. Each block gets
 * its eight raw texel bytes at offset 16 and the mask/fb-pointer pair
 * (fb_ptr in its upper word) at offset 40. The right half reads from
 * (texture_offset + 8) & texture_mask and targets fb_ptr + 16.
 *
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 bytes in
 * total. After the loop the +16 bias is removed, 0xF00 is added to
 * texture_offset and num_blocks is written back to psx_gpu.
 */
4601#define setup_sprite_tile_full_8bpp(edge) \
4602 setup_sprite_tile_add_blocks(double); \
4603 add block, block, #16; \
4604 \
4605 4: \
4606 and texture_block_ptr, texture_offset, texture_mask; \
4607 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4608 \
4609 pld [ fb_ptr ]; \
4610 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4611 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4612 \
4613 add texture_block_ptr, texture_offset, #8; \
4614 vst1.u32 { texels }, [ block, :64 ]; \
4615 \
4616 and texture_block_ptr, texture_block_ptr, texture_mask; \
4617 add block, block, #24; \
4618 \
4619 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4620 \
4621 add fb_ptr, fb_ptr, #16; \
4622 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4623 \
4624 add block, block, #40; \
4625 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4626 pld [ fb_ptr ]; \
4627 \
4628 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4629 vst1.u32 { texels }, [ block, :64 ]; \
4630 add block, block, #24; \
4631 \
4632 add texture_offset, texture_offset, #0x10; \
4633 add fb_ptr, fb_ptr, #(2048 - 16); \
4634 \
4635 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4636 add block, block, #40; \
4637 \
4638 subs sub_tile_height, sub_tile_height, #1; \
4639 bne 4b; \
4640 \
4641 sub block, block, #16; \
4642 add texture_offset, texture_offset, #0xF00; \
4643 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4644
4645
/*
 * setup_sprite_tile_half_8bpp(edge): emit sub_tile_height rows of one
 * 8-pixel half of a tile, one 64-byte block per row. `edge` (left/right)
 * selects the draw_mask_fb_ptr_<edge> register.
 *
 * `block` is biased by +16 for the duration of the loop. Each block gets
 * its eight raw texel bytes at offset 16 and the mask/fb-pointer pair
 * (fb_ptr in its upper word) at offset 40.
 *
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 bytes. After
 * the loop the +16 bias is removed, 0xF00 is added to texture_offset and
 * num_blocks is written back to psx_gpu.
 */
4646#define setup_sprite_tile_half_8bpp(edge) \
4647 setup_sprite_tile_add_blocks(single); \
4648 add block, block, #16; \
4649 \
4650 4: \
4651 and texture_block_ptr, texture_offset, texture_mask; \
4652 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4653 pld [ fb_ptr ]; \
4654 \
4655 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4656 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4657 \
4658 vst1.u32 { texels }, [ block, :64 ]; \
4659 add block, block, #24; \
4660 \
4661 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4662 add block, block, #40; \
4663 \
4664 add texture_offset, texture_offset, #0x10; \
4665 add fb_ptr, fb_ptr, #2048; \
4666 \
4667 subs sub_tile_height, sub_tile_height, #1; \
4668 bne 4b; \
4669 \
4670 sub block, block, #16; \
4671 add texture_offset, texture_offset, #0xF00; \
4672 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4673
4674
/*
 * Column pre/post adjustment, selected by edge_mode (half/full) and edge.
 * The pre step seeds texture_offset from texture_offset_base. For a
 * right-half column it also skips the left 8 texels (+8) and the left
 * 8 pixels (+16 bytes of fb_ptr); the matching post step takes those
 * 16 bytes off fb_ptr again. Left-half and full columns need no fb_ptr
 * adjustment, so their post steps are empty.
 */
4675#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4676 add texture_offset, texture_offset_base, #8; \
4677 add fb_ptr, fb_ptr, #16 \
4678
4679#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4680 mov texture_offset, texture_offset_base \
4681
4682#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4683 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4684
4685#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4686 mov texture_offset, texture_offset_base \
4687
4688#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4689 sub fb_ptr, fb_ptr, #16 \
4690
4691#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4692
4693#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4694 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4695
4696#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4697
4698
/*
 * setup_sprite_tile_column_height_single: the column fits in one tile
 * vertically; column_data holds the row count directly (see
 * setup_sprite_column_data_single).
 */
59d15d23 4699#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
4700 x4mode) \
75e28f62 4701 mov sub_tile_height, column_data; \
59d15d23 4702 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4703 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4704 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62 4705
/*
 * setup_sprite_tile_column_height_multi: the column spans two or more tiles
 * vertically. column_data packs (see setup_sprite_column_data_multi):
 *   bits  0-7   rows in the first tile
 *   bits  8-15  rows in the last tile
 *   bits 16+    number of tiles after the first
 * The first tile is emitted, then 16-row tiles while more than one tile
 * remains (label 3), then the last tile (label 2). The tile macros must
 * leave column_data intact; the 4x ones spill and reload it because they
 * borrow the register as fb_ptr2.
 */
59d15d23 4706#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
4707 x4mode) \
75e28f62
E
4708 and sub_tile_height, column_data, #0xFF; \
4709 mov tiles_remaining, column_data, lsr #16; \
59d15d23 4710 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4711 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4712 \
4713 subs tiles_remaining, tiles_remaining, #1; \
4714 beq 2f; \
4715 \
4716 3: \
4717 mov sub_tile_height, #16; \
59d15d23 4718 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4719 subs tiles_remaining, tiles_remaining, #1; \
4720 bne 3b; \
4721 \
4722 2: \
4723 uxtb sub_tile_height, column_data, ror #8; \
59d15d23 4724 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4725 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62
E
4726
4727
/*
 * setup_sprite_column_data_single: one tile vertically, so column_data is
 * just the sprite height. Also loads texture_page_ptr, which shares r3 with
 * tile_height (no longer needed at this point).
 */
4728#define setup_sprite_column_data_single() \
4729 mov column_data, height; \
4730 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \
4731
/*
 * setup_sprite_column_data_multi: pack the per-column row counts into
 * column_data for setup_sprite_tile_column_height_multi:
 *   bits  0-7   16 - offset_v                (rows in the first tile)
 *   bits  8-15  (height_rounded & 0xF) + 1   (rows in the last tile)
 *   bits 16+    tile_height - 1              (tiles after the first)
 * column_data shares r9 with offset_v, hence the in-place rsb.
 * texture_page_ptr is loaded only after tile_height (same register, r3) has
 * been folded in.
 */
4732#define setup_sprite_column_data_multi() \
4733 and height_rounded, height_rounded, #0xF; \
4734 rsb column_data, offset_v, #16; \
4735 \
4736 add height_rounded, height_rounded, #1; \
4737 sub tile_height, tile_height, #1; \
4738 \
4739 orr column_data, column_data, tile_height, lsl #16; \
4740 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
4741 \
4742 orr column_data, column_data, height_rounded, lsl #8 \
4743
/*
 * Draw-mask setup. block_masks holds the left-edge mask word in bytes 0-3
 * and the right-edge mask word in bytes 4-7 (see the vmov in
 * setup_sprite_tiled_builder). Each macro splats one mask byte across a
 * whole d register; the tile macros later overwrite the upper word with
 * fb_ptr.
 */
59d15d23 4744#define setup_sprite_setup_left_draw_mask_fb_ptr() \
4745 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4746 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4747
/*
 * As above, and also computes fb_ptr_advance_column = 32 - (height << 11):
 * added to fb_ptr after a column of `height` rows (2048 bytes each), it
 * returns to the top row, 16 pixels to the right.
 */
4748#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
4749 mov fb_ptr_advance_column, #32; \
4750 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4751 \
4752 sub fb_ptr_advance_column, height, lsl #11; \
4753 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4754
/* Masks for the last column, taken from the right-edge mask word. */
4755#define setup_sprite_setup_right_draw_mask_fb_ptr() \
4756 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4757 vdup.u8 draw_mask_fb_ptr_right, block_masks[5] \
4758
/*
 * setup_sprite_tile_column_width_single: defines the dispatch target
 * setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode> for
 * sprites that are one tile wide.
 *
 * With a single column the left-edge and right-edge masks both apply to it,
 * so the two words of block_masks are ORed together (vext swaps them, vorr
 * merges) before the draw masks are set up. The column is emitted and the
 * function returns by popping the registers saved in the
 * setup_sprite_tiled_builder prologue.
 */
4759#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
4760 edge, x4mode) \
4761 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
75e28f62
E
4762 setup_sprite_column_data_##multi_height(); \
4763 vext.32 block_masks_shifted, block_masks, block_masks, #1; \
4764 vorr.u32 block_masks, block_masks, block_masks_shifted; \
59d15d23 4765 setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
75e28f62 4766 \
59d15d23 4767 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
75e28f62
E
4768 ldmia sp!, { r4 - r11, pc } \
4769
/*
 * setup_sprite_tiled_advance_column: step texture_offset_base one tile to
 * the right (+0x100). If bits 8-11 wrapped to zero, the carry that spilled
 * into bit 12 is removed again by subtracting 0x1000, so the column index
 * wraps without disturbing the bits above it.
 */
4770#define setup_sprite_tiled_advance_column() \
4771 add texture_offset_base, texture_offset_base, #0x100; \
4772 tst texture_offset_base, #0xF00; \
4773 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4774
/*
 * setup_sprite_tile_column_width_multi: defines the dispatch target
 * setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * for sprites two or more tiles wide.
 *
 *   1. First column with the left-edge masks. In half mode only its right
 *      half is drawn.
 *   2. tile_width - 2 interior columns (loop at label 0) with all draw
 *      masks cleared to zero. Skipped when the sprite is exactly two tiles
 *      wide.
 *   3. Last column (label 1) with the right-edge masks. In half mode only
 *      its left half is drawn.
 *
 * fb_ptr_advance_column moves fb_ptr from the bottom of one column to the
 * top of the next. The function returns by popping the registers saved in
 * the setup_sprite_tiled_builder prologue.
 */
4775#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
59d15d23 4776 right_mode, x4mode) \
4777 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
75e28f62 4778 setup_sprite_column_data_##multi_height(); \
75e28f62 4779 \
59d15d23 4780 setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
75e28f62 4781 \
59d15d23 4782 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
75e28f62
E
4783 \
4784 subs tile_width, tile_width, #2; \
4785 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4786 \
75e28f62
E
4787 beq 1f; \
4788 \
59d15d23 4789 vmov.u8 draw_masks_fb_ptrs, #0; \
4790 vmov.u8 draw_masks_fb_ptrs2, #0; \
4791 \
75e28f62
E
4792 0: \
4793 setup_sprite_tiled_advance_column(); \
59d15d23 4794 setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
75e28f62
E
4795 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4796 subs tile_width, tile_width, #1; \
4797 bne 0b; \
4798 \
4799 1: \
59d15d23 4800 setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
75e28f62
E
4801 \
4802 setup_sprite_tiled_advance_column(); \
59d15d23 4803 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
75e28f62
E
4804 ldmia sp!, { r4 - r11, pc } \
4805
4806
/*
 * Helper hooks used by setup_sprite_tiled_builder, non-4x flavour.
 * No extra fb_ptr/offset adjustment is needed here.
 */
59d15d23 4807#define setup_sprite_offset_u_adjust() \
4808
/*
 * The left edge is tested on byte 0 of the left mask word and the right
 * edge on byte 1 of the right mask word. A value of 0xFF (all eight bits
 * set) makes the builder choose the "half" column variant for that edge.
 */
4809#define setup_sprite_get_left_block_mask() \
4810 and left_block_mask, left_block_mask, #0xFF \
4811
4812#define setup_sprite_compare_left_block_mask() \
4813 cmp left_block_mask, #0xFF \
4814
4815#define setup_sprite_get_right_block_mask() \
4816 uxtb right_block_mask, right_block_mask, ror #8 \
4817
4818#define setup_sprite_compare_right_block_mask() \
4819 cmp right_block_mask, #0xFF \
4820
4821
4822
4823/* 4x stuff */
/* fb_ptr2 borrows column_data's register as a scratch framebuffer pointer;
 * the 4x tile macros spill and reload column_data around that use. */
4824#define fb_ptr2 column_data
4825
/*
 * 4x hook for setup_sprite_tiled_builder: subtracts offset_u * 2 bytes from
 * fb_ptr once more (the builder already did so once), then doubles
 * offset_u and turns offset_u_right into offset_u_right * 2 + 1, so the
 * edge masks are built at twice the horizontal resolution.
 */
4826#define setup_sprite_offset_u_adjust_4x() \
4827 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4828 lsl offset_u_right, #1; \
4829 lsl offset_u, #1; \
4830 add offset_u_right, #1 \
4831
/*
 * The 4x masks are tested 16 bits at a time: the low halfword of the left
 * mask and the high halfword of the right mask. Each is sign-extended so
 * that "all 16 bits set" compares equal to 0xFFFFFFFF.
 */
4832#define setup_sprite_get_left_block_mask_4x() \
4833 sxth left_block_mask, left_block_mask \
4834
4835#define setup_sprite_compare_left_block_mask_4x() \
4836 cmp left_block_mask, #0xFFFFFFFF \
4837
4838#define setup_sprite_get_right_block_mask_4x() \
4839 sxth right_block_mask, right_block_mask, ror #16 \
4840
4841#define setup_sprite_compare_right_block_mask_4x() \
4842 cmp right_block_mask, #0xFFFFFFFF \
4843
4844
/*
 * widen_texels_16bpp(texels_): duplicate each of the four 16-bit elements
 * of texels_ so that texels_wide holds a,a,b,b,c,c,d,d.
 */
4845#define widen_texels_16bpp(texels_) \
4846 vmov texels_wide_low, texels_; \
4847 vmov texels_wide_high, texels_; \
4848 vzip.16 texels_wide_low, texels_wide_high \
4849
/*
 * widen_texels_8bpp(texels_): duplicate each of the eight bytes of texels_;
 * texels_wide_low receives the first four doubled, texels_wide_high the
 * last four.
 */
4850#define widen_texels_8bpp(texels_) \
4851 vmov texels_wide_low, texels_; \
4852 vmov texels_wide_high, texels_; \
4853 vzip.8 texels_wide_low, texels_wide_high \
4854
/*
 * write_block_16bpp: fill one 64-byte block. 16 bytes of pixels go to
 * offset 0. The mask/fb-pointer pair goes to offset 40, with fb_ptr_ placed
 * in its upper word. block_ ends up pointing at the next block.
 */
4855#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4856 vst1.u32 { texels_ }, [ block_, :128 ]; \
4857 add block_, block_, #40; \
4858 \
4859 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4860 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4861 add block_, block_, #24 \
4862
4863/* assumes 16-byte offset already added to block_ */
/*
 * write_block_8bpp: same layout for raw 8bpp texels. 8 bytes go to block
 * offset 16 and the mask/fb-pointer pair to offset 40. block_ ends up at
 * offset 16 of the next block.
 */
4864#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4865 vst1.u32 { texels_ }, [ block_, :64 ]; \
4866 add block_, block_, #24; \
4867 \
4868 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4869 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4870 add block_, block_, #40 \
4871
/*
 * do_texture_block_16bpp_4x: turn eight source pixels (texels_low holds
 * pixels 0-3, texels_high pixels 4-7, 16 bits each) into four blocks. Every
 * pixel is doubled horizontally and every widened group is written for two
 * rows:
 *   pixels 0-3 -> fb_ptr and fb_ptr + 2048,           mask register "a"
 *   pixels 4-7 -> fb_ptr + 16 and fb_ptr + 16 + 2048, mask register "b"
 * fb_ptr_tmp is a scratch core register; fb_ptr itself is not modified.
 */
4872#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4873 draw_mask_fb_ptr_b_) \
4874 widen_texels_16bpp(texels_low); \
4875 add fb_ptr_tmp, fb_ptr, #1024*2; \
4876 \
4877 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
4878 \
4879 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4880 widen_texels_16bpp(texels_high); \
4881 \
4882 add fb_ptr_tmp, fb_ptr, #8*2; \
4883 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4884 \
4885 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4886 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4887
/*
 * do_texture_block_8bpp_4x: same four-block pattern for eight raw 8bpp
 * texels in `texels`. The widened low half goes to the "a" pair of blocks,
 * the widened high half to the "b" pair. Expects `block` to carry the
 * +16 bias that write_block_8bpp assumes.
 */
4888#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4889 draw_mask_fb_ptr_b_) \
4890 widen_texels_8bpp(texels); \
4891 add fb_ptr_tmp, fb_ptr, #1024*2; \
4892 \
4893 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
4894 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4895 \
4896 add fb_ptr_tmp, fb_ptr, #8*2; \
4897 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4898 \
4899 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4900 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4901
4902
/*
 * 4x initialisation. The 4bpp variant loads and de-interleaves the CLUT
 * exactly like the non-4x macro, but neither variant checks the dirty
 * texture masks or refreshes the texture cache.
 * NOTE(review): presumably the cache is brought up to date elsewhere
 * before the 4x functions run - confirm against the callers.
 */
4903#define setup_sprite_tiled_initialize_4bpp_4x() \
4904 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4905 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4906 \
4907 vuzp.u8 clut_a, clut_b \
4908
4909#define setup_sprite_tiled_initialize_8bpp_4x() \
4910
4911
/*
 * Block counts for the 4x tile macros: every source row produces four
 * blocks per 8-texel half, so a half tile needs rows * 4 blocks and a full
 * tile rows * 8.
 */
4912#define setup_sprite_block_count_single_4x() \
4913 sub_tile_height, lsl #2 \
4914
4915#define setup_sprite_block_count_double_4x() \
4916 sub_tile_height, lsl #(1+2) \
4917
/*
 * setup_sprite_tile_full_4bpp_4x(edge): 4x version of the full 4bpp tile.
 * `edge` is unused. Each source row yields eight blocks: both 8-texel
 * halves are looked up through the CLUT, zipped into 16-bit pixels and
 * expanded by do_texture_block_16bpp_4x. The left half is written at
 * fb_ptr, the right half 32 bytes further on.
 *
 * Per source row texture_offset advances by 0x10 and fb_ptr by 4096 bytes
 * in total.
 *
 * column_data is spilled because its register doubles as fb_ptr2. The
 * spill moves sp by 8 bytes and happens after setup_sprite_tile_add_blocks,
 * so a flush call never sees it.
 */
4918#define setup_sprite_tile_full_4bpp_4x(edge) \
4919 setup_sprite_tile_add_blocks(double_4x); \
4920 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4921 \
4922 4: \
4923 and texture_block_ptr, texture_offset, texture_mask; \
4924 pld [ fb_ptr ]; \
4925 \
4926 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4927 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4928 \
4929 add texture_block_ptr, texture_offset, #8; \
4930 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4931 \
4932 and texture_block_ptr, texture_block_ptr, texture_mask; \
4933 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4934 \
4935 vzip.8 texels_low, texels_high; \
4936 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
4937 draw_mask_fb_ptr_left_b); \
4938 \
4939 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
8438c3c7 4940 pld [ fb_ptr, #2048 ]; \
59d15d23 4941 \
4942 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
8438c3c7 4943 add fb_ptr, fb_ptr, #16*2; \
59d15d23 4944 \
8438c3c7 4945 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
59d15d23 4946 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4947 \
4948 vzip.8 texels_low, texels_high; \
4949 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
4950 draw_mask_fb_ptr_right_b); \
4951 \
4952 add texture_offset, texture_offset, #0x10; \
4953 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
4954 \
4955 subs sub_tile_height, sub_tile_height, #1; \
4956 bne 4b; \
4957 \
4958 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4959 add texture_offset, texture_offset, #0xF00; \
4960 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4961
4962
/*
 * setup_sprite_tile_half_4bpp_4x(edge): 4x version of the half 4bpp tile.
 * Each source row yields four blocks: the eight texels are looked up
 * through the CLUT, zipped into 16-bit pixels and expanded by
 * do_texture_block_16bpp_4x using the draw_mask_fb_ptr_<edge>_a/_b
 * registers.
 *
 * Per source row texture_offset advances by 0x10 and fb_ptr by 4096 bytes.
 * After the loop 0xF00 is added to texture_offset and num_blocks is written
 * back to psx_gpu.
 *
 * column_data is spilled because its register doubles as fb_ptr2. The
 * spill moves sp by 8 bytes and happens after setup_sprite_tile_add_blocks,
 * so a flush call never sees it.
 *
 * texture_block_ptr is computed once per row; an earlier second add of
 * texture_page_ptr after the texel load produced a value nothing read and
 * has been dropped.
 */
#define setup_sprite_tile_half_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  add texture_offset, texture_offset, #0x10; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
    draw_mask_fb_ptr_##edge##_b); \
  \
  pld [ fb_ptr, #2048 ]; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4993
4994
/*
 * setup_sprite_tile_full_8bpp_4x(edge): 4x version of the full 8bpp tile.
 * `edge` is unused. Each source row yields eight blocks of raw, doubled
 * texel bytes via do_texture_block_8bpp_4x. The left half is written at
 * fb_ptr, the right half 32 bytes further on.
 *
 * `block` carries a +16 bias inside the loop. Per source row
 * texture_offset advances by 0x10 and fb_ptr by 4096 bytes in total.
 *
 * column_data is spilled because its register doubles as fb_ptr2.
 */
4995#define setup_sprite_tile_full_8bpp_4x(edge) \
4996 setup_sprite_tile_add_blocks(double_4x); \
4997 add block, block, #16; \
4998 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4999 \
5000 4: \
5001 and texture_block_ptr, texture_offset, texture_mask; \
5002 pld [ fb_ptr ]; \
5003 \
5004 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5005 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
5006 \
5007 add texture_block_ptr, texture_offset, #8; \
5008 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
5009 draw_mask_fb_ptr_left_b); \
5010 \
8438c3c7 5011 pld [ fb_ptr, #2048 ]; \
59d15d23 5012 and texture_block_ptr, texture_block_ptr, texture_mask; \
5013 \
5014 add fb_ptr, fb_ptr, #16*2; \
5015 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5016 \
5017 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
59d15d23 5018 \
5019 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
5020 draw_mask_fb_ptr_right_b); \
5021 \
5022 add texture_offset, texture_offset, #0x10; \
5023 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
5024 \
5025 subs sub_tile_height, sub_tile_height, #1; \
5026 bne 4b; \
5027 \
5028 sub block, block, #16; \
5029 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5030 add texture_offset, texture_offset, #0xF00; \
5031 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5032
5033
/*
 * setup_sprite_tile_half_8bpp_4x(edge): 4x version of the half 8bpp tile.
 * Each source row yields four blocks of raw, doubled texel bytes via
 * do_texture_block_8bpp_4x, using the draw_mask_fb_ptr_<edge>_a/_b
 * registers.
 *
 * `block` carries a +16 bias inside the loop. Per source row
 * texture_offset advances by 0x10 and fb_ptr by 4096 bytes.
 *
 * column_data is spilled because its register doubles as fb_ptr2.
 */
5034#define setup_sprite_tile_half_8bpp_4x(edge) \
5035 setup_sprite_tile_add_blocks(single_4x); \
5036 add block, block, #16; \
5037 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5038 \
5039 4: \
5040 and texture_block_ptr, texture_offset, texture_mask; \
5041 pld [ fb_ptr ]; \
5042 \
5043 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5044 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
5045 \
8438c3c7 5046 pld [ fb_ptr, #2048 ]; \
59d15d23 5047 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5048 draw_mask_fb_ptr_##edge##_b); \
5049 \
5050 add texture_offset, texture_offset, #0x10; \
5051 add fb_ptr, fb_ptr, #2048 * 2; \
5052 \
5053 subs sub_tile_height, sub_tile_height, #1; \
5054 bne 4b; \
5055 \
5056 sub block, block, #16; \
5057 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5058 add texture_offset, texture_offset, #0xF00; \
5059 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5060
5061
/*
 * 4x column pre/post adjustment. Same scheme as the non-4x macros, except
 * that the framebuffer skip for a right-half column is 32 bytes, because
 * the eight skipped texels cover sixteen output pixels.
 */
5062#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
5063 add texture_offset, texture_offset_base, #8; \
5064 add fb_ptr, fb_ptr, #16 * 2 \
5065
5066#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
5067 mov texture_offset, texture_offset_base \
5068
5069#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
5070 setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() \
5071
5072#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
5073 mov texture_offset, texture_offset_base \
5074
5075#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
5076 sub fb_ptr, fb_ptr, #16 * 2 \
5077
5078#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() \
5079
5080#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
5081 setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() \
5082
5083#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge) \
5084
5085
/*
 * 4x draw-mask setup: four mask registers per column (left_a, left_b,
 * right_a, right_b), each a splat of one byte of the relevant mask word.
 * Bytes 0-3 of block_masks belong to the left-edge mask word, bytes 4-7 to
 * the right-edge mask word.
 */
5086#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
5087 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5088 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5089 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5090 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5091
/*
 * As above, and also computes fb_ptr_advance_column = 64 - (height << 12):
 * every source row advances fb_ptr by 4096 bytes in the 4x tile macros, and
 * the next column starts 64 bytes to the right.
 */
5092#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
5093 mov fb_ptr_advance_column, #32 * 2; \
5094 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5095 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5096 sub fb_ptr_advance_column, height, lsl #11 + 1; \
5097 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5098 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5099
/* Masks for the last column, taken from the right-edge mask word. */
5100#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
5101 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
5102 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
5103 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
5104 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7] \
5105
5106
75e28f62
E
5107// r0: psx_gpu
5108// r1: x
5109// r2: y
5110// r3: u
5111// [ sp ]: v
5112// [ sp + 4 ]: width
5113// [ sp + 8 ]: height
5114// [ sp + 12 ]: color (unused)
5115
/*
 * setup_sprite_tiled_builder(texture_mode, x4mode): instantiates the
 * function setup_sprite_<texture_mode><x4mode> together with its fourteen
 * specialised bodies (the column_width_single/multi expansions placed
 * before the entry point).
 *
 * The prologue pushes r4-r11 and lr (36 bytes), which is why the stack
 * arguments are read at sp + 36/40/44. It then computes:
 *   - fb_ptr = vram_out_ptr + y * 2048 + x * 2 - offset_u * 2, i.e. aligned
 *     back to the start of the 16-texel tile containing u
 *   - tile_width / tile_height: tiles touched in each direction
 *   - texture_offset_base and texture_mask, both rearranged into the nibble
 *     layouts shown in the inline comments
 *   - the left/right edge mask words, also copied into block_masks
 *   - block = psx_gpu blocks array + num_blocks * 64
 *
 * control_mask then indexes the .word table behind the final ldr pc:
 *   bit 0 (0x1): tile_width == 1
 *   bit 1 (0x2): tile_height == 1
 *   bit 2 (0x4): left-edge compare matched  -> first column is "half"
 *   bit 3 (0x8): right-edge compare matched -> last column is "half"
 * Each target returns with ldmia sp!, { r4 - r11, pc }.
 *
 * NOTE(review): table slot 13 is zero and slot 15 does not exist (single
 * column with both halves masked); presumably unreachable for a non-zero
 * width - confirm callers never pass width 0.
 * NOTE(review): the bodies write d8-d15 (block_masks, texels_low/high,
 * texels_wide, 4x draw masks) and the prologue does not save them, although
 * the AAPCS treats d8-d15 as callee-saved - confirm callers tolerate this.
 */
59d15d23 5116#define setup_sprite_tiled_builder(texture_mode, x4mode) \
5117 \
5118setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
5119 x4mode); \
5120setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
5121 x4mode); \
5122setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
5123 x4mode); \
5124setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
5125 x4mode); \
5126setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
5127 x4mode); \
5128setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
5129 x4mode); \
5130setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
5131 x4mode); \
5132setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
5133 x4mode); \
5134setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
5135 x4mode); \
5136setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
5137 x4mode); \
5138setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
5139 x4mode); \
5140setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
5141 x4mode); \
5142setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
5143 x4mode); \
5144setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
5145 x4mode); \
75e28f62
E
5146 \
5147.align 4; \
5148 \
59d15d23 5149function(setup_sprite_##texture_mode##x4mode) \
75e28f62 5150 stmdb sp!, { r4 - r11, r14 }; \
59d15d23 5151 setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
75e28f62
E
5152 \
5153 ldr v, [ sp, #36 ]; \
5154 and offset_u, u, #0xF; \
5155 \
5156 ldr width, [ sp, #40 ]; \
c1817bd9 5157 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
5158 \
5159 ldr height, [ sp, #44 ]; \
5160 add fb_ptr, fb_ptr, y, lsl #11; \
5161 \
5162 add fb_ptr, fb_ptr, x, lsl #1; \
5163 and offset_v, v, #0xF; \
5164 \
5165 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
5166 add width_rounded, offset_u, width; \
5167 \
5168 add height_rounded, offset_v, height; \
5169 add width_rounded, width_rounded, #15; \
5170 \
5171 add height_rounded, height_rounded, #15; \
5172 mov tile_width, width_rounded, lsr #4; \
5173 \
5174 /* texture_offset_base = VH-VL-00-00 */\
5175 mov texture_offset_base, v, lsl #8; \
5176 and offset_u_right, width_rounded, #0xF; \
5177 \
5178 /* texture_offset_base = VH-UH-UL-00 */\
5179 bfi texture_offset_base, u, #4, #8; \
59d15d23 5180 mov right_block_mask, #0xFFFFFFFE; \
5181 \
5182 setup_sprite_offset_u_adjust##x4mode(); \
75e28f62
E
5183 \
5184 /* texture_offset_base = VH-UH-VL-00 */\
5185 bfi texture_offset_base, v, #4, #4; \
59d15d23 5186 mov left_block_mask, #0xFFFFFFFF; \
75e28f62
E
5187 \
5188 mov tile_height, height_rounded, lsr #4; \
5189 mvn left_block_mask, left_block_mask, lsl offset_u; \
5190 \
5191 /* texture_mask = HH-HL-WH-WL */\
5192 ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
5193 mov right_block_mask, right_block_mask, lsl offset_u_right; \
5194 \
5195 /* texture_mask_rev = WH-WL-HH-HL */\
5196 rev16 texture_mask_rev, texture_mask; \
5197 vmov block_masks, left_block_mask, right_block_mask; \
5198 \
5199 /* texture_mask = HH-HL-HL-WL */\
5200 bfi texture_mask, texture_mask_rev, #4, #4; \
5201 /* texture_mask_rev = 00-00-00-WH */\
5202 mov texture_mask_rev, texture_mask_rev, lsr #12; \
5203 \
5204 /* texture_mask = HH-WH-HL-WL */\
5205 bfi texture_mask, texture_mask_rev, #8, #4; \
59d15d23 5206 setup_sprite_get_left_block_mask##x4mode(); \
75e28f62
E
5207 \
5208 mov control_mask, #0; \
59d15d23 5209 setup_sprite_compare_left_block_mask##x4mode(); \
75e28f62 5210 \
59d15d23 5211 setup_sprite_get_right_block_mask##x4mode(); \
75e28f62
E
5212 orreq control_mask, control_mask, #0x4; \
5213 \
5214 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
59d15d23 5215 setup_sprite_compare_right_block_mask##x4mode(); \
75e28f62
E
5216 \
5217 orreq control_mask, control_mask, #0x8; \
5218 cmp tile_width, #1; \
5219 \
5220 add block, psx_gpu, #psx_gpu_blocks_offset; \
5221 orreq control_mask, control_mask, #0x1; \
5222 \
5223 cmp tile_height, #1; \
5224 add block, block, num_blocks, lsl #6; \
5225 \
5226 orreq control_mask, control_mask, #0x2; \
 /* ARM state: pc reads as this instruction + 8, i.e. the first .word */ \
5227 ldr pc, [ pc, control_mask, lsl #2 ]; \
5228 nop; \
5229 \
59d15d23 5230 .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \
5231 .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \
5232 .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \
5233 .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \
5234 .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \
5235 .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \
5236 .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \
5237 .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \
5238 .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \
5239 .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \
5240 .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \
5241 .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \
5242 .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \
75e28f62 5243 .word 0x00000000; \
59d15d23 5244 .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \
5245
5246
/* Standard-resolution entry points: setup_sprite_4bpp, setup_sprite_8bpp
 * (the empty second argument leaves every x4mode suffix blank). */
5247setup_sprite_tiled_builder(4bpp,);
5248setup_sprite_tiled_builder(8bpp,);
75e28f62 5249
/* The two-register mask aliases are dropped here; the 4x code only uses
 * the draw_mask_fb_ptr_{left,right}_{a,b} names. */
59d15d23 5250#undef draw_mask_fb_ptr_left
5251#undef draw_mask_fb_ptr_right
75e28f62 5252
/* 4x entry points: setup_sprite_4bpp_4x, setup_sprite_8bpp_4x. */
59d15d23 5253setup_sprite_tiled_builder(4bpp, _4x);
5254setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5255
5256
/*
 * Register aliases for texture_sprite_blocks_8bpp. psx_gpu and block_ptr
 * share r0, and each packed texels_NN name reuses the register of the
 * even-numbered texel_N value it is built from.
 */
5257#undef block_ptr
5258#undef num_blocks
5259#undef clut_ptr
5260
5261#define psx_gpu r0
5262#define block_ptr r0
5263#define num_blocks r1
5264#define clut_ptr r2
5265#define texel_shift_mask r3
5266#define block_pixels_a r4
5267#define block_pixels_b r5
5268#define texel_0 r6
5269#define texel_2 r7
5270#define texel_4 r8
5271#define texel_6 r9
5272#define texel_1 r10
5273#define texel_3 r11
5274#define texel_5 r12
5275#define texel_7 r14
5276#define texels_01 r6
5277#define texels_23 r7
5278#define texels_45 r8
5279#define texels_67 r9
5280
/*
 * texture_sprite_blocks_8bpp
 *
 * Walks the queued blocks starting at psx_gpu + psx_gpu_blocks_offset
 * (64 bytes apart, num_blocks of them). For each block it reads eight 8-bit
 * indices from block bytes 16..23, looks each one up as a 16-bit entry in
 * the table at clut_ptr, and writes the eight halfwords to block bytes 0..15.
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, flags (r4-r11 are restored; lr is saved and popped
 *        into pc)
 *
 * NOTE(review): the loop is bottom-tested, so num_blocks == 0 would still
 * execute the body and keep going until the counter wraps - confirm that
 * callers never invoke this with an empty block list.
 */
5281function(texture_sprite_blocks_8bpp)
5282 stmdb sp!, { r4 - r11, r14 }
// (index << 1) mask: turns an 8-bit index into a byte offset into a halfword table
5283 movw texel_shift_mask, #(0xFF << 1)
5284
5285 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5286 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
5287
// r0 stops being psx_gpu here and becomes the running block pointer
5288 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
5289 ldr block_pixels_a, [ block_ptr, #16 ]
5290
5291 0:
5292 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5293 ldr block_pixels_b, [ block_ptr, #20 ]
5294
5295 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5296 ldrh texel_0, [ clut_ptr, texel_0 ]
5297
5298 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5299 ldrh texel_1, [ clut_ptr, texel_1 ]
5300
5301 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
// Preload indices 0-3 of the next block. This also executes on the final
// iteration, so it reads from one block past the last one processed.
5302 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
5303
5304 ldrh texel_2, [ clut_ptr, texel_2 ]
5305 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5306
5307 ldrh texel_3, [ clut_ptr, texel_3 ]
5308 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5309
5310 ldrh texel_4, [ clut_ptr, texel_4 ]
5311 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5312
5313 ldrh texel_5, [ clut_ptr, texel_5 ]
5314 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5315
5316 ldrh texel_6, [ clut_ptr, texel_6 ]
// Pack pairs of looked-up halfwords into words (even texel in the low half)
5317 orr texels_01, texel_0, texel_1, lsl #16
5318
5319 ldrh texel_7, [ clut_ptr, texel_7 ]
5320 orr texels_23, texel_2, texel_3, lsl #16
5321
5322 orr texels_45, texel_4, texel_5, lsl #16
5323 str texels_01, [ block_ptr, #0 ]
5324
5325 orr texels_67, texel_6, texel_7, lsl #16
5326 str texels_23, [ block_ptr, #4 ]
5327
// Sets the flags consumed by the bne below; the str/add in between leave them alone
5328 subs num_blocks, num_blocks, #1
5329 str texels_45, [ block_ptr, #8 ]
5330
5331 str texels_67, [ block_ptr, #12 ]
5332 add block_ptr, block_ptr, #64
5333
5334 bne 0b
5335
5336 ldmia sp!, { r4 - r11, pc }
5337
5338
/*
 * Register aliases shared by the 16bpp sprite setup routines that follow.
 * Several names map to the same physical register (for example x and
 * texture_offset_base are both r1; v, num_blocks and left_mask_bits are all
 * r4), so each name is only meaningful while that value is live.
 */
5339#undef width_rounded
5340#undef texture_mask
5341#undef num_blocks
5342#undef texture_offset
59d15d23 5343#undef texels_low
5344#undef texels_high
5345#undef texels_wide_low
5346#undef texels_wide_high
5347#undef texels_wide
5348#undef fb_ptr2
75e28f62
E
5349
5350#define psx_gpu r0
5351#define x r1
5352#define y r2
5353#define u r3
5354#define v r4
5355#define width r5
5356#define height r6
5357#define left_offset r8
5358#define width_rounded r9
5359#define right_width r10
59d15d23 5360
75e28f62
E
5361#define block_width r11
5362
5363#define texture_offset_base r1
5364#define texture_mask r2
5365#define texture_page_ptr r3
5366#define num_blocks r4
5367#define block r5
5368#define fb_ptr r7
5369#define texture_offset r8
5370#define blocks_remaining r9
59d15d23 5371#define fb_ptr2 r10
75e28f62
E
5372#define fb_ptr_pitch r12
5373#define texture_block_ptr r14
5374
5375#define texture_mask_width r2
5376#define texture_mask_height r3
5377#define left_mask_bits r4
5378#define right_mask_bits r5
5379
5380
5381#undef block_masks
5382#undef block_masks_shifted
5383#undef texels
5384
5385#define block_masks d0
5386#define block_masks_shifted d1
5387#define draw_mask_fb_ptr d2
5388#define texels q2
5389
59d15d23 5390#define draw_mask_fb_ptr_a d2
5391#define draw_mask_fb_ptr_b d3
5392#define texels_low d4
5393#define texels_high d5
5394#define texels_wide_low d6
5395#define texels_wide_high d7
5396#define texels_wide q3
75e28f62 5397
75e28f62 5398
/*
 * setup_sprites_16bpp_flush
 *
 * Local helper reached via `blgt` from the sprite setup routines when the
 * updated num_blocks exceeds MAX_BLOCKS. It calls flush_render_block_buffer
 * with r0-r3, r12, lr and d0-d3 preserved around the call, then rewinds
 * `block` to the start of the block array (psx_gpu + psx_gpu_blocks_offset)
 * and sets num_blocks to block_width, i.e. the blocks about to be written.
 *
 * In:    r0 = psx_gpu, r11 = block_width
 * Out:   r5 (block) and r4 (num_blocks) reset as described
 * Flags and any NEON register outside d0-d3 must be treated as clobbered by
 * the callee, which is not visible here.
 *
 * NOTE(review): the callers push nine words and this helper pushes another
 * 56 bytes, so sp is not 8-byte aligned at the `bl` if it was 8-byte aligned
 * on entry to the caller - confirm the callee tolerates that.
 */
59d15d23 5399setup_sprites_16bpp_flush:
5400 vpush { d0 - d3 }
75e28f62
E
5401
5402 stmdb sp!, { r0 - r3, r12, r14 }
5403 bl flush_render_block_buffer
5404 ldmia sp!, { r0 - r3, r12, r14 }
5405
59d15d23 5406 vpop { d0 - d3 }
75e28f62
E
5407
5408 add block, psx_gpu, #psx_gpu_blocks_offset
5409 mov num_blocks, block_width
5410
5411 bx lr
5412
/*
 * setup_sprite_16bpp
 *
 * Queues render blocks for a sprite textured from 16bpp data, one 8-texel
 * (16-byte) block at a time, row by row.
 *
 * In:    r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *        stack at entry: [sp] = v, [sp, #4] = width, [sp, #8] = height
 *        (read at sp + 36/40/44 after the nine-register push)
 *
 * Derived values:
 *   left_offset         = u & 7
 *   block_width         = (width + 7 + left_offset) >> 3   blocks per row
 *   fb_ptr              = vram_out_ptr + y * 2048 + x * 2 - left_offset * 2
 *   texture_offset_base = (u * 2 + v * 2048) with the low four bits cleared
 *   texture_mask        = texture_mask_width * 2 + (texture_mask_height << 11)
 *   left_mask_bits      = ~(0xFF << left_offset)
 *   right_mask_bits     = 0xFE << ((width + 7 + left_offset) & 7)
 *
 * Each queued block is 64 bytes: the 16 bytes of texels go to offset 0, and
 * offset 40 receives a doubleword whose low word is the replicated mask byte
 * and whose high word is the framebuffer pointer.
 *
 * num_blocks is written back to psx_gpu after every row; the block buffer is
 * flushed through setup_sprites_16bpp_flush when a row would not fit.
 */
5413function(setup_sprite_16bpp)
5414 stmdb sp!, { r4 - r11, r14 }
c1817bd9 5415 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
5416
5417 ldr v, [ sp, #36 ]
5418 add fb_ptr, fb_ptr, y, lsl #11
5419
5420 ldr width, [ sp, #40 ]
5421 add fb_ptr, fb_ptr, x, lsl #1
5422
5423 ldr height, [ sp, #44 ]
5424 and left_offset, u, #0x7
5425
// r1 is reused from here on: x has been consumed, it now holds the texture offset
5426 add texture_offset_base, u, u
5427 add width_rounded, width, #7
5428
5429 add texture_offset_base, v, lsl #11
5430 mov left_mask_bits, #0xFF
5431
5432 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5433 add width_rounded, width_rounded, left_offset
5434
5435 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
// Back the framebuffer pointer up to the 8-texel boundary the first block starts on
5436 sub fb_ptr, fb_ptr, left_offset, lsl #1
5437
5438 add texture_mask, texture_mask_width, texture_mask_width
5439 mov right_mask_bits, #0xFE
5440
5441 and right_width, width_rounded, #0x7
5442 mvn left_mask_bits, left_mask_bits, lsl left_offset
5443
5444 add texture_mask, texture_mask_height, lsl #11
5445 mov block_width, width_rounded, lsr #3
5446
5447 mov right_mask_bits, right_mask_bits, lsl right_width
// Row pitch: after a row fb_ptr has advanced (block_width - 1) * 16 bytes,
// so adding 2048 + 16 - block_width * 16 lands on the start of the next row.
5448 movw fb_ptr_pitch, #(2048 + 16)
5449
5450 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
// block_masks: low word = left mask, high word = right mask
5451 vmov block_masks, left_mask_bits, right_mask_bits
5452
5453 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5454 add block, psx_gpu, #psx_gpu_blocks_offset
5455
6ea0f7bf 5456 bic texture_offset_base, texture_offset_base, #0xF
75e28f62
E
// Flags from this compare are consumed by the bne below (ldr/add leave them intact)
5457 cmp block_width, #1
5458
5459 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5460 add block, block, num_blocks, lsl #6
5461
5462 bne 0f
5463
// --- Single block per row: one mask byte holds left | right for every row ---
5464 vext.32 block_masks_shifted, block_masks, block_masks, #1
5465 vorr.u32 block_masks, block_masks, block_masks_shifted
5466 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5467
5468 1:
5469 add num_blocks, num_blocks, #1
5470 cmp num_blocks, #MAX_BLOCKS
59d15d23 5471 blgt setup_sprites_16bpp_flush
75e28f62
E
5472
5473 and texture_block_ptr, texture_offset_base, texture_mask
// Row counter; the flags survive until the bne at the bottom of this loop
5474 subs height, height, #1
5475
5476 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5477 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5478
5479 vst1.u32 { texels }, [ block, :128 ]
5480 add block, block, #40
5481
5482 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5483 pld [ fb_ptr ]
5484
5485 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5486
5487 add block, block, #24
5488 add texture_offset_base, texture_offset_base, #2048
5489 add fb_ptr, fb_ptr, #2048
5490 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5491 bne 1b
5492
5493 ldmia sp!, { r4 - r11, pc }
5494
// --- Two or more blocks per row: left-masked first block, zero-mask middle
// --- blocks, right-masked last block ---
5495 0:
5496 add num_blocks, num_blocks, block_width
5497 mov texture_offset, texture_offset_base
5498
5499 cmp num_blocks, #MAX_BLOCKS
59d15d23 5500 blgt setup_sprites_16bpp_flush
75e28f62
E
5501
5502 add texture_offset_base, texture_offset_base, #2048
5503 and texture_block_ptr, texture_offset, texture_mask
5504
5505 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5506 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5507
5508 vst1.u32 { texels }, [ block, :128 ]
5509 add block, block, #40
5510
// First block of the row: low byte of the left mask, replicated
5511 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5512 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5513 pld [ fb_ptr ]
5514
5515 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// Number of middle blocks; zero means skip straight to the last block
5516 subs blocks_remaining, block_width, #2
5517
5518 add texture_offset, texture_offset, #16
5519 add fb_ptr, fb_ptr, #16
5520
5521 vmov.u8 draw_mask_fb_ptr, #0
5522
5523 add block, block, #24
5524 beq 2f
5525
5526 1:
5527 and texture_block_ptr, texture_offset, texture_mask
5528 subs blocks_remaining, blocks_remaining, #1
5529
5530 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5531 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5532
5533 vst1.u32 { texels }, [ block, :128 ]
5534 add block, block, #40
5535
5536 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5537 pld [ fb_ptr ]
5538
5539 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5540
5541 add texture_offset, texture_offset, #16
5542 add fb_ptr, fb_ptr, #16
5543
5544 add block, block, #24
5545 bne 1b
5546
5547 2:
5548 and texture_block_ptr, texture_offset, texture_mask
5549 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5550
5551 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
// Last block of the row: byte lane 4 is the low byte of the right mask
5552 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5553
5554 vst1.u32 { texels }, [ block, :128 ]
5555 add block, block, #40
5556
5557 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5558 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5559
5560 add block, block, #24
5561 subs height, height, #1
5562
5563 add fb_ptr, fb_ptr, fb_ptr_pitch
5564 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5565
5566 bne 0b
5567
5568 ldmia sp!, { r4 - r11, pc }
5569
5570
59d15d23 5571// 4x version
5572// FIXME: duplicate code with normal version :(
5573#undef draw_mask_fb_ptr
5574
/*
 * setup_sprite_16bpp_4x
 *
 * Same register arguments and stack layout as setup_sprite_16bpp
 * (r0 = psx_gpu, r1 = x, r2 = y, r3 = u; v, width, height on the stack),
 * with these differences visible in the code below:
 *   - the masks are 16 bits wide (0xFFFF / 0xFFFC seeds) and left_offset and
 *     right_width are doubled before being used as shift counts;
 *   - the framebuffer steps are doubled (32 bytes per block horizontally,
 *     2 * 2048 bytes per source row);
 *   - block_width is multiplied by four after the `cmp block_width, #1`, so
 *     every 8-texel source block accounts for four queued blocks;
 *   - the per-block stores are done by the do_texture_block_16bpp_4x macro,
 *     which is defined outside this section.
 *
 * NOTE(review): several `subs` results are branched on only after a
 * do_texture_block_16bpp_4x invocation, so that macro is presumably
 * flag-neutral - confirm against its definition.
 */
5575function(setup_sprite_16bpp_4x)
5576 stmdb sp!, { r4 - r11, r14 }
5577 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5578
5579 ldr v, [ sp, #36 ]
5580 add fb_ptr, fb_ptr, y, lsl #11
5581
5582 ldr width, [ sp, #40 ]
5583 add fb_ptr, fb_ptr, x, lsl #1
5584
5585 ldr height, [ sp, #44 ]
5586 and left_offset, u, #0x7
5587
5588 add texture_offset_base, u, u
5589 add width_rounded, width, #7
5590
5591 add texture_offset_base, v, lsl #11
5592 movw left_mask_bits, #0xFFFF
5593
5594 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5595 add width_rounded, width_rounded, left_offset
5596
// From here on left_offset is (u & 7) * 2; width_rounded above used the undoubled value
5597 lsl left_offset, #1
5598
5599 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
5600 sub fb_ptr, fb_ptr, left_offset, lsl #1
5601
5602 add texture_mask, texture_mask_width, texture_mask_width
5603 movw right_mask_bits, #0xFFFC
5604
5605 and right_width, width_rounded, #0x7
5606 mvn left_mask_bits, left_mask_bits, lsl left_offset
5607
5608 lsl right_width, #1
5609
5610 add texture_mask, texture_mask_height, lsl #11
5611 mov block_width, width_rounded, lsr #3
5612
5613 mov right_mask_bits, right_mask_bits, lsl right_width
5614 movw fb_ptr_pitch, #(2048 + 16) * 2
5615
// Uses the unscaled block_width (source blocks per row), shifted by 5
5616 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5617 vmov block_masks, left_mask_bits, right_mask_bits
5618
5619 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5620 add block, psx_gpu, #psx_gpu_blocks_offset
5621
5622 bic texture_offset_base, texture_offset_base, #0xF
5623 cmp block_width, #1
5624
5625 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5626 add block, block, num_blocks, lsl #6
5627
// Non-flag-setting shift: the bne still tests the compare against 1 above
5628 lsl block_width, #2
5629 bne 0f
5630
// --- Single source block per row: masks are left | right, split into two bytes ---
5631 vext.32 block_masks_shifted, block_masks, block_masks, #1
5632 vorr.u32 block_masks, block_masks, block_masks_shifted
5633 vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5634 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5635
5636 1:
5637 add num_blocks, num_blocks, block_width
5638 cmp num_blocks, #MAX_BLOCKS
5639 blgt setup_sprites_16bpp_flush
5640
5641 and texture_block_ptr, texture_offset_base, texture_mask
5642 subs height, height, #1
5643
5644 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5645 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5646
5647 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5648
5649 add texture_offset_base, texture_offset_base, #2048
5650 add fb_ptr, fb_ptr, #2048*2
5651 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5652 bne 1b
5653
5654 ldmia sp!, { r4 - r11, pc }
5655
// --- Two or more source blocks per row ---
5656 0:
5657 add num_blocks, num_blocks, block_width
5658 mov texture_offset, texture_offset_base
5659
5660 vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5661 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5662
5663 cmp num_blocks, #MAX_BLOCKS
5664 blgt setup_sprites_16bpp_flush
5665
5666 add texture_offset_base, texture_offset_base, #2048
5667 and texture_block_ptr, texture_offset, texture_mask
5668
5669 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5670 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5671
5672 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5673
// block_width is already x4 here, so the first and last source blocks count as 2*4
5674 subs blocks_remaining, block_width, #2*4
5675 add texture_offset, texture_offset, #16
5676
5677 vmov.u8 draw_mask_fb_ptr_a, #0
5678 vmov.u8 draw_mask_fb_ptr_b, #0
5679
5680 add fb_ptr, fb_ptr, #16*2
5681 beq 2f
5682
5683 1:
5684 and texture_block_ptr, texture_offset, texture_mask
5685 subs blocks_remaining, blocks_remaining, #4
5686
5687 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5688 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5689
5690 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5691 add texture_offset, texture_offset, #16
5692
5693 add fb_ptr, fb_ptr, #16*2
5694 bgt 1b
5695
5696 2:
5697 vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5698 vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5699
5700 and texture_block_ptr, texture_offset, texture_mask
5701 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5702
5703 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5704
5705 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5706 subs height, height, #1
5707
5708 add fb_ptr, fb_ptr, fb_ptr_pitch
5709 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5710
5711 bne 0b
5712
5713 ldmia sp!, { r4 - r11, pc }
5714
5715
/*
 * Register aliases for setup_sprite_untextured. num_blocks, block and
 * block_width keep the registers (r4, r5, r11) that
 * setup_sprites_16bpp_flush writes and reads.
 */
f0931e56 5716#undef width
5717#undef right_width
5718#undef right_mask_bits
5719#undef color
5720#undef height
5721#undef blocks_remaining
5722#undef colors
5723#undef right_mask
5724#undef test_mask
5725#undef draw_mask
5726
5727#define psx_gpu r0
5728#define x r1
5729#define y r2
5730#define width r3
5731#define right_width r5
5732#define right_mask_bits r6
5733#define fb_ptr r7
5734#define color r8
5735#define height r9
5736#define fb_ptr_pitch r12
5737
5738// referenced by setup_sprites_16bpp_flush
5739#define num_blocks r4
5740#define block r5
5741#define block_width r11
5742
5743#define color_r r1
5744#define color_g r2
5745#define color_b r8
5746#define blocks_remaining r6
5747
5748#define colors q0
5749#define right_mask q1
5750#define test_mask q2
5751#define draw_mask q2
5752#define draw_mask_bits_fb_ptr d6
5753
5754
5755.align 3
5756
/*
 * setup_sprite_untextured
 *
 * Queues render blocks for a solid-colour sprite.
 *
 * In:    r0 = psx_gpu, r1 = x, r2 = y
 *        stack at entry: [sp, #4] = width, [sp, #8] = height,
 *        [sp, #12] = color (read at sp + 40/44/48 after the push)
 *
 * Fast path: when none of RENDER_STATE_MASK_EVALUATE,
 * RENDER_FLAGS_MODULATE_TEXELS and RENDER_FLAGS_BLEND is set in the halfword
 * at psx_gpu_render_state_offset, and RENDER_INTERLACE_ENABLED is clear in
 * the render mode byte, control is transferred to
 * setup_sprite_untextured_simple with the argument registers and the stack
 * untouched.
 *
 * Otherwise each 64-byte block receives a 16-byte mask at offset 0 (zero for
 * every block but the last of a row, right_mask for the last), the
 * replicated 15-bit colour at offset 16, and at offset 40 a doubleword
 * whose high word is the framebuffer pointer.
 *
 * The colour is packed from bits 3-7, 11-15 and 19-23 of `color` as
 * r | g << 5 | b << 10.
 */
5757function(setup_sprite_untextured)
5758 ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
5759 tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
5760 | RENDER_FLAGS_BLEND)
d5c08ed3 5761 ldreqb r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
5762 tsteq r12, #RENDER_INTERLACE_ENABLED
f0931e56 5763 beq setup_sprite_untextured_simple
5764
5765 stmdb sp!, { r4 - r11, r14 }
5766
5767 ldr width, [ sp, #40 ]
5768 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5769
5770 ldr height, [ sp, #44 ]
5771 add fb_ptr, fb_ptr, y, lsl #11
5772
5773 add fb_ptr, fb_ptr, x, lsl #1
// right_width = ((width - 1) & 7) + 1, i.e. 1..8
5774 sub right_width, width, #1
5775
5776 ldr color, [ sp, #48 ]
5777 and right_width, #7
5778
// block_width = (width + 7) >> 3 blocks per row
5779 add block_width, width, #7
5780 add right_width, #1
5781
5782 lsr block_width, #3
5783 mov right_mask_bits, #0xff
5784
// fb_ptr_pitch = 2048 - (block_width - 1) * 16: steps from the last block of
// a row to the first block of the next row
5785 sub fb_ptr_pitch, block_width, #1
5786 lsl right_mask_bits, right_width
5787
5788 lsl fb_ptr_pitch, #3+1
5789 ubfx color_r, color, #3, #5
5790
5791 rsb fb_ptr_pitch, #1024*2
5792 ubfx color_g, color, #11, #5
5793
// Loads 16 bytes from the very start of the psx_gpu structure.
// NOTE(review): presumably a per-lane bit table used to expand
// right_mask_bits into a 16-bit-per-pixel mask - confirm against the
// psx_gpu struct layout.
5794 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
5795 ubfx color_b, color, #19, #5
5796
5797 vdup.u16 right_mask, right_mask_bits
5798 orr color, color_r, color_b, lsl #10
5799
5800 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5801 orr color, color, color_g, lsl #5
5802
5803 vtst.u16 right_mask, right_mask, test_mask
5804 add block, psx_gpu, #psx_gpu_blocks_offset
5805
5806 vdup.u16 colors, color
5807 add block, block, num_blocks, lsl #6
5808
5809
// One iteration per sprite row
5810setup_sprite_untextured_height_loop:
5811 add num_blocks, block_width
5812 sub blocks_remaining, block_width, #1
5813
5814 cmp num_blocks, #MAX_BLOCKS
5815 blgt setup_sprites_16bpp_flush
5816
5817 cmp blocks_remaining, #0
5818 ble 1f
5819
// NOTE(review): these two registers are only zeroed when the row has more
// than one block. With block_width == 1 the low word of d6 stored at block
// offset 40 is whatever d6 held before - presumably the consumer ignores
// that word for untextured blocks; confirm.
5820 vmov.u8 draw_mask, #0 /* zero_mask */
5821 vmov.u8 draw_mask_bits_fb_ptr, #0
5822
5823 0:
5824 vst1.u32 { draw_mask }, [ block, :128 ]!
5825 subs blocks_remaining, #1
5826
5827 vst1.u32 { colors }, [ block, :128 ]
5828 add block, block, #24
5829
5830 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5831 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5832
5833 add block, block, #24
5834 add fb_ptr, #8*2
5835 bgt 0b
5836
// Last (or only) block of the row carries the right-edge mask
5837 1:
5838 vst1.u32 { right_mask }, [ block, :128 ]!
5839 subs height, #1
5840
5841 vst1.u32 { colors }, [ block, :128 ]
5842 add block, block, #24
5843
5844 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5845 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5846
5847 add block, block, #24
5848 add fb_ptr, fb_ptr_pitch
5849
5850 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5851 bgt setup_sprite_untextured_height_loop
5852
5853 ldmia sp!, { r4 - r11, pc }
5854
5855
5856
75e28f62
E
/*
 * Register aliases for update_texture_4bpp_cache. The *_ab and *_cd names
 * are the OR-combined results and reuse q2 and q3. texel_block_expanded_d
 * lives in q4 (d8/d9), which is callee-saved under the ARM procedure call
 * standard and therefore has to be preserved by the routine.
 */
5857#undef texture_page_ptr
5858#undef vram_ptr
5859#undef dirty_textures_mask
5860#undef current_texture_mask
5861
5862#define psx_gpu r0
5863#define current_texture_page r1
5864#define texture_page_ptr r2
5865#define vram_ptr_a r3
5866#define current_texture_page_x r12
5867#define current_texture_page_y r4
5868#define dirty_textures_mask r5
5869#define tile_y r6
5870#define tile_x r7
5871#define sub_y r8
5872#define current_texture_mask r9
5873#define c_4096 r10
5874#define vram_ptr_b r11
5875
5876#define texel_block_a d0
5877#define texel_block_b d1
5878#define texel_block_expanded_a q1
5879#define texel_block_expanded_b q2
5880#define texel_block_expanded_ab q2
5881#define texel_block_expanded_c q3
5882#define texel_block_expanded_d q4
5883#define texel_block_expanded_cd q3
5884
/*
 * update_texture_4bpp_cache
 *
 * Clears the current_texture_mask bits from the 4bpp dirty-texture mask and
 * then expands the current texture page from VRAM into the buffer at
 * psx_gpu_texture_page_base_offset. Every source byte becomes two output
 * bytes: its low nibble followed by its high nibble.
 *
 * Source addressing: the page starts at
 *   vram_ptr + (page >> 4 << 19) + ((page & 0xF) << 7)
 * Loop nest (innermost first):
 *   sub_y  : 8 iterations, each reading 8 bytes from two rows 2048 bytes
 *            apart and then stepping both pointers by 4096 (16 rows total),
 *            writing 32 bytes to the destination;
 *   tile_x : 16 iterations, stepping 8 bytes to the right;
 *   tile_y : 16 iterations, stepping 16 rows down.
 *
 * In:    r0 = psx_gpu
 * Clobb: r1-r3, r12, flags. r4-r11 and q0-q4 are saved and restored.
 *
 * q4 is included in the NEON save/restore because the expansion below writes
 * it and d8-d15 must survive a call; the routine reads no stack arguments,
 * so the larger save area does not affect any offsets.
 */
5885function(update_texture_4bpp_cache)
5886 stmdb sp!, { r4 - r11, r14 }
5887 vpush { q0 - q4 }
5888
5889 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5890
3867c6ef 5891 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
75e28f62
E
5892 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5893
5894 and current_texture_page_x, current_texture_page, #0xF
5895 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
5896
5897 mov current_texture_page_y, current_texture_page, lsr #4
5898 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5899
5900 add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5901 mov tile_y, #16
5902
5903 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
// Mark the textures selected by current_texture_mask as no longer dirty
5904 bic dirty_textures_mask, current_texture_mask
5905
5906 mov tile_x, #16
5907 str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5908
5909 mov sub_y, #8
5910 movw c_4096, #4096
5911
// vram_ptr_b tracks the row directly below vram_ptr_a
5912 add vram_ptr_b, vram_ptr_a, #2048
5913
5914 0:
5915 vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
5916 vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096
5917
// Widen each byte to 16 bits twice: once as-is, once shifted left by four.
// Clearing bits 4-7 of both leaves the low nibble in byte 0 of the first
// copy and the high nibble in byte 1 of the second; OR-ing them yields the
// two-bytes-per-source-byte layout.
5918 vmovl.u8 texel_block_expanded_a, texel_block_a
5919 vshll.u8 texel_block_expanded_b, texel_block_a, #4
5920 vmovl.u8 texel_block_expanded_c, texel_block_b
5921 vshll.u8 texel_block_expanded_d, texel_block_b, #4
5922
5923 vbic.u16 texel_block_expanded_a, #0x00F0
5924 vbic.u16 texel_block_expanded_b, #0x00F0
5925 vbic.u16 texel_block_expanded_c, #0x00F0
5926 vbic.u16 texel_block_expanded_d, #0x00F0
5927
5928 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
5929 texel_block_expanded_b
5930 vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
5931 texel_block_expanded_d
5932
5933 vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
5934 [ texture_page_ptr, :256 ]!
5935
5936 subs sub_y, sub_y, #1
5937 bne 0b
5938
// Column done: rewind the 16 rows just walked and move 8 bytes to the right
5939 mov sub_y, #8
5940 add vram_ptr_a, vram_ptr_a, #8
5941 add vram_ptr_b, vram_ptr_b, #8
5942
5943 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5944 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5945
5946 subs tile_x, tile_x, #1
5947 bne 0b
5948
// Row of tiles done: move down 16 rows and back to the left edge of the page
5949 mov tile_x, #16
5950 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5951 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5952
5953 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5954 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5955
5956 subs tile_y, tile_y, #1
5957 bne 0b
5958
5959 vpop { q0 - q4 }
5960 ldmia sp!, { r4 - r11, pc }
5961
5962
/*
 * Register aliases for update_texture_8bpp_cache_slice. texture_page (r1)
 * is the incoming second argument; current_texture_page (r5) is loaded from
 * psx_gpu.
 */
5963#undef current_texture_page
5964
5965#define psx_gpu r0
5966#define texture_page r1
5967#define texture_page_ptr r2
5968#define vram_ptr_a r3
5969#define texture_page_x r12
5970#define texture_page_y r4
5971#define current_texture_page r5
5972#define tile_y r6
5973#define tile_x r7
5974#define sub_y r8
5975#define c_4096 r10
5976#define vram_ptr_b r11
5977
5978
5979#undef texels_a
5980#undef texels_b
5981
5982#define texels_a q0
5983#define texels_b q1
5984#define texels_c q2
5985#define texels_d q3
5986
5987
/*
 * update_texture_8bpp_cache_slice
 *
 * Copies one VRAM texture page into the buffer at
 * psx_gpu_texture_page_base_offset, rearranged tile by tile.
 *
 * In:    r0 = psx_gpu, r1 = texture_page
 * Clobb: r1-r3, r12, flags. r4-r11 and q0-q3 are saved and restored.
 *
 * Source addressing: vram_ptr + ((page & 0xF) << 7) + (page >> 4 << 19).
 * When texture_page and the current texture page differ in bit 0, the
 * destination starts 8 * 16 * 16 bytes into the buffer.
 *
 * Loop nest (innermost first):
 *   sub_y  : 4 iterations, each copying 16 bytes from four consecutive rows
 *            (64 bytes stored per iteration, 16 rows in total);
 *   tile_x : 8 iterations, stepping 16 bytes to the right;
 *   tile_y : 16 iterations, stepping 16 rows down; after each one the
 *            destination additionally skips 8 * 16 * 16 bytes.
 */
5988function(update_texture_8bpp_cache_slice)
5989 stmdb sp!, { r4 - r11, r14 }
5990 vpush { q0 - q3 }
5991
5992 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5993 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5994
3867c6ef 5995 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
75e28f62
E
5996 mov tile_y, #16
5997
5998 and texture_page_x, texture_page, #0xF
5999 mov texture_page_y, texture_page, lsr #4
6000
6001 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
6002 mov tile_x, #8
6003
6004 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6005 eor current_texture_page, current_texture_page, texture_page
6006
// Bit 0 of (current ^ requested) selects which half of the buffer is written
6007 ands current_texture_page, current_texture_page, #0x1
6008 mov sub_y, #4
6009
6010 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6011 movw c_4096, #4096
6012
// vram_ptr_b tracks the row directly below vram_ptr_a
6013 add vram_ptr_b, vram_ptr_a, #2048
6014
6015 0:
// Rows n, n+1, n+2, n+3: both pointers step two rows (4096 bytes) per load
6016 vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
6017 vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
6018 vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
6019 vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
6020
6021 vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
6022 vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
6023
6024 subs sub_y, sub_y, #1
6025 bne 0b
6026
// Column done: rewind the 16 rows just walked and move 16 bytes to the right
6027 mov sub_y, #4
6028
6029 add vram_ptr_a, vram_ptr_a, #16
6030 add vram_ptr_b, vram_ptr_b, #16
6031
6032 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6033 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6034
6035 subs tile_x, tile_x, #1
6036 bne 0b
6037
// Row of tiles done: move down 16 rows and back to the left edge of the slice
6038 mov tile_x, #8
6039
6040 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6041 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6042
6043 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6044 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6045
// The add below does not set flags, so the bne still tests tile_y
6046 subs tile_y, tile_y, #1
6047 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6048
6049 bne 0b
6050
6051 vpop { q0 - q3 }
6052 ldmia sp!, { r4 - r11, pc }
6053
50f9355a 6054
6055/* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 * Doubles an image of 16-bit pixels in both directions.
 *
 * In:    r0 = dst, r1 = src, r2 = w8 (row width in 8-pixel / 16-byte tiles),
 *        r3 = h (source rows)
 * Both buffers use a 2048-byte row stride: src advances one row per source
 * row, dst advances two. Each source tile is zipped with a copy of itself
 * to duplicate every pixel horizontally, and the resulting 32 bytes are
 * written to the current dst row and to the row below it (r12 = r0 + 2048).
 *
 * Two tiles are loaded per inner iteration. For an odd w8 the second tile of
 * the last pair is still loaded (16 bytes past the final tile of the row)
 * but is not stored.
 *
 * Clobb: r0-r3, r12, q0-q3, flags. r4 is saved and restored.
 */
6056function(scale2x_tiles8)
6057 push { r4, r14 }
6058
// r4 = start of the current source row, r14 = tiles left in the row
6059 mov r4, r1
6060 add r12, r0, #1024*2
6061 mov r14, r2
6062
60630:
6064 vld1.u16 { q0 }, [ r1, :128 ]!
6065 vld1.u16 { q2 }, [ r1, :128 ]!
6066 vmov q1, q0
6067 vmov q3, q2
6068 vzip.16 q0, q1
6069 vzip.16 q2, q3
// Flags from this subs drive both branches below (vst1 does not touch them):
// negative means only the first tile of this pair belonged to the row
6070 subs r14, #2
6071 vst1.u16 { q0, q1 }, [ r0, :128 ]!
6072 vst1.u16 { q0, q1 }, [ r12, :128 ]!
6073 blt 1f
6074 vst1.u16 { q2, q3 }, [ r0, :128 ]!
6075 vst1.u16 { q2, q3 }, [ r12, :128 ]!
6076 bgt 0b
60771:
// End of row: the flags set here are tested by the bgt at the bottom
6078 subs r3, #1
6079 mov r14, r2
// dst: two rows down, minus the w8 * 32 bytes just written
6080 add r0, #1024*2*2
6081 add r4, #1024*2
6082 sub r0, r2, lsl #4+1
6083 mov r1, r4
6084 add r12, r0, #1024*2
6085 bgt 0b
6086 nop
6087
6088 pop { r4, pc }
59d15d23 6089
6090// vim:filetype=armasm